IPP Software Navigation Tools IPP Links Communication Pan-STARRS Links

Changeset 17475


Ignore:
Timestamp:
Apr 23, 2008, 11:35:39 AM (18 years ago)
Author:
eugene
Message:

split out client command from client response; allow response to return slowly

Location:
trunk/Ohana/src/opihi
Files:
1 added
12 edited

Legend:

Unmodified
Added
Removed
  • trunk/Ohana/src/opihi/include/pcontrol.h

    r17419 r17475  
    2828  PCONTROL_HOST_IDLE,
    2929  PCONTROL_HOST_BUSY, 
     30  PCONTROL_HOST_RESP,
    3031  PCONTROL_HOST_DOWN,
    3132  PCONTROL_HOST_DONE,
    3233  PCONTROL_HOST_OFF,
    3334} HostStat;
     35
     36/** host response options **/
     37typedef enum {
     38  PCONTROL_RESP_NONE,
     39  PCONTROL_RESP_START_JOB,
     40  PCONTROL_RESP_CHECK_BUSY_JOB, 
     41  PCONTROL_RESP_CHECK_DONE_HOST, 
     42  PCONTROL_RESP_CHECK_HOST,
     43  PCONTROL_RESP_KILL_JOB,
     44  PCONTROL_RESP_STOP_HOST,
     45} HostResp;
    3446
    3547typedef enum {
     
    97109  Ptime       nexttry;
    98110  IDtype      HostID;
     111  IOBuffer    comms_buffer;
     112  char       *response;
     113  HostResp    response_state;
    99114  struct Job *job;
    100115} Host;
     
    156171
    157172/*** own files ***/
     173int StartJob (Job *job, Host *host);
     174int StartJobResponse (Host *host);
     175
    158176int CheckHost (Host *host);
     177int CheckHostResponse (Host *host);
     178
     179int CheckDoneHost (Host *host);
     180int CheckDoneHostResponse (Host *host);
     181
     182int CheckBusyJob (Job *job, Host *host);
     183int CheckBusyJobResponse (Host *host);
     184
     185int KillJob (Job *job, Host *host);
     186int KillJobResponse (Host *host);
     187
    159188int StartHost (Host *host);
    160189int CheckIdleHost (Host *host);
    161 int CheckDoneHost (Host *host);
    162 int CheckBusyJob (Job *job, Host *host);
    163190int CheckDoneJob (Job *job, Host *host);
    164 int KillJob (Job *job, Host *host);
    165 int StartJob (Job *job, Host *host);
    166 int ResetJob (Job *job);
    167191int GetJobOutput (char *command, Host *host, IOBuffer *buffer, int Nbytes);
    168 int PclientCommand (Host *host, char *command, char *response, IOBuffer *buffer);
    169192int rconnect (char *command, char *hostname, char *shell, int *stdio);
     193
     194int PclientCommand (Host *host, char *command, char *response, HostResp response_state);
     195int PclientResponse (Host *host, char *response, IOBuffer *buffer);
     196
     197int CheckRespHosts (float MaxDelay);
     198int CheckRespHost (Host *host);
    170199
    171200/*** misc files ***/
     
    203232int    DownHosts ();
    204233int    StopHost (Host *host);
     234int    StopHostResponse (Host *host);
    205235int    HarvestHost (int pid);
    206236
  • trunk/Ohana/src/opihi/pcontrol/CheckBusyJob.c

    r16456 r17475  
    44int CheckBusyJob (Job *job, Host *host) {
    55
    6   int      status;
    7   int      outstate;
    8   char    *p;
    9   char     string[64];
    10   IOBuffer buffer;
     6  int status;
    117
    128  /* we are checking a job which is currently busy.  it has been pulled from the
     
    1612  ASSERT (job, "job not set");
    1713  ASSERT (host, "host not set");
    18 
    1914  ASSERT (host == (Host *) job[0].host, "invalid host");
    2015  ASSERT (job  == (Job *) host[0].job, "invalid job");
    2116
    22   InitIOBuffer (&buffer, 0x100);
    23 
    24   status = PclientCommand (host, "status", PCLIENT_PROMPT, &buffer);
     17  status = PclientCommand (host, "status", PCLIENT_PROMPT, PCONTROL_RESP_CHECK_BUSY_JOB);
    2518
    2619  /* check on success of pclient command */
    2720  switch (status) {
    2821    case PCLIENT_DOWN:
    29       HarvestHost (host[0].pid);
     22      // free the realhost name
     23      if (job[0].realhost) free (job[0].realhost);
     24      job[0].realhost = NULL;
     25
    3026      // unlink host & job
     27      if (VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname);
    3128      job[0].host = NULL;
    3229      host[0].job = NULL;
    33       if (job[0].realhost) free (job[0].realhost);
    34       job[0].realhost = NULL;
     30      HarvestHost (host[0].pid);
    3531      PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
    3632      PutJob (job, PCONTROL_JOB_PENDING, STACK_BOTTOM);
    37       FreeIOBuffer (&buffer);
    3833      return (FALSE);
    39 
    40     case PCLIENT_HUNG:
    41       // don't do anything drastic, just keep trying
    42       if (DEBUG || VerboseMode()) gprint (GP_ERR, "client is busy, not responding");
    43       PutHost (host, PCONTROL_HOST_BUSY, STACK_BOTTOM);
    44       PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM);
    45       FreeIOBuffer (&buffer);
    46       return (TRUE);
    4734
    4835    case PCLIENT_GOOD:
    4936      if (DEBUG || VerboseMode()) gprint (GP_ERR, "message received (CheckBusyJob)");
    50       break;
     37      PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM);
     38      PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM);
     39      return (TRUE);
    5140
    5241    default:
    5342      ABORT ("unknown status for pclient command"); 
    5443  }
     44}
     45
     46int CheckBusyJobResponse (Host *host) {
     47
     48  int      outstate;
     49  char    *p;
     50  char     string[64];
     51  IOBuffer *buffer;
     52  Job *job;
     53
     54  /* job must have assigned host */
     55  ASSERT (host, "missing host");
     56  ASSERT (host[0].job, "missing job");
     57  buffer = &host[0].comms_buffer;
     58  job = (Job *) host[0].job;
    5559
    5660  /** host is up, need to parse message **/
    57   p = memstr (buffer.buffer, "STATUS", buffer.Nbuffer);
    58   // this condition means the message is garbage.  toss it and try again
    59   if (p == NULL) {
    60     PutHost (host, PCONTROL_HOST_BUSY, STACK_BOTTOM);
    61     PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM);
    62     FreeIOBuffer (&buffer);
    63     return (FALSE);
    64   }
     61  p = memstr (buffer[0].buffer, "STATUS", buffer[0].Nbuffer);
     62  ASSERT (p != NULL, "missing STATUS in pclient message");
    6563
    6664  sscanf (p, "%*s %s", string);
     
    7169    PutHost (host, PCONTROL_HOST_BUSY, STACK_BOTTOM);
    7270    PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM);
    73     FreeIOBuffer (&buffer);
    7471    return (TRUE);
    7572  }
     
    7976  if (!strcmp(string, "EXIT")) outstate = PCONTROL_JOB_EXIT;
    8077  if (!strcmp(string, "CRASH")) outstate = PCONTROL_JOB_CRASH;
    81   ASSERT (outstate != PCONTROL_JOB_BUSY, "should not reach here (CheckJob)");
     78  ASSERT (outstate != PCONTROL_JOB_BUSY, "invalid status response (CheckBusyJobResponse)");
    8279
    8380  /* parse the exit status and sizes of output buffers */
    84   p = memstr (buffer.buffer, "EXITST", buffer.Nbuffer);
     81  p = memstr (buffer[0].buffer, "EXITST", buffer[0].Nbuffer);
    8582  sscanf (p, "%*s %d", &job[0].exit_status);
    86   p = memstr (buffer.buffer, "STDOUT", buffer.Nbuffer);
     83  p = memstr (buffer[0].buffer, "STDOUT", buffer[0].Nbuffer);
    8784  sscanf (p, "%*s %d", &job[0].stdout_size);
    88   p = memstr (buffer.buffer, "STDERR", buffer.Nbuffer);
     85  p = memstr (buffer[0].buffer, "STDERR", buffer[0].Nbuffer);
    8986  sscanf (p, "%*s %d", &job[0].stderr_size);
    9087
     
    9895  PutHost (host, PCONTROL_HOST_BUSY, STACK_BOTTOM);
    9996  PutJobSetState (job, PCONTROL_JOB_DONE, STACK_BOTTOM, outstate);
    100   gettimeofday (&job[0].stop, (void *) NULL);
     97  gettimeofday (&job[0].stop, NULL);
    10198  job[0].dtime = DTIME(job[0].stop, job[0].start);
    102   FreeIOBuffer (&buffer);
    10399  return (TRUE);
    104100}
  • trunk/Ohana/src/opihi/pcontrol/CheckDoneHost.c

    r10668 r17475  
    55 
    66  int       status;
    7   char     *p;
    8   IOBuffer  buffer;
    97
    108  ASSERT (host, "host not set");
    119
    12   InitIOBuffer (&buffer, 0x100);
    13  
    14   status = PclientCommand (host, "reset", PCLIENT_PROMPT, &buffer);
     10  status = PclientCommand (host, "reset", PCLIENT_PROMPT, PCONTROL_RESP_CHECK_DONE_HOST);
    1511
    1612  /* check on success of pclient command */
     
    1814    case PCLIENT_DOWN:
    1915      if (DEBUG || VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname);
    20       /* DONE host does not have an incomplete job */
    2116      HarvestHost (host[0].pid);
    2217      PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
    23       FreeIOBuffer (&buffer);
    2418      return (FALSE);
     19      /* DONE host does not have an incomplete job */
    2520      // XXX do we need to close the connection?
    2621
    27     case PCLIENT_HUNG:
    28       // don't do anything drastic, just try again later
    29       PutHost (host, PCONTROL_HOST_DONE, STACK_BOTTOM);
    30       if (DEBUG || VerboseMode()) gprint (GP_ERR, "host %s is not responding\n", host[0].hostname);
    31       FreeIOBuffer (&buffer);
    32       return (FALSE);
    33 
    3422    case PCLIENT_GOOD:
    35       if (VerboseMode()) gprint (GP_ERR, "message received (CheckDoneHost)\n"); 
    36       break;
     23      if (VerboseMode()) gprint (GP_ERR, "checking done host %s\n", host[0].hostname); 
     24      PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM);
     25      return (TRUE);
    3726
    3827    default:
    3928      ABORT ("unknown status for pclient command"); 
    4029  }
     30  ABORT ("should not reach here (CheckDoneHost)");
     31}
     32
     33int CheckDoneHostResponse (Host *host) {
     34
     35  int status;
     36  char *p;
     37  IOBuffer *buffer;
     38
     39  /* job must have assigned host */
     40  ASSERT (host, "missing host");
     41  ASSERT (host[0].job, "missing job");
     42  buffer = &host[0].comms_buffer;
    4143
    4244  /** successful command, examine result **/
    43   p = memstr (buffer.buffer, "STATUS", buffer.Nbuffer);
     45  p = memstr (buffer[0].buffer, "STATUS", buffer[0].Nbuffer);
    4446  ASSERT (p != NULL, "missing STATUS in pclient message (CheckDoneHost)");
    4547
     
    5254      if (DEBUG || VerboseMode()) gprint (GP_ERR, "reset failed\n");
    5355      PutHost (host, PCONTROL_HOST_DONE, STACK_BOTTOM);
    54       FreeIOBuffer (&buffer);
    5556      return (FALSE);
    5657     
     
    5960      if (DEBUG || VerboseMode()) gprint (GP_ERR, "successful reset\n");
    6061      PutHost (host, PCONTROL_HOST_IDLE, STACK_BOTTOM);
    61       FreeIOBuffer (&buffer);
    6262      return (FALSE);
    6363
     
    6767  ABORT ("should not reach here (CheckDoneHost)");
    6868}
    69 
    70 /** probably need to flush the buffer before the command **/
    71 /** need to add timeout check here **/
  • trunk/Ohana/src/opihi/pcontrol/CheckHost.c

    r10652 r17475  
    11# include "pcontrol.h"
    22
     3// if the host has a job, we skip it (down or crash state will be caught elsewhere)
     4// in fact, just touch the IDLE hosts, not the BUSY hosts?
    35int CheckHost (Host *host) {
    46 
    57  int status;
    6   IOBuffer buffer;
    78
    89  ASSERT (host, "host not set");
     
    1819  }
    1920
    20   InitIOBuffer (&buffer, 0x100);
     21  // the argument to echo (OK) is the expected response below in CheckHostResponse
     22  status = PclientCommand (host, "echo OK", PCLIENT_PROMPT, PCONTROL_RESP_CHECK_HOST);
    2123
    22   status = PclientCommand (host, "echo OK", PCLIENT_PROMPT, &buffer);
    2324  switch (status) {
    24     case 0:
     25    case PCLIENT_DOWN:
    2526      if (VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname);
    2627      HarvestHost (host[0].pid);
    2728      PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
    28       FreeIOBuffer (&buffer);
    2929      return (FALSE);
    3030     
    31     case -1:
    32       if (VerboseMode()) gprint (GP_ERR, "host %s is not responding\n", host[0].hostname);
    33       /*** do we mark this in some way (HUNG) ? ***/
    34       PutHost (host, host[0].stack, STACK_BOTTOM);
    35       FreeIOBuffer (&buffer);
    36       return (FALSE);
     31    case PCLIENT_GOOD:
     32      PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM);
     33      return (TRUE);
    3734
    3835    default:
    39       PutHost (host, host[0].stack, STACK_BOTTOM);
    40       FreeIOBuffer (&buffer);
    41       return (TRUE);
     36      ABORT ("unknown status for pclient command"); 
    4237  }
    43   ABORT ("should not reach here (Check Host)");
     38  ABORT ("should not reach here (CheckHost)");
    4439}
    4540
    46 // if the host has a job, we skip it (down or crash state will be caught elsewhere)
    47 // in fact, just touch the IDLE hosts, not the BUSY hosts?
     41int CheckHostResponse (Host *host) {
     42 
     43  IOBuffer *buffer;
     44
     45  /* we only check IDLE hosts without jobs */
     46  ASSERT (host, "missing host");
     47  buffer = &host[0].comms_buffer;
     48
     49  // XXX check on the value of the response? (OK)
     50
     51  PutHost (host, PCONTROL_HOST_IDLE, STACK_BOTTOM);
     52  return (TRUE);
     53}
  • trunk/Ohana/src/opihi/pcontrol/CheckSystem.c

    r16589 r17475  
    9999
    100100    if (RunLevel != PCONTROL_RUN_NONE) {
     101      Nhostchecks += CheckRespHosts(0.020); /* check for incoming messages */
     102      TestCheckPoint ();
    101103      Nhostchecks += CheckDoneHosts(0.020); /* reset the host */
    102104      TestCheckPoint ();
     
    258260}
    259261
     262int CheckRespHosts (float MaxDelay) {
     263
     264  struct timeval start, stop;
     265  int i, Nobject;
     266  Stack *stack;
     267  Host  *host;
     268  float dtime;
     269
     270  /* Loop through objects on the stack, no more than once. see note above */
     271  stack = GetHostStack (PCONTROL_HOST_RESP);
     272  Nobject = stack[0].Nobject;
     273
     274  /* always allow at least one test */
     275  gettimeofday (&start, (void *) NULL);
     276  dtime = 0.0;
     277  for (i = 0; (i < Nobject) && (dtime < MaxDelay); i++) {
     278    host = PullStackByLocation (stack, STACK_TOP);
     279    if (host == NULL) break;
     280    CheckRespHost (host);
     281    gettimeofday (&stop, (void *) NULL);
     282    dtime = DTIME (stop, start);
     283  }
     284  if (DEBUG) gprint (GP_ERR, "checked %d hosts\n", i);
     285  return (i);
     286}
     287
    260288int CheckDoneHosts (float MaxDelay) {
    261289
  • trunk/Ohana/src/opihi/pcontrol/HostOps.c

    r16472 r17475  
    11# include "pcontrol.h"
    22
    3 Stack *HostPool_Idle;
    4 Stack *HostPool_Busy;
    5 Stack *HostPool_Done;
    6 Stack *HostPool_Down;
    7 Stack *HostPool_Off;
     3Stack *HostPool_Idle; // these hosts are waiting for something to do
     4Stack *HostPool_Busy; // these hosts are working
     5Stack *HostPool_Resp; // these hosts are trying to respond
     6Stack *HostPool_Done; // these hosts have finished a job
     7Stack *HostPool_Down; // these hosts are not responding
     8Stack *HostPool_Off;  // these hosts are off
    89
    910void InitHostStacks () {
    1011  HostPool_Idle = InitStack ();
    1112  HostPool_Busy = InitStack ();
     13  HostPool_Resp = InitStack ();
    1214  HostPool_Done = InitStack ();
    1315  HostPool_Down = InitStack ();
     
    2628  FreeHostStack (HostPool_Idle);
    2729  FreeHostStack (HostPool_Busy);
     30  FreeHostStack (HostPool_Resp);
    2831  FreeHostStack (HostPool_Done);
    2932  FreeHostStack (HostPool_Down);
     
    3538    case PCONTROL_HOST_IDLE: return ("IDLE");
    3639    case PCONTROL_HOST_DOWN: return ("DOWN");
     40    case PCONTROL_HOST_RESP: return ("RESP");
    3741    case PCONTROL_HOST_DONE: return ("DONE");
    3842    case PCONTROL_HOST_BUSY: return ("BUSY");
     
    4852    case PCONTROL_HOST_IDLE: return (HostPool_Idle);
    4953    case PCONTROL_HOST_DOWN: return (HostPool_Down);
     54    case PCONTROL_HOST_RESP: return (HostPool_Resp);
    5055    case PCONTROL_HOST_DONE: return (HostPool_Done);
    5156    case PCONTROL_HOST_BUSY: return (HostPool_Busy);
     
    6065  if (!strcasecmp (name, "idle")) return (HostPool_Idle);
    6166  if (!strcasecmp (name, "down")) return (HostPool_Down);
     67  if (!strcasecmp (name, "resp")) return (HostPool_Resp);
    6268  if (!strcasecmp (name, "done")) return (HostPool_Done);
    6369  if (!strcasecmp (name, "busy")) return (HostPool_Busy);
     
    94100  if (host != NULL) return (host);
    95101
     102  *StackID = PCONTROL_HOST_RESP;
     103  host = PullHostFromStackByID (*StackID, HostID);
     104  if (host != NULL) return (host);
     105
    96106  *StackID = PCONTROL_HOST_DONE;
    97107  host = PullHostFromStackByID (*StackID, HostID);
     
    120130
    121131  *StackID = PCONTROL_HOST_DOWN;
     132  host = PullHostFromStackByName (*StackID, name);
     133  if (host != NULL) return (host);
     134
     135  *StackID = PCONTROL_HOST_RESP;
    122136  host = PullHostFromStackByName (*StackID, name);
    123137  if (host != NULL) return (host);
     
    180194  host[0].nexttry.tv_usec = 0;
    181195
     196  InitIOBuffer (&host[0].comms_buffer, 0x100);
     197  host[0].response_state = PCONTROL_RESP_NONE;
     198  host[0].response = NULL;
     199
    182200  host[0].markoff  = FALSE;
    183201  host[0].job      = NULL;
     
    187205
    188206void DelHost (Host *host) {
     207  FreeIOBuffer (&host[0].comms_buffer);
    189208  FREE (host[0].hostname);
    190209  FREE (host[0].job);
  • trunk/Ohana/src/opihi/pcontrol/KillJob.c

    r10661 r17475  
    33int KillJob (Job *job, Host *host) {
    44 
    5   IOBuffer buffer;
    65  int status;
    7   char *p;
    86
    97  ASSERT (host != NULL, "host missing");
    108  ASSERT (job != NULL, "job missing");
    11 
    129  ASSERT (host == (Host *) job[0].host, "invalid host");
    1310  ASSERT (job  == (Job *) host[0].job, "invalid job");
    1411
    15   InitIOBuffer (&buffer, 0x100);
    16 
    17   status = PclientCommand (host, "reset", PCLIENT_PROMPT, &buffer);
     12  status = PclientCommand (host, "reset", PCLIENT_PROMPT, PCONTROL_RESP_KILL_JOB);
    1813
    1914  /* check on success of pclient command */
    2015  switch (status) {
    2116    case PCLIENT_DOWN:
    22       HarvestHost (host[0].pid);
    2317      // unlink host & job
     18      if (VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname);
    2419      job[0].host = NULL;
    2520      host[0].job = NULL;
     21      HarvestHost (host[0].pid);
    2622      PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
    2723      PutJob (job, PCONTROL_JOB_CRASH, STACK_BOTTOM);
    28       FreeIOBuffer (&buffer);
    2924      return (FALSE);
    3025
    31     case PCLIENT_HUNG:
    32       // don't do anything drastic, just keep trying
    33       // XXX move to which stack??
    34       gprint (GP_ERR, "client is busy, not responding (KillJob)");
    35       FreeIOBuffer (&buffer);
     26    case PCLIENT_GOOD:
     27      if (VerboseMode()) gprint (GP_ERR, "kill job on host %s\n", host[0].hostname); 
     28      FlushIOBuffer (&host[0].comms_buffer);
     29      PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM);
     30      PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM);
    3631      return (TRUE);
    37 
    38     case PCLIENT_GOOD:
    39       if (VerboseMode()) gprint (GP_ERR, "message received (KillJob)\n"); 
    40       break;
    4132
    4233    default:
    4334      ABORT ("unknown status for pclient command"); 
    4435  }
     36}
    4537
    46   /** host is up, need to parse message **/
    47   p = memstr (buffer.buffer, "STATUS", buffer.Nbuffer);
     38int KillJobResponse (Host *host) {
     39 
     40  int status;
     41  char *p;
     42  IOBuffer *buffer;
     43  Job *job;
     44
     45  ASSERT (host != NULL, "host missing");
     46  ASSERT (host[0].job, "missing job");
     47  buffer = &host[0].comms_buffer;
     48  job = (Job *) host[0].job;
     49
     50  /** check on response to pclient command **/
     51  p = memstr (buffer[0].buffer, "STATUS", buffer[0].Nbuffer);
    4852  ASSERT (p != NULL, "missing STATUS in pclient message");
    49   if (VerboseMode()) gprint (GP_ERR, "client message: %s\n", buffer.buffer);
     53  if (VerboseMode()) gprint (GP_ERR, "client message: %s\n", buffer[0].buffer);
    5054
    5155  sscanf (p, "%*s %d", &status);
    52   FreeIOBuffer (&buffer);
    5356  gprint (GP_ERR, "client status: %d\n", status);
    5457
     
    6265      return (FALSE);
    6366    case 1:
    64       gprint (GP_ERR, "killing job %s on %s\n", job[0].argv[0], host[0].hostname);
     67      gprint (GP_ERR, "killed job %s on %s\n", job[0].argv[0], host[0].hostname);
    6568      // unlink host & job
    6669      job[0].host = NULL;
  • trunk/Ohana/src/opihi/pcontrol/Makefile

    r12842 r17475  
    1313LIBS1         = -lkapa -lFITS -lohana
    1414LIBS2         = -lbasiccmd -lshell -ldata
    15 FULL_CFLAGS   = $(BASE_CFLAGS)
     15FULL_CFLAGS   = $(BASE_CFLAGS) -Wall -Werror
    1616FULL_CPPFLAGS = $(BASE_CPPFLAGS)
    1717FULL_LDFLAGS  = $(LIBS1) $(LIBS2) $(BASE_LDFLAGS)
     
    2727$(SRC)/CheckBusyJob.$(ARCH).o \
    2828$(SRC)/CheckDoneHost.$(ARCH).o \
     29$(SRC)/CheckRespHost.$(ARCH).o \
    2930$(SRC)/CheckDoneJob.$(ARCH).o \
    3031$(SRC)/CheckHost.$(ARCH).o \
     
    3839$(SRC)/StackOps.$(ARCH).o \
    3940$(SRC)/PclientCommand.$(ARCH).o \
    40 $(SRC)/ResetJob.$(ARCH).o \
    4141$(SRC)/StartHost.$(ARCH).o \
    4242$(SRC)/StopHosts.$(ARCH).o \
  • trunk/Ohana/src/opihi/pcontrol/PclientCommand.c

    r17474 r17475  
    11# include "pcontrol.h"
    2 # define PCLIENT_TIMEOUT 20000
     2# define PCLIENT_TIMEOUT 100
    33
    4 int PclientCommand (Host *host, char *command, char *response, IOBuffer *buffer) {
     4// send a command and check for errors; ignore output
     5int PclientCommand (Host *host, char *command, char *response, HostResp response_state) {
    56
    6   int i;
    77  int status;
    8   char *line;
    9   struct timespec request, remain;
     8  IOBuffer buffer;
    109
    1110  ASSERT (host != NULL, "host missing");
    12   ASSERT (buffer != NULL, "buffer missing");
    1311  ASSERT (command != NULL, "command missing");
    14   ASSERT (response != NULL, "response missing");
    15 
    16   /* avoid blocking on read, test every 100 usec, up to 2.0 sec */
    17   request.tv_sec = 0;
    18   request.tv_nsec = 100000;
    1912
    2013  // flush the stdout and stderr buffers here
    21   ReadtoIOBuffer (buffer, host[0].stdout_fd);
    22   FlushIOBuffer (buffer);
    23   ReadtoIOBuffer (buffer, host[0].stderr_fd);
    24   FlushIOBuffer (buffer);
     14  ReadtoIOBuffer (&buffer, host[0].stdout_fd);
     15  FlushIOBuffer (&buffer);
     16  ReadtoIOBuffer (&buffer, host[0].stderr_fd);
     17  FlushIOBuffer (&buffer);
    2518
    2619  /* send command to client (adding on \n) */
     
    3326  }
    3427 
     28  // prepare host to accept response
     29  host[0].response_state = response_state;
     30  host[0].response = response;
     31  FlushIOBuffer (&host[0].comms_buffer);
     32
     33  return (PCLIENT_GOOD);
     34}
     35 
     36// check for response; message must end with specified string.
     37// accumulate the response in the buffer
     38int PclientResponse (Host *host, char *response, IOBuffer *buffer) {
     39
     40  int i;
     41  int status;
     42  char *line;
     43  struct timespec request, remain;
     44
     45  ASSERT (response != NULL, "response missing");
     46  ASSERT (buffer != NULL, "buffer missing");
     47
     48  /* avoid blocking very long on read, test every 100 usec, up to 0.1 sec */
     49  request.tv_sec = 0;
     50  request.tv_nsec = 100000;
     51
    3552  /* watch for response - wait up to 1 second */
    3653  line = NULL;
    3754  status = -1;
    3855
     56  // how long does each cycle really take?
    3957  for (i = 0; (i < PCLIENT_TIMEOUT) && (status != 0) && (line == NULL); i++) {
    4058    status = ReadtoIOBuffer (buffer, host[0].stdout_fd);
     
    4664    return (PCLIENT_DOWN);
    4765  }
     66  if (line == NULL) return (PCLIENT_HUNG);
    4867  if (status == -1) return (PCLIENT_HUNG);
    49   if (line == NULL) return (PCLIENT_HUNG);
    5068
    5169  // fprintf (stderr, "buffer.buffer: %s\n", buffer[0].buffer);
     70
     71  // we have detected a valid response, clear the response data
     72  host[0].response_state = PCONTROL_RESP_NONE;
     73  host[0].response = NULL;
     74
    5275  return (PCLIENT_GOOD);
    5376}
  • trunk/Ohana/src/opihi/pcontrol/ResetJob.c

    r16472 r17475  
    11# include "pcontrol.h"
     2
     3// XXX deprecated
    24
    35int ResetJob (Job *job) {
    46 
    57  int       status;
    6   IOBuffer  buffer;
    78  Host     *host;
    89
     
    1314  ASSERT (job != NULL, "host missing");
    1415
    15   InitIOBuffer (&buffer, 0x100);
    16  
    1716  /* we have tried to reset the job; may not get status */
    1817  job[0].Reset = TRUE;
    1918
    20   status = PclientCommand (host, "reset", PCLIENT_PROMPT, &buffer);
     19  status = PclientCommand (host, "reset");
    2120
    2221  /* check on success of pclient command */
    2322  switch (status) {
    2423    case PCLIENT_DOWN:
    25       /*** different behavior for ANYHOST, WANTHOST, NEEDHOST? ***/
    26       gprint (GP_ERR, "host %s is down\n", host[0].hostname);
     24      if (VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname);
    2725      HarvestHost (host[0].pid);
    2826      PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
    29       FreeIOBuffer (&buffer);
    30       return (FALSE);
    31 
    32     case PCLIENT_HUNG:
    33       /*** should we consider a HUNG host DOWN? ***/
    34       gprint (GP_ERR, "host %s is not responding (ResetJob)\n", host[0].hostname);
    35       FreeIOBuffer (&buffer);
    3627      return (FALSE);
    3728
    3829    case PCLIENT_GOOD:
    39       gprint (GP_ERR, "message received (ResetJob)\n"); 
    40       FreeIOBuffer (&buffer);
     30      host[0].response_state = PCONTROL_RESP_RESET_JOB;
     31      host[0].response = PCLIENT_PROMPT;
     32      FlushIOBuffer (&host[0].comms_buffer, 0x100);
     33      PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM);
    4134      return (TRUE);
    4235
    4336    default:
    44       gprint (GP_ERR, "unknown status for pclient command: programming error\n"); 
    45       pcontrol_exit (55);
     37      ABORT ("unknown status for pclient command"); 
    4638  }
     39  ABORT ("should not reach here (ResetJob)");
     40}
    4741
    48   gprint (GP_ERR, "programming error in ResetJob (should not reach here)\n");
    49   FreeIOBuffer (&buffer);
    50   pcontrol_exit (56);
    51   return (FALSE);
     42int ResetJobResponse (Host *host) {
     43 
     44  int       status;
     45  IOBuffer *buffer;
     46
     47  /* job must have assigned host */
     48  ASSERT (host, "missing host");
     49  ASSERT (host[0].job, "missing job");
     50  buffer = host[0].comms_buffer;
     51
     52  gprint (GP_ERR, "message received (ResetJob)\n"); 
     53  return (TRUE);
    5254}
    5355
  • trunk/Ohana/src/opihi/pcontrol/StartJob.c

    r11388 r17475  
    11# include "pcontrol.h"
    22
     3// job and host are bound together (why pass in both?)
    34int StartJob (Job *job, Host *host) {
    45
    56  int  i, Nline, status;
    6   char *line, *p;
    7   IOBuffer buffer;
    8 
    9   InitIOBuffer (&buffer, 0x100);
     7  char *line;
    108
    119  /* job must have assigned host */
     
    2826  }
    2927
    30   status = PclientCommand (host, line, PCLIENT_PROMPT, &buffer);
     28  status = PclientCommand (host, line, PCLIENT_PROMPT, PCONTROL_RESP_START_JOB);
    3129  free (line);
    3230
     
    3432  switch (status) {
    3533    case PCLIENT_DOWN:
     34      // unlink host & job
    3635      if (VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname);
    37       goto failure;
    38 
    39     case PCLIENT_HUNG:
    40       // we need the job start to return a valid Job ID,
    41       // give up on jobs which don't get started.
    42       // XXX we are sensitive here to the time it takes pclient
    43       // to fork the job.  if this is slow, the client may appear to hang.
    44       gprint (GP_ERR, "host %s is not responding (StartJob)\n", host[0].hostname);
    45       if (VerboseMode()) gprint (GP_ERR, "host %s is not responding\n", host[0].hostname);
    46 
    47       // unlink host & job
    4836      job[0].host = NULL;
    4937      host[0].job = NULL;
    50       if (job[0].realhost) free (job[0].realhost);
    51       job[0].realhost = NULL;
    52       PutHost (host, PCONTROL_HOST_DONE, STACK_BOTTOM);
     38      HarvestHost (host[0].pid);
     39      PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
    5340      PutJob (job, PCONTROL_JOB_PENDING, STACK_BOTTOM);
    54       FreeIOBuffer (&buffer);
    5541      return (FALSE);
    5642
    5743    case PCLIENT_GOOD:
    58       if (VerboseMode()) gprint (GP_ERR, "message received (StartJob)\n"); 
    59       break;
     44      job[0].realhost = strcreate (host[0].hostname);
     45      job[0].pid = -1;
     46      gettimeofday (&job[0].start, (void *) NULL);
     47
     48      if (VerboseMode()) gprint (GP_ERR, "started job on host %s\n", host[0].hostname); 
     49      PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM);
     50      PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM);
     51      return (TRUE);
    6052
    6153    default:
    6254      ABORT ("unknown status for pclient command"); 
    6355  }
     56}
     57
     58// message has been received from the host, interpret results
     59int StartJobResponse (Host *host) {
     60 
     61  int status;
     62  char *p;
     63  IOBuffer *buffer;
     64  Job *job;
     65
     66  /* job must have assigned host */
     67  ASSERT (host, "missing host");
     68  ASSERT (host[0].job, "missing job");
     69  buffer = &host[0].comms_buffer;
     70  job = (Job *) host[0].job;
    6471
    6572  /* check on result of pclient command */
    66   p = memstr (buffer.buffer, "STATUS", buffer.Nbuffer);
     73  p = memstr (buffer[0].buffer, "STATUS", buffer[0].Nbuffer);
    6774  ASSERT (p != NULL, "missing STATUS in pclient message");
    6875
     
    7178    case -1:
    7279      if (VerboseMode()) gprint (GP_ERR, "error in pclient child\n");
    73       goto failure;
     80      // unlink host & job
     81      job[0].host = NULL;
     82      host[0].job = NULL;
     83      HarvestHost (host[0].pid);
     84      PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
     85      PutJob (job, PCONTROL_JOB_PENDING, STACK_BOTTOM);
     86      return (FALSE);
    7487
    7588    case -2:
     
    8093
    8194    default:
    82       job[0].realhost = strcreate (host[0].hostname);
     95      if (VerboseMode()) gprint (GP_ERR, "message received (StartJobResponse)\n"); 
    8396      job[0].pid = status;
    8497      PutHost (host, PCONTROL_HOST_BUSY, STACK_BOTTOM);
    8598      PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM);
    86       FreeIOBuffer (&buffer);
    87       gettimeofday (&job[0].start, (void *) NULL);
     99      gettimeofday (&job[0].start, NULL);
    88100      return (TRUE);
    89101  }
     102
    90103  /* we should never reach here */
    91104  ABORT ("should not reach here (StartJob)");
    92 
    93 failure:
    94   // unlink host & job
    95   job[0].host = NULL;
    96   host[0].job = NULL;
    97   HarvestHost (host[0].pid);
    98   PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
    99   PutJob (job, PCONTROL_JOB_PENDING, STACK_BOTTOM);
    100   FreeIOBuffer (&buffer);
    101   return (FALSE);
    102105}
  • trunk/Ohana/src/opihi/pcontrol/StopHosts.c

    r16472 r17475  
    7272
    7373  int       status;
    74   IOBuffer  buffer;
    7574
    76   InitIOBuffer (&buffer, 0x100);
    77   status = PclientCommand (host, "exit", "Goodbye", &buffer);
    78   FreeIOBuffer (&buffer);
     75  status = PclientCommand (host, "exit", "Goodbye", PCONTROL_RESP_STOP_HOST);
    7976
    8077  /* check on success of pclient command */
    8178  switch (status) {
    8279    case PCLIENT_DOWN:
    83       break;
    84 
    85     case PCLIENT_HUNG:
    86       gprint (GP_ERR, "host %s is not responding\n", host[0].hostname);
     80      // XXX this is the desired result in any case, so ignore it
    8781      break;
    8882
    8983    case PCLIENT_GOOD:
    90       break;
     84      if (VerboseMode()) gprint (GP_ERR, "stop host %s\n", host[0].hostname); 
     85      FlushIOBuffer (&host[0].comms_buffer);
     86      PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM);
     87      return (TRUE);
    9188
    9289    default:
    93       gprint (GP_ERR, "unknown status for pclient command: programming error\n"); 
    94       pcontrol_exit (57);
     90      ABORT ("unknown status for pclient command"); 
    9591  }
     92  ABORT ("should not reach here"); 
     93}
     94
     95int StopHostResponse (Host *host) {
     96
    9697  HarvestHost (host[0].pid);
    9798  return (TRUE);
Note: See TracChangeset for help on using the changeset viewer.