Changeset 17475
- Timestamp:
- Apr 23, 2008, 11:35:39 AM (18 years ago)
- Location:
- trunk/Ohana/src/opihi
- Files:
-
- 1 added
- 12 edited
-
include/pcontrol.h (modified) (4 diffs)
-
pcontrol/CheckBusyJob.c (modified) (5 diffs)
-
pcontrol/CheckDoneHost.c (modified) (5 diffs)
-
pcontrol/CheckHost.c (modified) (2 diffs)
-
pcontrol/CheckRespHost.c (added)
-
pcontrol/CheckSystem.c (modified) (2 diffs)
-
pcontrol/HostOps.c (modified) (9 diffs)
-
pcontrol/KillJob.c (modified) (2 diffs)
-
pcontrol/Makefile (modified) (3 diffs)
-
pcontrol/PclientCommand.c (modified) (3 diffs)
-
pcontrol/ResetJob.c (modified) (2 diffs)
-
pcontrol/StartJob.c (modified) (5 diffs)
-
pcontrol/StopHosts.c (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
trunk/Ohana/src/opihi/include/pcontrol.h
r17419 r17475 28 28 PCONTROL_HOST_IDLE, 29 29 PCONTROL_HOST_BUSY, 30 PCONTROL_HOST_RESP, 30 31 PCONTROL_HOST_DOWN, 31 32 PCONTROL_HOST_DONE, 32 33 PCONTROL_HOST_OFF, 33 34 } HostStat; 35 36 /** host response options **/ 37 typedef enum { 38 PCONTROL_RESP_NONE, 39 PCONTROL_RESP_START_JOB, 40 PCONTROL_RESP_CHECK_BUSY_JOB, 41 PCONTROL_RESP_CHECK_DONE_HOST, 42 PCONTROL_RESP_CHECK_HOST, 43 PCONTROL_RESP_KILL_JOB, 44 PCONTROL_RESP_STOP_HOST, 45 } HostResp; 34 46 35 47 typedef enum { … … 97 109 Ptime nexttry; 98 110 IDtype HostID; 111 IOBuffer comms_buffer; 112 char *response; 113 HostResp response_state; 99 114 struct Job *job; 100 115 } Host; … … 156 171 157 172 /*** own files ***/ 173 int StartJob (Job *job, Host *host); 174 int StartJobResponse (Host *host); 175 158 176 int CheckHost (Host *host); 177 int CheckHostResponse (Host *host); 178 179 int CheckDoneHost (Host *host); 180 int CheckDoneHostResponse (Host *host); 181 182 int CheckBusyJob (Job *job, Host *host); 183 int CheckBusyJobResponse (Host *host); 184 185 int KillJob (Job *job, Host *host); 186 int KillJobResponse (Host *host); 187 159 188 int StartHost (Host *host); 160 189 int CheckIdleHost (Host *host); 161 int CheckDoneHost (Host *host);162 int CheckBusyJob (Job *job, Host *host);163 190 int CheckDoneJob (Job *job, Host *host); 164 int KillJob (Job *job, Host *host);165 int StartJob (Job *job, Host *host);166 int ResetJob (Job *job);167 191 int GetJobOutput (char *command, Host *host, IOBuffer *buffer, int Nbytes); 168 int PclientCommand (Host *host, char *command, char *response, IOBuffer *buffer);169 192 int rconnect (char *command, char *hostname, char *shell, int *stdio); 193 194 int PclientCommand (Host *host, char *command, char *response, HostResp response_state); 195 int PclientResponse (Host *host, char *response, IOBuffer *buffer); 196 197 int CheckRespHosts (float MaxDelay); 198 int CheckRespHost (Host *host); 170 199 171 200 /*** misc files ***/ … … 203 232 int DownHosts (); 204 233 int StopHost (Host *host); 234 int StopHostResponse (Host *host); 205 235 int HarvestHost (int pid); 206 236 -
trunk/Ohana/src/opihi/pcontrol/CheckBusyJob.c
r16456 r17475 4 4 int CheckBusyJob (Job *job, Host *host) { 5 5 6 int status; 7 int outstate; 8 char *p; 9 char string[64]; 10 IOBuffer buffer; 6 int status; 11 7 12 8 /* we are checking a job which is currently busy. it has been pulled from the … … 16 12 ASSERT (job, "job not set"); 17 13 ASSERT (host, "host not set"); 18 19 14 ASSERT (host == (Host *) job[0].host, "invalid host"); 20 15 ASSERT (job == (Job *) host[0].job, "invalid job"); 21 16 22 InitIOBuffer (&buffer, 0x100); 23 24 status = PclientCommand (host, "status", PCLIENT_PROMPT, &buffer); 17 status = PclientCommand (host, "status", PCLIENT_PROMPT, PCONTROL_RESP_CHECK_BUSY_JOB); 25 18 26 19 /* check on success of pclient command */ 27 20 switch (status) { 28 21 case PCLIENT_DOWN: 29 HarvestHost (host[0].pid); 22 // free the realhost name 23 if (job[0].realhost) free (job[0].realhost); 24 job[0].realhost = NULL; 25 30 26 // unlink host & job 27 if (VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname); 31 28 job[0].host = NULL; 32 29 host[0].job = NULL; 33 if (job[0].realhost) free (job[0].realhost); 34 job[0].realhost = NULL; 30 HarvestHost (host[0].pid); 35 31 PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM); 36 32 PutJob (job, PCONTROL_JOB_PENDING, STACK_BOTTOM); 37 FreeIOBuffer (&buffer);38 33 return (FALSE); 39 40 case PCLIENT_HUNG:41 // don't do anything drastic, just keep trying42 if (DEBUG || VerboseMode()) gprint (GP_ERR, "client is busy, not responding");43 PutHost (host, PCONTROL_HOST_BUSY, STACK_BOTTOM);44 PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM);45 FreeIOBuffer (&buffer);46 return (TRUE);47 34 48 35 case PCLIENT_GOOD: 49 36 if (DEBUG || VerboseMode()) gprint (GP_ERR, "message received (CheckBusyJob)"); 50 break; 37 PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM); 38 PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM); 39 return (TRUE); 51 40 52 41 default: 53 42 ABORT ("unknown status for pclient command"); 54 43 } 44 } 45 46 int CheckBusyJobResponse (Host *host) { 47 48 int outstate; 49 char *p; 50 char string[64]; 51 IOBuffer *buffer; 52 Job *job; 53 54 /* job must have assigned host */ 55 ASSERT (host, "missing host"); 56 ASSERT (host[0].job, "missing job"); 57 buffer = &host[0].comms_buffer; 58 job = (Job *) host[0].job; 55 59 56 60 /** host is up, need to parse message **/ 57 p = memstr (buffer.buffer, "STATUS", buffer.Nbuffer); 58 // this condition means the message is garbage. toss it and try again 59 if (p == NULL) { 60 PutHost (host, PCONTROL_HOST_BUSY, STACK_BOTTOM); 61 PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM); 62 FreeIOBuffer (&buffer); 63 return (FALSE); 64 } 61 p = memstr (buffer[0].buffer, "STATUS", buffer[0].Nbuffer); 62 ASSERT (p != NULL, "missing STATUS in pclient message"); 65 63 66 64 sscanf (p, "%*s %s", string); … … 71 69 PutHost (host, PCONTROL_HOST_BUSY, STACK_BOTTOM); 72 70 PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM); 73 FreeIOBuffer (&buffer);74 71 return (TRUE); 75 72 } … … 79 76 if (!strcmp(string, "EXIT")) outstate = PCONTROL_JOB_EXIT; 80 77 if (!strcmp(string, "CRASH")) outstate = PCONTROL_JOB_CRASH; 81 ASSERT (outstate != PCONTROL_JOB_BUSY, " should not reach here (CheckJob)");78 ASSERT (outstate != PCONTROL_JOB_BUSY, "invalid status response (CheckBusyJobResponse)"); 82 79 83 80 /* parse the exit status and sizes of output buffers */ 84 p = memstr (buffer .buffer, "EXITST", buffer.Nbuffer);81 p = memstr (buffer[0].buffer, "EXITST", buffer[0].Nbuffer); 85 82 sscanf (p, "%*s %d", &job[0].exit_status); 86 p = memstr (buffer .buffer, "STDOUT", buffer.Nbuffer);83 p = memstr (buffer[0].buffer, "STDOUT", buffer[0].Nbuffer); 87 84 sscanf (p, "%*s %d", &job[0].stdout_size); 88 p = memstr (buffer .buffer, "STDERR", buffer.Nbuffer);85 p = memstr (buffer[0].buffer, "STDERR", buffer[0].Nbuffer); 89 86 sscanf (p, "%*s %d", &job[0].stderr_size); 90 87 … … 98 95 PutHost (host, PCONTROL_HOST_BUSY, STACK_BOTTOM); 99 96 PutJobSetState (job, PCONTROL_JOB_DONE, STACK_BOTTOM, outstate); 100 gettimeofday (&job[0].stop, (void *)NULL);97 gettimeofday (&job[0].stop, NULL); 101 98 job[0].dtime = DTIME(job[0].stop, job[0].start); 102 FreeIOBuffer (&buffer);103 99 return (TRUE); 104 100 } -
trunk/Ohana/src/opihi/pcontrol/CheckDoneHost.c
r10668 r17475 5 5 6 6 int status; 7 char *p;8 IOBuffer buffer;9 7 10 8 ASSERT (host, "host not set"); 11 9 12 InitIOBuffer (&buffer, 0x100); 13 14 status = PclientCommand (host, "reset", PCLIENT_PROMPT, &buffer); 10 status = PclientCommand (host, "reset", PCLIENT_PROMPT, PCONTROL_RESP_CHECK_DONE_HOST); 15 11 16 12 /* check on success of pclient command */ … … 18 14 case PCLIENT_DOWN: 19 15 if (DEBUG || VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname); 20 /* DONE host does not have an incomplete job */21 16 HarvestHost (host[0].pid); 22 17 PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM); 23 FreeIOBuffer (&buffer);24 18 return (FALSE); 19 /* DONE host does not have an incomplete job */ 25 20 // XXX do we need to close the connection? 26 21 27 case PCLIENT_HUNG:28 // don't do anything drastic, just try again later29 PutHost (host, PCONTROL_HOST_DONE, STACK_BOTTOM);30 if (DEBUG || VerboseMode()) gprint (GP_ERR, "host %s is not responding\n", host[0].hostname);31 FreeIOBuffer (&buffer);32 return (FALSE);33 34 22 case PCLIENT_GOOD: 35 if (VerboseMode()) gprint (GP_ERR, "message received (CheckDoneHost)\n"); 36 break; 23 if (VerboseMode()) gprint (GP_ERR, "checking done host %s\n", host[0].hostname); 24 PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM); 25 return (TRUE); 37 26 38 27 default: 39 28 ABORT ("unknown status for pclient command"); 40 29 } 30 ABORT ("should not reach here (CheckDoneHost)"); 31 } 32 33 int CheckDoneHostResponse (Host *host) { 34 35 int status; 36 char *p; 37 IOBuffer *buffer; 38 39 /* job must have assigned host */ 40 ASSERT (host, "missing host"); 41 ASSERT (host[0].job, "missing job"); 42 buffer = &host[0].comms_buffer; 41 43 42 44 /** successful command, examine result **/ 43 p = memstr (buffer .buffer, "STATUS", buffer.Nbuffer);45 p = memstr (buffer[0].buffer, "STATUS", buffer[0].Nbuffer); 44 46 ASSERT (p != NULL, "missing STATUS in pclient message (CheckDoneHost)"); 45 47 … … 52 54 if (DEBUG || VerboseMode()) gprint (GP_ERR, "reset failed\n"); 53 55 PutHost (host, PCONTROL_HOST_DONE, STACK_BOTTOM); 54 FreeIOBuffer (&buffer);55 56 return (FALSE); 56 57 … … 59 60 if (DEBUG || VerboseMode()) gprint (GP_ERR, "successful reset\n"); 60 61 PutHost (host, PCONTROL_HOST_IDLE, STACK_BOTTOM); 61 FreeIOBuffer (&buffer);62 62 return (FALSE); 63 63 … … 67 67 ABORT ("should not reach here (CheckDoneHost)"); 68 68 } 69 70 /** probably need to flush the buffer before the command **/71 /** need to add timeout check here **/ -
trunk/Ohana/src/opihi/pcontrol/CheckHost.c
r10652 r17475 1 1 # include "pcontrol.h" 2 2 3 // if the host has a job, we skip it (down or crash state will be caught elsewhere) 4 // in fact, just touch the IDLE hosts, not the BUSY hosts? 3 5 int CheckHost (Host *host) { 4 6 5 7 int status; 6 IOBuffer buffer;7 8 8 9 ASSERT (host, "host not set"); … … 18 19 } 19 20 20 InitIOBuffer (&buffer, 0x100); 21 // the argument to echo (OK) is the expected response below in CheckHostResponse 22 status = PclientCommand (host, "echo OK", PCLIENT_PROMPT, PCONTROL_RESP_CHECK_HOST); 21 23 22 status = PclientCommand (host, "echo OK", PCLIENT_PROMPT, &buffer);23 24 switch (status) { 24 case 0:25 case PCLIENT_DOWN: 25 26 if (VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname); 26 27 HarvestHost (host[0].pid); 27 28 PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM); 28 FreeIOBuffer (&buffer);29 29 return (FALSE); 30 30 31 case -1: 32 if (VerboseMode()) gprint (GP_ERR, "host %s is not responding\n", host[0].hostname); 33 /*** do we mark this in some way (HUNG) ? ***/ 34 PutHost (host, host[0].stack, STACK_BOTTOM); 35 FreeIOBuffer (&buffer); 36 return (FALSE); 31 case PCLIENT_GOOD: 32 PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM); 33 return (TRUE); 37 34 38 35 default: 39 PutHost (host, host[0].stack, STACK_BOTTOM); 40 FreeIOBuffer (&buffer); 41 return (TRUE); 36 ABORT ("unknown status for pclient command"); 42 37 } 43 ABORT ("should not reach here (Check Host)");38 ABORT ("should not reach here (CheckHost)"); 44 39 } 45 40 46 // if the host has a job, we skip it (down or crash state will be caught elsewhere) 47 // in fact, just touch the IDLE hosts, not the BUSY hosts? 41 int CheckHostResponse (Host *host) { 42 43 IOBuffer *buffer; 44 45 /* we only check IDLE hosts without jobs */ 46 ASSERT (host, "missing host"); 47 buffer = &host[0].comms_buffer; 48 49 // XXX check on the value of the response? (OK) 50 51 PutHost (host, PCONTROL_HOST_IDLE, STACK_BOTTOM); 52 return (TRUE); 53 } -
trunk/Ohana/src/opihi/pcontrol/CheckSystem.c
r16589 r17475 99 99 100 100 if (RunLevel != PCONTROL_RUN_NONE) { 101 Nhostchecks += CheckRespHosts(0.020); /* check for incoming messages */ 102 TestCheckPoint (); 101 103 Nhostchecks += CheckDoneHosts(0.020); /* reset the host */ 102 104 TestCheckPoint (); … … 258 260 } 259 261 262 int CheckRespHosts (float MaxDelay) { 263 264 struct timeval start, stop; 265 int i, Nobject; 266 Stack *stack; 267 Host *host; 268 float dtime; 269 270 /* Loop through objects on the stack, no more than once. see note above */ 271 stack = GetHostStack (PCONTROL_HOST_RESP); 272 Nobject = stack[0].Nobject; 273 274 /* always allow at least one test */ 275 gettimeofday (&start, (void *) NULL); 276 dtime = 0.0; 277 for (i = 0; (i < Nobject) && (dtime < MaxDelay); i++) { 278 host = PullStackByLocation (stack, STACK_TOP); 279 if (host == NULL) break; 280 CheckRespHost (host); 281 gettimeofday (&stop, (void *) NULL); 282 dtime = DTIME (stop, start); 283 } 284 if (DEBUG) gprint (GP_ERR, "checked %d hosts\n", i); 285 return (i); 286 } 287 260 288 int CheckDoneHosts (float MaxDelay) { 261 289 -
trunk/Ohana/src/opihi/pcontrol/HostOps.c
r16472 r17475 1 1 # include "pcontrol.h" 2 2 3 Stack *HostPool_Idle; 4 Stack *HostPool_Busy; 5 Stack *HostPool_Done; 6 Stack *HostPool_Down; 7 Stack *HostPool_Off; 3 Stack *HostPool_Idle; // these hosts are waiting for something to do 4 Stack *HostPool_Busy; // these hosts are working 5 Stack *HostPool_Resp; // these hosts are trying to respond 6 Stack *HostPool_Done; // these hosts have finished a job 7 Stack *HostPool_Down; // these hosts are not responding 8 Stack *HostPool_Off; // these hosts are off 8 9 9 10 void InitHostStacks () { 10 11 HostPool_Idle = InitStack (); 11 12 HostPool_Busy = InitStack (); 13 HostPool_Resp = InitStack (); 12 14 HostPool_Done = InitStack (); 13 15 HostPool_Down = InitStack (); … … 26 28 FreeHostStack (HostPool_Idle); 27 29 FreeHostStack (HostPool_Busy); 30 FreeHostStack (HostPool_Resp); 28 31 FreeHostStack (HostPool_Done); 29 32 FreeHostStack (HostPool_Down); … … 35 38 case PCONTROL_HOST_IDLE: return ("IDLE"); 36 39 case PCONTROL_HOST_DOWN: return ("DOWN"); 40 case PCONTROL_HOST_RESP: return ("RESP"); 37 41 case PCONTROL_HOST_DONE: return ("DONE"); 38 42 case PCONTROL_HOST_BUSY: return ("BUSY"); … … 48 52 case PCONTROL_HOST_IDLE: return (HostPool_Idle); 49 53 case PCONTROL_HOST_DOWN: return (HostPool_Down); 54 case PCONTROL_HOST_RESP: return (HostPool_Resp); 50 55 case PCONTROL_HOST_DONE: return (HostPool_Done); 51 56 case PCONTROL_HOST_BUSY: return (HostPool_Busy); … … 60 65 if (!strcasecmp (name, "idle")) return (HostPool_Idle); 61 66 if (!strcasecmp (name, "down")) return (HostPool_Down); 67 if (!strcasecmp (name, "resp")) return (HostPool_Resp); 62 68 if (!strcasecmp (name, "done")) return (HostPool_Done); 63 69 if (!strcasecmp (name, "busy")) return (HostPool_Busy); … … 94 100 if (host != NULL) return (host); 95 101 102 *StackID = PCONTROL_HOST_RESP; 103 host = PullHostFromStackByID (*StackID, HostID); 104 if (host != NULL) return (host); 105 96 106 *StackID = PCONTROL_HOST_DONE; 97 107 host = PullHostFromStackByID (*StackID, HostID); … … 120 130 121 131 *StackID = PCONTROL_HOST_DOWN; 132 host = PullHostFromStackByName (*StackID, name); 133 if (host != NULL) return (host); 134 135 *StackID = PCONTROL_HOST_RESP; 122 136 host = PullHostFromStackByName (*StackID, name); 123 137 if (host != NULL) return (host); … … 180 194 host[0].nexttry.tv_usec = 0; 181 195 196 InitIOBuffer (&host[0].comms_buffer, 0x100); 197 host[0].response_state = PCONTROL_RESP_NONE; 198 host[0].response = NULL; 199 182 200 host[0].markoff = FALSE; 183 201 host[0].job = NULL; … … 187 205 188 206 void DelHost (Host *host) { 207 FreeIOBuffer (&host[0].comms_buffer); 189 208 FREE (host[0].hostname); 190 209 FREE (host[0].job); -
trunk/Ohana/src/opihi/pcontrol/KillJob.c
r10661 r17475 3 3 int KillJob (Job *job, Host *host) { 4 4 5 IOBuffer buffer;6 5 int status; 7 char *p;8 6 9 7 ASSERT (host != NULL, "host missing"); 10 8 ASSERT (job != NULL, "job missing"); 11 12 9 ASSERT (host == (Host *) job[0].host, "invalid host"); 13 10 ASSERT (job == (Job *) host[0].job, "invalid job"); 14 11 15 InitIOBuffer (&buffer, 0x100); 16 17 status = PclientCommand (host, "reset", PCLIENT_PROMPT, &buffer); 12 status = PclientCommand (host, "reset", PCLIENT_PROMPT, PCONTROL_RESP_KILL_JOB); 18 13 19 14 /* check on success of pclient command */ 20 15 switch (status) { 21 16 case PCLIENT_DOWN: 22 HarvestHost (host[0].pid);23 17 // unlink host & job 18 if (VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname); 24 19 job[0].host = NULL; 25 20 host[0].job = NULL; 21 HarvestHost (host[0].pid); 26 22 PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM); 27 23 PutJob (job, PCONTROL_JOB_CRASH, STACK_BOTTOM); 28 FreeIOBuffer (&buffer);29 24 return (FALSE); 30 25 31 case PCLIENT_ HUNG:32 // don't do anything drastic, just keep trying33 // XXX move to which stack??34 gprint (GP_ERR, "client is busy, not responding (KillJob)");35 FreeIOBuffer (&buffer);26 case PCLIENT_GOOD: 27 if (VerboseMode()) gprint (GP_ERR, "kill job on host %s\n", host[0].hostname); 28 FlushIOBuffer (&host[0].comms_buffer); 29 PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM); 30 PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM); 36 31 return (TRUE); 37 38 case PCLIENT_GOOD:39 if (VerboseMode()) gprint (GP_ERR, "message received (KillJob)\n");40 break;41 32 42 33 default: 43 34 ABORT ("unknown status for pclient command"); 44 35 } 36 } 45 37 46 /** host is up, need to parse message **/ 47 p = memstr (buffer.buffer, "STATUS", buffer.Nbuffer); 38 int KillJobResponse (Host *host) { 39 40 int status; 41 char *p; 42 IOBuffer *buffer; 43 Job *job; 44 45 ASSERT (host != NULL, "host missing"); 46 ASSERT (host[0].job, "missing job"); 47 buffer = &host[0].comms_buffer; 48 job = (Job *) host[0].job; 49 50 /** check on response to pclient command **/ 51 p = memstr (buffer[0].buffer, "STATUS", buffer[0].Nbuffer); 48 52 ASSERT (p != NULL, "missing STATUS in pclient message"); 49 if (VerboseMode()) gprint (GP_ERR, "client message: %s\n", buffer .buffer);53 if (VerboseMode()) gprint (GP_ERR, "client message: %s\n", buffer[0].buffer); 50 54 51 55 sscanf (p, "%*s %d", &status); 52 FreeIOBuffer (&buffer);53 56 gprint (GP_ERR, "client status: %d\n", status); 54 57 … … 62 65 return (FALSE); 63 66 case 1: 64 gprint (GP_ERR, "kill ingjob %s on %s\n", job[0].argv[0], host[0].hostname);67 gprint (GP_ERR, "killed job %s on %s\n", job[0].argv[0], host[0].hostname); 65 68 // unlink host & job 66 69 job[0].host = NULL; -
trunk/Ohana/src/opihi/pcontrol/Makefile
r12842 r17475 13 13 LIBS1 = -lkapa -lFITS -lohana 14 14 LIBS2 = -lbasiccmd -lshell -ldata 15 FULL_CFLAGS = $(BASE_CFLAGS) 15 FULL_CFLAGS = $(BASE_CFLAGS) -Wall -Werror 16 16 FULL_CPPFLAGS = $(BASE_CPPFLAGS) 17 17 FULL_LDFLAGS = $(LIBS1) $(LIBS2) $(BASE_LDFLAGS) … … 27 27 $(SRC)/CheckBusyJob.$(ARCH).o \ 28 28 $(SRC)/CheckDoneHost.$(ARCH).o \ 29 $(SRC)/CheckRespHost.$(ARCH).o \ 29 30 $(SRC)/CheckDoneJob.$(ARCH).o \ 30 31 $(SRC)/CheckHost.$(ARCH).o \ … … 38 39 $(SRC)/StackOps.$(ARCH).o \ 39 40 $(SRC)/PclientCommand.$(ARCH).o \ 40 $(SRC)/ResetJob.$(ARCH).o \41 41 $(SRC)/StartHost.$(ARCH).o \ 42 42 $(SRC)/StopHosts.$(ARCH).o \ -
trunk/Ohana/src/opihi/pcontrol/PclientCommand.c
r17474 r17475 1 1 # include "pcontrol.h" 2 # define PCLIENT_TIMEOUT 200002 # define PCLIENT_TIMEOUT 100 3 3 4 int PclientCommand (Host *host, char *command, char *response, IOBuffer *buffer) { 4 // send a command and check for errors; ignore output 5 int PclientCommand (Host *host, char *command, char *response, HostResp response_state) { 5 6 6 int i;7 7 int status; 8 char *line; 9 struct timespec request, remain; 8 IOBuffer buffer; 10 9 11 10 ASSERT (host != NULL, "host missing"); 12 ASSERT (buffer != NULL, "buffer missing");13 11 ASSERT (command != NULL, "command missing"); 14 ASSERT (response != NULL, "response missing");15 16 /* avoid blocking on read, test every 100 usec, up to 2.0 sec */17 request.tv_sec = 0;18 request.tv_nsec = 100000;19 12 20 13 // flush the stdout and stderr buffers here 21 ReadtoIOBuffer ( buffer, host[0].stdout_fd);22 FlushIOBuffer ( buffer);23 ReadtoIOBuffer ( buffer, host[0].stderr_fd);24 FlushIOBuffer ( buffer);14 ReadtoIOBuffer (&buffer, host[0].stdout_fd); 15 FlushIOBuffer (&buffer); 16 ReadtoIOBuffer (&buffer, host[0].stderr_fd); 17 FlushIOBuffer (&buffer); 25 18 26 19 /* send command to client (adding on \n) */ … … 33 26 } 34 27 28 // prepare host to accept response 29 host[0].response_state = response_state; 30 host[0].response = response; 31 FlushIOBuffer (&host[0].comms_buffer); 32 33 return (PCLIENT_GOOD); 34 } 35 36 // check for response; message must end with specified string. 37 // accumulate the response in the buffer 38 int PclientResponse (Host *host, char *response, IOBuffer *buffer) { 39 40 int i; 41 int status; 42 char *line; 43 struct timespec request, remain; 44 45 ASSERT (response != NULL, "response missing"); 46 ASSERT (buffer != NULL, "buffer missing"); 47 48 /* avoid blocking very long on read, test every 100 usec, up to 0.1 sec */ 49 request.tv_sec = 0; 50 request.tv_nsec = 100000; 51 35 52 /* watch for response - wait up to 1 second */ 36 53 line = NULL; 37 54 status = -1; 38 55 56 // how long does each cycle really take? 39 57 for (i = 0; (i < PCLIENT_TIMEOUT) && (status != 0) && (line == NULL); i++) { 40 58 status = ReadtoIOBuffer (buffer, host[0].stdout_fd); … … 46 64 return (PCLIENT_DOWN); 47 65 } 66 if (line == NULL) return (PCLIENT_HUNG); 48 67 if (status == -1) return (PCLIENT_HUNG); 49 if (line == NULL) return (PCLIENT_HUNG);50 68 51 69 // fprintf (stderr, "buffer.buffer: %s\n", buffer[0].buffer); 70 71 // we have detected a valid response, clear the response data 72 host[0].response_state = PCONTROL_RESP_NONE; 73 host[0].response = NULL; 74 52 75 return (PCLIENT_GOOD); 53 76 } -
trunk/Ohana/src/opihi/pcontrol/ResetJob.c
r16472 r17475 1 1 # include "pcontrol.h" 2 3 // XXX deprecated 2 4 3 5 int ResetJob (Job *job) { 4 6 5 7 int status; 6 IOBuffer buffer;7 8 Host *host; 8 9 … … 13 14 ASSERT (job != NULL, "host missing"); 14 15 15 InitIOBuffer (&buffer, 0x100);16 17 16 /* we have tried to reset the job; may not get status */ 18 17 job[0].Reset = TRUE; 19 18 20 status = PclientCommand (host, "reset" , PCLIENT_PROMPT, &buffer);19 status = PclientCommand (host, "reset"); 21 20 22 21 /* check on success of pclient command */ 23 22 switch (status) { 24 23 case PCLIENT_DOWN: 25 /*** different behavior for ANYHOST, WANTHOST, NEEDHOST? ***/ 26 gprint (GP_ERR, "host %s is down\n", host[0].hostname); 24 if (VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname); 27 25 HarvestHost (host[0].pid); 28 26 PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM); 29 FreeIOBuffer (&buffer);30 return (FALSE);31 32 case PCLIENT_HUNG:33 /*** should we consider a HUNG host DOWN? ***/34 gprint (GP_ERR, "host %s is not responding (ResetJob)\n", host[0].hostname);35 FreeIOBuffer (&buffer);36 27 return (FALSE); 37 28 38 29 case PCLIENT_GOOD: 39 gprint (GP_ERR, "message received (ResetJob)\n"); 40 FreeIOBuffer (&buffer); 30 host[0].response_state = PCONTROL_RESP_RESET_JOB; 31 host[0].response = PCLIENT_PROMPT; 32 FlushIOBuffer (&host[0].comms_buffer, 0x100); 33 PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM); 41 34 return (TRUE); 42 35 43 36 default: 44 gprint (GP_ERR, "unknown status for pclient command: programming error\n"); 45 pcontrol_exit (55); 37 ABORT ("unknown status for pclient command"); 46 38 } 39 ABORT ("should not reach here (ResetJob)"); 40 } 47 41 48 gprint (GP_ERR, "programming error in ResetJob (should not reach here)\n"); 49 FreeIOBuffer (&buffer); 50 pcontrol_exit (56); 51 return (FALSE); 42 int ResetJobResponse (Host *host) { 43 44 int status; 45 IOBuffer *buffer; 46 47 /* job must have assigned host */ 48 ASSERT (host, "missing host"); 49 ASSERT (host[0].job, "missing job"); 50 buffer = host[0].comms_buffer; 51 52 gprint (GP_ERR, "message received (ResetJob)\n"); 53 return (TRUE); 52 54 } 53 55 -
trunk/Ohana/src/opihi/pcontrol/StartJob.c
r11388 r17475 1 1 # include "pcontrol.h" 2 2 3 // job and host are bound together (why pass in both?) 3 4 int StartJob (Job *job, Host *host) { 4 5 5 6 int i, Nline, status; 6 char *line, *p; 7 IOBuffer buffer; 8 9 InitIOBuffer (&buffer, 0x100); 7 char *line; 10 8 11 9 /* job must have assigned host */ … … 28 26 } 29 27 30 status = PclientCommand (host, line, PCLIENT_PROMPT, &buffer);28 status = PclientCommand (host, line, PCLIENT_PROMPT, PCONTROL_RESP_START_JOB); 31 29 free (line); 32 30 … … 34 32 switch (status) { 35 33 case PCLIENT_DOWN: 34 // unlink host & job 36 35 if (VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname); 37 goto failure;38 39 case PCLIENT_HUNG:40 // we need the job start to return a valid Job ID,41 // give up on jobs which don't get started.42 // XXX we are sensitive here to the time it takes pclient43 // to fork the job. if this is slow, the client may appear to hang.44 gprint (GP_ERR, "host %s is not responding (StartJob)\n", host[0].hostname);45 if (VerboseMode()) gprint (GP_ERR, "host %s is not responding\n", host[0].hostname);46 47 // unlink host & job48 36 job[0].host = NULL; 49 37 host[0].job = NULL; 50 if (job[0].realhost) free (job[0].realhost); 51 job[0].realhost = NULL; 52 PutHost (host, PCONTROL_HOST_DONE, STACK_BOTTOM); 38 HarvestHost (host[0].pid); 39 PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM); 53 40 PutJob (job, PCONTROL_JOB_PENDING, STACK_BOTTOM); 54 FreeIOBuffer (&buffer);55 41 return (FALSE); 56 42 57 43 case PCLIENT_GOOD: 58 if (VerboseMode()) gprint (GP_ERR, "message received (StartJob)\n"); 59 break; 44 job[0].realhost = strcreate (host[0].hostname); 45 job[0].pid = -1; 46 gettimeofday (&job[0].start, (void *) NULL); 47 48 if (VerboseMode()) gprint (GP_ERR, "started job on host %s\n", host[0].hostname); 49 PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM); 50 PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM); 51 return (TRUE); 60 52 61 53 default: 62 54 ABORT ("unknown status for pclient command"); 63 55 } 56 } 57 58 // message has been received from the host, interpret results 59 int StartJobResponse (Host *host) { 60 61 int status; 62 char *p; 63 IOBuffer *buffer; 64 Job *job; 65 66 /* job must have assigned host */ 67 ASSERT (host, "missing host"); 68 ASSERT (host[0].job, "missing job"); 69 buffer = &host[0].comms_buffer; 70 job = (Job *) host[0].job; 64 71 65 72 /* check on result of pclient command */ 66 p = memstr (buffer .buffer, "STATUS", buffer.Nbuffer);73 p = memstr (buffer[0].buffer, "STATUS", buffer[0].Nbuffer); 67 74 ASSERT (p != NULL, "missing STATUS in pclient message"); 68 75 … … 71 78 case -1: 72 79 if (VerboseMode()) gprint (GP_ERR, "error in pclient child\n"); 73 goto failure; 80 // unlink host & job 81 job[0].host = NULL; 82 host[0].job = NULL; 83 HarvestHost (host[0].pid); 84 PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM); 85 PutJob (job, PCONTROL_JOB_PENDING, STACK_BOTTOM); 86 return (FALSE); 74 87 75 88 case -2: … … 80 93 81 94 default: 82 job[0].realhost = strcreate (host[0].hostname);95 if (VerboseMode()) gprint (GP_ERR, "message received (StartJobResponse)\n"); 83 96 job[0].pid = status; 84 97 PutHost (host, PCONTROL_HOST_BUSY, STACK_BOTTOM); 85 98 PutJob (job, PCONTROL_JOB_BUSY, STACK_BOTTOM); 86 FreeIOBuffer (&buffer); 87 gettimeofday (&job[0].start, (void *) NULL); 99 gettimeofday (&job[0].start, NULL); 88 100 return (TRUE); 89 101 } 102 90 103 /* we should never reach here */ 91 104 ABORT ("should not reach here (StartJob)"); 92 93 failure:94 // unlink host & job95 job[0].host = NULL;96 host[0].job = NULL;97 HarvestHost (host[0].pid);98 PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);99 PutJob (job, PCONTROL_JOB_PENDING, STACK_BOTTOM);100 FreeIOBuffer (&buffer);101 return (FALSE);102 105 } -
trunk/Ohana/src/opihi/pcontrol/StopHosts.c
r16472 r17475 72 72 73 73 int status; 74 IOBuffer buffer;75 74 76 InitIOBuffer (&buffer, 0x100); 77 status = PclientCommand (host, "exit", "Goodbye", &buffer); 78 FreeIOBuffer (&buffer); 75 status = PclientCommand (host, "exit", "Goodbye", PCONTROL_RESP_STOP_HOST); 79 76 80 77 /* check on success of pclient command */ 81 78 switch (status) { 82 79 case PCLIENT_DOWN: 83 break; 84 85 case PCLIENT_HUNG: 86 gprint (GP_ERR, "host %s is not responding\n", host[0].hostname); 80 // XXX this is the desired result in any case, so ignore it 87 81 break; 88 82 89 83 case PCLIENT_GOOD: 90 break; 84 if (VerboseMode()) gprint (GP_ERR, "stop host %s\n", host[0].hostname); 85 FlushIOBuffer (&host[0].comms_buffer); 86 PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM); 87 return (TRUE); 91 88 92 89 default: 93 gprint (GP_ERR, "unknown status for pclient command: programming error\n"); 94 pcontrol_exit (57); 90 ABORT ("unknown status for pclient command"); 95 91 } 92 ABORT ("should not reach here"); 93 } 94 95 int StopHostResponse (Host *host) { 96 96 97 HarvestHost (host[0].pid); 97 98 return (TRUE);
Note:
See TracChangeset
for help on using the changeset viewer.
