Changeset 25872
- Timestamp:
- Oct 18, 2009, 10:29:11 AM (17 years ago)
- Location:
- trunk/Ohana/src/opihi
- Files:
-
- 9 edited
-
include/pcontrol.h (modified) (4 diffs)
-
pcontrol/CheckHost.c (modified) (1 diff)
-
pcontrol/CheckIdleHost.c (modified) (2 diffs)
-
pcontrol/CheckRespHost.c (modified) (3 diffs)
-
pcontrol/CheckSystem.c (modified) (2 diffs)
-
pcontrol/HostOps.c (modified) (1 diff)
-
pcontrol/StartHost.c (modified) (1 diff)
-
pcontrol/StopHosts.c (modified) (3 diffs)
-
pcontrol/host.c (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
trunk/Ohana/src/opihi/include/pcontrol.h
r21379 r25872 60 60 PCONTROL_RESP_KILL_JOB, 61 61 PCONTROL_RESP_STOP_HOST, 62 PCONTROL_RESP_DOWN_HOST, 62 63 } HostResp; 63 64 … … 129 130 int pid; 130 131 HostStat stack; 131 Ptime lasttry;132 Ptime nexttry;133 132 IDtype HostID; 134 133 IOBuffer comms_buffer; 135 134 char *response; 136 135 HostResp response_state; 136 Ptime last_start_try; // last (UNIX) time we attempted to connect to this host (0 on success) 137 Ptime next_start_try; // next (UNIX) time we should attempt to connect to this host (0 on success) 138 Ptime connect_time; // (UNIX) time we connected to this host 137 139 struct Job *job; 138 140 } Host; … … 210 212 211 213 int StartHost (Host *host); 214 int CheckResetHost (Host *host); 212 215 int CheckIdleHost (Host *host, int Stage); 213 216 int CheckDoneJob (Job *job, Host *host); … … 253 256 /*** StopHosts.c ***/ 254 257 void DownHost (Host *host); 258 int DownHosts (void); 259 int DownHostResponse (Host *host); 255 260 void OffHost (Host *host); 256 int DownHosts (void); 257 int StopHost (Host *host); 261 int StopHost (Host *host, int mode); 258 262 int StopHostResponse (Host *host); 259 263 int HarvestHost (int pid); -
trunk/Ohana/src/opihi/pcontrol/CheckHost.c
r18098 r25872
   14     14    if (host[0].markoff) {
   15     15    host[0].markoff = FALSE;
   16         - StopHost (host);
          16  + StopHost (host, PCONTROL_HOST_OFF);
   17     17    return (TRUE);
   18     18    }
-
trunk/Ohana/src/opihi/pcontrol/CheckIdleHost.c
r23330 r25872 1 1 # include "pcontrol.h" 2 2 3 // The connection to the remote host is only allow to live for MAX_CONNECT_TIME seconds. We 4 // disconnect and reconnect if a remote host has been connected for too long. This is a 5 // (temporary?) work-around for the problem that the remote pclient job tends to grow too large 6 // over time. 7 8 # define MAX_CONNECT_TIME 36000.0 9 3 10 static FILE *logfile = NULL; 11 12 /* if this host has been connected for too long, disconnect (will automatically reconnect) */ 13 int CheckResetHost (Host *host) { 14 15 struct timeval now; 16 float dtime; 17 18 /* if this host has been connected for too long, disconnect (will automatically reconnect) */ 19 gettimeofday (&now, (void *) NULL); 20 dtime = DTIME (now, host[0].connect_time); 21 if (dtime > MAX_CONNECT_TIME) { 22 if (VerboseMode()) gprint (GP_ERR, "disconnect from %s\n", host[0].hostname); 23 StopHost (host, PCONTROL_HOST_DOWN); 24 return (TRUE); 25 } 26 return FALSE; 27 } 4 28 5 29 /* the supplied host is not on a stack: it cannot be taken by the other thread */ … … 21 45 if (host[0].markoff) { 22 46 host[0].markoff = FALSE; 23 StopHost (host );47 StopHost (host, PCONTROL_HOST_OFF); 24 48 return (TRUE); 25 49 } 26 50 51 /* check if host has been connected for too long */ 52 if (CheckResetHost (host)) { 53 return (TRUE); 54 } 55 27 56 /* search the JOB_PENDING stack for an appropriate job */ 28 57 stack = GetJobStack (PCONTROL_JOB_PENDING); -
trunk/Ohana/src/opihi/pcontrol/CheckRespHost.c
r17476 r25872 32 32 host[0].response = NULL; 33 33 34 // host has shutdown; harvest the defunct process 34 // if want the host to be shutdown, accept the result 35 if (host[0].response_state == PCONTROL_RESP_DOWN_HOST) { 36 if (DEBUG) fprintf (stderr, "PCONTROL_RESP_DOWN_HOST\n"); 37 DownHostResponse (host); 38 return TRUE; 39 } 40 if (host[0].response_state == PCONTROL_RESP_STOP_HOST) { 41 if (DEBUG) fprintf (stderr, "PCONTROL_RESP_STOP_HOST\n"); 42 StopHostResponse (host); 43 return TRUE; 44 } 45 46 // host has unexpectedly shutdown; harvest the defunct process 35 47 HarvestHost (host[0].pid); 36 48 PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM); … … 47 59 48 60 case PCLIENT_GOOD: 49 if (VerboseMode()) gprint (GP_ERR, "message received (CheckRespHost)\n");50 61 break; 51 62 … … 80 91 break; 81 92 93 case PCONTROL_RESP_DOWN_HOST: 94 if (DEBUG) fprintf (stderr, "PCONTROL_RESP_DOWN_HOST\n"); 95 status = DownHostResponse (host); 96 break; 97 82 98 case PCONTROL_RESP_STOP_HOST: 83 99 if (DEBUG) fprintf (stderr, "PCONTROL_RESP_STOP_HOST\n"); -
trunk/Ohana/src/opihi/pcontrol/CheckSystem.c
r21379 r25872
  357    357    return (TRUE);
  358    358    }
  359         - dtime = DTIME (host[0].nexttry, start);
         359  + dtime = DTIME (host[0].next_start_try, start);
  360    360    if (dtime > 0) {
  361    361    PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
    …      …
  383    383    float dtime;
  384    384
  385         - /* check if there are any pending jobs, otherwise skip step */
         385  + /* check if there are any pending jobs */
  386    386    stack = GetJobStack (PCONTROL_JOB_PENDING);
  387         - if (!stack[0].Nobject) return (0);
         387  +
         388  + /* if there are no pending jobs and we are not in STAGE_NEEDHOST, skip test */
         389  + if (!stack[0].Nobject && (Stage != PCONTROL_JOB_STAGE_NEEDHOST)) return (0);
         390  +
         391  + /* if there are no pending jobs, check for hosts that need to be reset */
         392  + if (!stack[0].Nobject) {
         393  + /* cycle through IDLE hosts */
         394  + stack = GetHostStack (PCONTROL_HOST_IDLE);
         395  + Nobject = stack[0].Nobject;
         396  + for (i = 0; i < Nobject; i++) {
         397  + host = PullStackByLocation (stack, STACK_TOP);
         398  + if (host == NULL) break;
         399  + if (CheckResetHost (host)) {
         400  + return (1);
         401  + }
         402  + PutHost (host, PCONTROL_HOST_IDLE, STACK_BOTTOM);
         403  + }
         404  + return (0);
         405  + }
  388    406
  389    407    /* Loop through objects on the stack, no more than once. see note above */
-
trunk/Ohana/src/opihi/pcontrol/HostOps.c
r19124 r25872
  202    202    host[0].HostID = NextHostID();
  203    203
  204         - host[0].lasttry.tv_sec = 0;
  205         - host[0].lasttry.tv_usec = 0;
  206         - host[0].nexttry.tv_sec = 0;
  207         - host[0].nexttry.tv_usec = 0;
         204  + host[0].last_start_try.tv_sec = 0;
         205  + host[0].last_start_try.tv_usec = 0;
         206  + host[0].next_start_try.tv_sec = 0;
         207  + host[0].next_start_try.tv_usec = 0;
  208    208
  209    209    InitIOBuffer (&host[0].comms_buffer, 0x100);
-
trunk/Ohana/src/opihi/pcontrol/StartHost.c
r12840 r25872
   21     21    if (VerboseMode()) gprint (GP_ERR, "failure to start %s\n", host[0].hostname);
   22     22    gettimeofday (&now, (void *) NULL);
   23         - if (ZTIME(host[0].nexttry) || ZTIME(host[0].lasttry)) {
          23  + if (ZTIME(host[0].next_start_try) || ZTIME(host[0].last_start_try)) {
   24     24    /* reset retry period if either is zero */
   25     25    delta = RETRY_BASE;
   26     26    } else {
   27         - delta = 2*DTIME (host[0].nexttry, host[0].lasttry);
          27  + delta = MAX(1.0, 2*DTIME (host[0].next_start_try, host[0].last_start_try));
   28     28    }
   29         - host[0].nexttry.tv_sec = now.tv_sec + delta;
   30         - host[0].nexttry.tv_usec = now.tv_usec;
   31         - host[0].lasttry.tv_sec = now.tv_sec;
   32         - host[0].lasttry.tv_usec = now.tv_usec;
          29  + host[0].next_start_try.tv_sec = now.tv_sec + delta;
          30  + host[0].next_start_try.tv_usec = now.tv_usec;
          31  + host[0].last_start_try.tv_sec = now.tv_sec;
          32  + host[0].last_start_try.tv_usec = now.tv_usec;
   33     33    PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
   34     34    return (FALSE);
   35     35    }
   36         - host[0].nexttry.tv_sec = 0;
   37         - host[0].nexttry.tv_usec = 0;
   38         - host[0].lasttry.tv_sec = 0;
   39         - host[0].lasttry.tv_usec = 0;
          36  + host[0].next_start_try.tv_sec = 0;
          37  + host[0].next_start_try.tv_usec = 0;
          38  + host[0].last_start_try.tv_sec = 0;
          39  + host[0].last_start_try.tv_usec = 0;
          40  +
          41  + // set the connection time
          42  + gettimeofday (&host[0].connect_time, (void *) NULL);
   40     43
   41     44    host[0].stdin_fd = stdio[0];
-
trunk/Ohana/src/opihi/pcontrol/StopHosts.c
r18098 r25872 69 69 } 70 70 71 int StopHost (Host *host ) {71 int StopHost (Host *host, int mode) { 72 72 73 73 int status; 74 74 75 status = PclientCommand (host, "exit", "Goodbye", PCONTROL_RESP_STOP_HOST); 75 switch (mode) { 76 case PCONTROL_HOST_DOWN: 77 status = PclientCommand (host, "exit", "Goodbye", PCONTROL_RESP_DOWN_HOST); 78 break; 79 case PCONTROL_HOST_OFF: 80 status = PclientCommand (host, "exit", "Goodbye", PCONTROL_RESP_STOP_HOST); 81 break; 82 default: 83 ABORT ("programming error: invalid StopHost mode"); 84 } 76 85 77 86 /* check on success of pclient command */ 78 87 switch (status) { 79 88 case PCLIENT_DOWN: 80 // XXX this is the desired result in any case, so ignore it81 break;89 PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM); 90 return (TRUE); 82 91 83 92 case PCLIENT_GOOD: … … 96 105 97 106 OffHost (host); 107 HarvestHost (host[0].pid); 108 return (TRUE); 109 } 110 111 int DownHostResponse (Host *host) { 112 113 DownHost (host); 98 114 HarvestHost (host[0].pid); 99 115 return (TRUE); … … 134 150 135 151 case 0: 136 gprint (GP_ERR, "child did not exit??"); 137 abort (); 138 /** put back in IDLE state? **/ 152 gprint (GP_ERR, "HarvestHost: child with connection to remote host failed to exit: may be hung"); 139 153 break; 140 154 -
trunk/Ohana/src/opihi/pcontrol/host.c
r19124 r25872
   53     53    }
   54     54    /* reset time, place back on ALLHOSTS stack */
   55         - host[0].nexttry.tv_sec = 0;
   56         - host[0].nexttry.tv_usec = 0;
   57         - host[0].lasttry.tv_sec = 0;
   58         - host[0].lasttry.tv_usec = 0;
          55  + host[0].next_start_try.tv_sec = 0;
          56  + host[0].next_start_try.tv_usec = 0;
          57  + host[0].last_start_try.tv_sec = 0;
          58  + host[0].last_start_try.tv_usec = 0;
   59     59    PushStack (AllHosts, STACK_BOTTOM, host, host[0].HostID, host[0].hostname);
   60     60    return (TRUE);
Note:
See TracChangeset
for help on using the changeset viewer.
