Changeset 28158
- Timestamp: May 28, 2010, 11:51:46 AM (16 years ago)
- Location: trunk/Ohana/src/opihi
- Files: 6 edited
  - include/pcontrol.h (modified) (1 diff)
  - pcontrol/CheckIdleHost.c (modified) (6 diffs)
  - pcontrol/CheckSystem.c (modified) (1 diff)
  - pcontrol/StopHosts.c (modified) (4 diffs)
  - pcontrol/pcontrol.c.in (modified) (2 diffs)
  - pcontrol/test/machines.sh (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/Ohana/src/opihi/include/pcontrol.h
r26411 r28158 273 273 int StopHostResponse (Host *host); 274 274 int HarvestHost (int pid); 275 int AddZombie(int pid); 276 int DelZombies(); 277 int CheckZombies(); 275 278 276 279 /*** JobOps.c ***/ -
trunk/Ohana/src/opihi/pcontrol/CheckIdleHost.c
r26411 r28158 8 8 static float MAX_WANTHOST_WAIT = 10.0; 9 9 static float MAX_CONNECT_TIME = 36000.0; 10 static FILE *logfile = NULL;11 10 12 11 /* if this host has been connected for too long, disconnect (will automatically reconnect) */ … … 35 34 struct timeval now; 36 35 float dtime; 37 38 if (logfile == NULL) {39 logfile = fopen ("pcontrol.log", "w");40 }41 36 42 37 ASSERT (host, "host not set"); … … 69 64 host[0].job = (struct Job *) job; 70 65 71 // if (logfile) fprintf (logfile, "start needhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);66 // gprint (GP_ERR, "start needhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]); 72 67 AddMachineJob (host, job); 73 68 … … 90 85 host[0].job = (struct Job *) job; 91 86 92 // if (logfile) fprintf (logfile, "start wanthost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);87 // gprint (GP_ERR, "start wanthost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]); 93 88 AddMachineJob (host, job); 94 89 … … 111 106 host[0].job = (struct Job *) job; 112 107 113 // if (logfile) fprintf (logfile, "start anyhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);108 // gprint (GP_ERR, "start anyhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]); 114 109 AddMachineJob (host, job); 115 110 … … 133 128 if (!CheckMachineJobs (host, job)) continue; 134 129 135 if (logfile) fprintf (logfile, "start wanthost(2) %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);130 gprint (GP_ERR, "start wanthost(2) %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]); 136 131 AddMachineJob (host, job); 137 132 -
trunk/Ohana/src/opihi/pcontrol/CheckSystem.c
r26411 r28158 107 107 TestCheckPoint (); 108 108 Nhostchecks += CheckDownHosts(0.100); /* launch the host */ 109 TestCheckPoint (); 110 CheckZombies(); /* launch the host */ 109 111 TestCheckPoint (); 110 112 } -
trunk/Ohana/src/opihi/pcontrol/StopHosts.c
r27592 r28158 1 1 # include "pcontrol.h" 2 3 // we attempt to harvest the 'down' hosts in HarvestHost. However, sometimes the 4 // child is busy and does not exit in the timeout period. we need to keep a list and 5 // try again occasionally to free up the needed resources 6 static int NUNHARVESTED = 0; 7 static int Nunharvested = 0; 8 static int *unharvested = NULL; 2 9 3 10 void DownHost (Host *host) { … … 140 147 switch (errno) { 141 148 case ECHILD: 142 gprint (GP_ERR, " unknown PID, not a child proc\n");149 gprint (GP_ERR, "HarvestHost: unknown PID (%d), not a child proc\n", pid); 143 150 gprint (GP_ERR, "did process already exit? programming error?\n"); 144 151 break; … … 152 159 153 160 case 0: 154 gprint (GP_ERR, "HarvestHost: child with connection to remote host failed to exit: may be hung"); 161 gprint (GP_ERR, "HarvestHost: child with connection to remote host failed to exit: may be hung\n"); 162 AddZombie(pid); 155 163 break; 156 164 … … 174 182 return (TRUE); 175 183 } 184 185 int AddZombie(int pid) { 186 187 if (unharvested == NULL) { 188 NUNHARVESTED = 128; 189 ALLOCATE (unharvested, int, NUNHARVESTED); 190 memset (unharvested, 0, NUNHARVESTED*sizeof(int)); 191 } 192 unharvested[Nunharvested] = pid; 193 194 Nunharvested ++; 195 if (Nunharvested >= NUNHARVESTED) { 196 NUNHARVESTED += 128; 197 REALLOCATE (unharvested, int, NUNHARVESTED); 198 memset (&unharvested[Nunharvested], 0, (NUNHARVESTED - Nunharvested)*sizeof(int)); 199 } 200 return TRUE; 201 } 202 203 int DelZombies() { 204 205 int i, j; 206 207 if (!unharvested) return FALSE; 208 if (!Nunharvested) return FALSE; 209 if (!NUNHARVESTED) return FALSE; 210 211 int *newlist = NULL; 212 213 ALLOCATE (newlist, int, NUNHARVESTED); 214 memset (newlist, 0, NUNHARVESTED*sizeof(int)); 215 216 j = 0; 217 for (i = 0; i < NUNHARVESTED; i++) { 218 if (!unharvested[i]) continue; 219 newlist[j] = unharvested[i]; 220 j++; 221 } 222 free (unharvested); 223 unharvested = newlist; 224 Nunharvested = j; 225 return 
TRUE; 226 } 227 228 int CheckZombies() { 229 230 int pid, i, result, waitstatus; 231 232 if (!unharvested) return FALSE; 233 if (!Nunharvested) return FALSE; 234 if (!NUNHARVESTED) return FALSE; 235 236 for (i = 0; i < Nunharvested; i++) { 237 if (!unharvested[i]) continue; 238 pid = unharvested[i]; 239 result = waitpid (pid, &waitstatus, WNOHANG); 240 switch (result) { 241 case -1: /* error with waitpid */ 242 switch (errno) { 243 case ECHILD: 244 gprint (GP_ERR, "CheckZombies: unknown PID (%d), not a child proc\n", pid); 245 gprint (GP_ERR, "did process already exit? programming error?\n"); 246 break; 247 case EINTR: 248 case EINVAL: 249 default: 250 perror ("unexpected error"); 251 ABORT ("CheckZombies impossible condition"); 252 } 253 break; 254 255 case 0: 256 if (VerboseMode()) gprint (GP_ERR, "CheckZombies: still waiting on %d\n", pid); 257 break; 258 259 default: 260 if (result != pid) { 261 gprint (GP_ERR, "waitpid error: mis-matched PID (%d vs %d). programming error\n", result, pid); 262 ABORT ("CheckZombies impossible condition"); 263 } 264 265 if (WIFEXITED(waitstatus)) { 266 if (VerboseMode()) gprint (GP_ERR, "child exited with status %d\n", WEXITSTATUS(waitstatus)); 267 } 268 if (WIFSIGNALED(waitstatus)) { 269 if (VerboseMode()) gprint (GP_ERR, "child crashed with status %d\n", WTERMSIG(waitstatus)); 270 } 271 if (WIFSTOPPED(waitstatus)) { 272 ABORT ("waitpid returns 'stopped': programming error\n"); 273 } 274 unharvested[i] = 0; 275 break; 276 } 277 } 278 DelZombies(); 279 return (TRUE); 280 } -
trunk/Ohana/src/opihi/pcontrol/pcontrol.c.in
r18098 r28158 1 1 # include "pcontrol.h" 2 3 # define STDERR_FILE "pcontrol.log" 2 4 3 5 # define opihi_name "PCONTROL" … … 24 26 signal (SIGTSTP, gotsignal); 25 27 signal (SIGTTIN, gotsignal); 28 29 // stdin / stdout are used for communication with pantasks. 30 // redirect stderr so various error messages are saved 31 stderr = freopen (STDERR_FILE, "a", stderr); 32 if (!stderr) { 33 fprintf (stdout, "failed to open %s for error output\n", STDERR_FILE); 34 exit (1); 35 } 26 36 27 37 rl_readline_name = opihi_name; -
trunk/Ohana/src/opihi/pcontrol/test/machines.sh
r26411 r28158 6 6 host add pikake 7 7 host add pikake 8 9 8 host add ipp022 10 9 host add ipp022 … … 26 25 end 27 26 27 macro load.hosts.zombie 28 parameters connect = 2.0 29 30 host add pikake 31 host add pikake 32 host add ipp022 33 host add ipp022 34 host add ipp022 35 host add ipp022 36 37 machines 38 end 39 40 macro load.jobs.zombie 41 job sleep 10 42 job sleep 10 43 job sleep 10 44 job sleep 10 45 job sleep 10 46 job sleep 10 47 job sleep 10 48 job sleep 10 49 end 50
Note: See TracChangeset for help on using the changeset viewer.
