IPP Software Navigation Tools IPP Links Communication Pan-STARRS Links

Changeset 28158


Ignore:
Timestamp:
May 28, 2010, 11:51:46 AM (16 years ago)
Author:
eugene
Message:

redirect pcontrol stderr to log file; add bits to harvest zombies that may accumulate

Location:
trunk/Ohana/src/opihi
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • trunk/Ohana/src/opihi/include/pcontrol.h

    r26411 r28158  
    273273int    StopHostResponse (Host *host);
    274274int    HarvestHost (int pid);
     275int    AddZombie(int pid);
     276int    DelZombies();
     277int    CheckZombies();
    275278
    276279/*** JobOps.c ***/
  • trunk/Ohana/src/opihi/pcontrol/CheckIdleHost.c

    r26411 r28158  
    88static float MAX_WANTHOST_WAIT = 10.0;
    99static float MAX_CONNECT_TIME = 36000.0;
    10 static FILE *logfile = NULL;
    1110
    1211  /* if this host has been connected for too long, disconnect (will automatically reconnect) */
     
    3534  struct timeval now;
    3635  float dtime;
    37 
    38   if (logfile == NULL) {
    39     logfile = fopen ("pcontrol.log", "w");
    40   }
    4136
    4237  ASSERT (host, "host not set");
     
    6964    host[0].job = (struct Job *) job;
    7065
    71     // if (logfile) fprintf (logfile, "start needhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
     66    // gprint (GP_ERR, "start needhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
    7267    AddMachineJob (host, job);
    7368
     
    9085    host[0].job = (struct Job *) job;
    9186
    92     // if (logfile) fprintf (logfile, "start wanthost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
     87    // gprint (GP_ERR, "start wanthost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
    9388    AddMachineJob (host, job);
    9489
     
    111106    host[0].job = (struct Job *) job;
    112107
    113     // if (logfile) fprintf (logfile, "start  anyhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
     108    // gprint (GP_ERR, "start  anyhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
    114109    AddMachineJob (host, job);
    115110
     
    133128    if (!CheckMachineJobs (host, job)) continue;
    134129
    135     if (logfile) fprintf (logfile, "start wanthost(2) %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
     130    gprint (GP_ERR, "start wanthost(2) %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
    136131    AddMachineJob (host, job);
    137132
  • trunk/Ohana/src/opihi/pcontrol/CheckSystem.c

    r26411 r28158  
    107107      TestCheckPoint ();
    108108      Nhostchecks += CheckDownHosts(0.100); /* launch the host */
     109      TestCheckPoint ();
     110      CheckZombies(); /* retry waitpid on children that had not exited when first harvested */
    109111      TestCheckPoint ();
    110112    }
  • trunk/Ohana/src/opihi/pcontrol/StopHosts.c

    r27592 r28158  
    11# include "pcontrol.h"
     2
     3// we attempt to harvest the 'down' hosts in HarvestHost.  However, sometimes the
     4// child is busy and does not exit in the timeout period.  we need to keep a list and
     5// try again occasionally to free up the needed resources
     6static int NUNHARVESTED = 0;
     7static int Nunharvested = 0;
     8static int *unharvested = NULL;
    29
    310void DownHost (Host *host) {
     
    140147      switch (errno) {
    141148        case ECHILD:
    142           gprint (GP_ERR, "unknown PID, not a child proc\n");
     149          gprint (GP_ERR, "HarvestHost: unknown PID (%d), not a child proc\n", pid);
    143150          gprint (GP_ERR, "did process already exit?  programming error?\n");
    144151          break;
     
    152159     
    153160    case 0:
    154       gprint (GP_ERR, "HarvestHost: child with connection to remote host failed to exit: may be hung");
     161      gprint (GP_ERR, "HarvestHost: child with connection to remote host failed to exit: may be hung\n");
     162      AddZombie(pid);
    155163      break;
    156164
     
    174182  return (TRUE);
    175183}
     184
     185int AddZombie(int pid) {
     186
     187  if (unharvested == NULL) {
     188    NUNHARVESTED = 128;
     189    ALLOCATE (unharvested, int, NUNHARVESTED);
     190    memset (unharvested, 0, NUNHARVESTED*sizeof(int));
     191  }
     192  unharvested[Nunharvested] = pid;
     193
     194  Nunharvested ++;
     195  if (Nunharvested >= NUNHARVESTED) {
     196    NUNHARVESTED += 128;
     197    REALLOCATE (unharvested, int, NUNHARVESTED);
     198    memset (&unharvested[Nunharvested], 0, (NUNHARVESTED - Nunharvested)*sizeof(int));
     199  }
     200  return TRUE;
     201}
     202
     203int DelZombies() {
     204
     205  int i, j;
     206
     207  if (!unharvested) return FALSE;
     208  if (!Nunharvested) return FALSE;
     209  if (!NUNHARVESTED) return FALSE;
     210
     211  int *newlist = NULL;
     212
     213  ALLOCATE (newlist, int, NUNHARVESTED);
     214  memset (newlist, 0, NUNHARVESTED*sizeof(int));
     215
     216  j = 0;
     217  for (i = 0; i < NUNHARVESTED; i++) {
     218    if (!unharvested[i]) continue;
     219    newlist[j] = unharvested[i];
     220    j++;
     221  }
     222  free (unharvested);
     223  unharvested = newlist;
     224  Nunharvested = j;
     225  return TRUE;
     226}
     227
     228int CheckZombies() {
     229
     230  int pid, i, result, waitstatus;
     231
     232  if (!unharvested) return FALSE;
     233  if (!Nunharvested) return FALSE;
     234  if (!NUNHARVESTED) return FALSE;
     235
     236  for (i = 0; i < Nunharvested; i++) {
     237    if (!unharvested[i]) continue;
     238    pid = unharvested[i];
     239    result = waitpid (pid, &waitstatus, WNOHANG);
     240    switch (result) {
     241      case -1:  /* error with waitpid */
     242        switch (errno) {
     243          case ECHILD:
     244            gprint (GP_ERR, "CheckZombies: unknown PID (%d), not a child proc\n", pid);
     245            gprint (GP_ERR, "did process already exit?  programming error?\n");
     246            break;
     247          case EINTR:
     248          case EINVAL:
     249          default:
     250            perror ("unexpected error");
     251            ABORT ("CheckZombies impossible condition");
     252        }
     253        break;
     254     
     255      case 0:
     256        if (VerboseMode()) gprint (GP_ERR, "CheckZombies: still waiting on %d\n", pid);
     257        break;
     258
     259      default:
     260        if (result != pid) {
     261          gprint (GP_ERR, "waitpid error: mis-matched PID (%d vs %d).  programming error\n", result, pid);
     262          ABORT ("CheckZombies impossible condition");
     263        }
     264       
     265        if (WIFEXITED(waitstatus)) {
     266          if (VerboseMode()) gprint (GP_ERR, "child exited with status %d\n", WEXITSTATUS(waitstatus));
     267        }
     268        if (WIFSIGNALED(waitstatus)) {
     269          if (VerboseMode()) gprint (GP_ERR, "child crashed with status %d\n", WTERMSIG(waitstatus));
     270        }
     271        if (WIFSTOPPED(waitstatus)) {
     272          ABORT ("waitpid returns 'stopped': programming error\n");
     273        }
     274        unharvested[i] = 0;
     275        break;
     276    }
     277  }
     278  DelZombies();
     279  return (TRUE);
     280}
  • trunk/Ohana/src/opihi/pcontrol/pcontrol.c.in

    r18098 r28158  
    11# include "pcontrol.h"
     2
     3# define STDERR_FILE "pcontrol.log"
    24
    35# define opihi_name "PCONTROL"
     
    2426  signal (SIGTSTP, gotsignal);
    2527  signal (SIGTTIN, gotsignal);
     28
     29  // stdin / stdout are used for communication with pantasks.
     30  // redirect stderr so various error messages are saved
     31  stderr = freopen (STDERR_FILE, "a", stderr);
     32  if (!stderr) {
     33      fprintf (stdout, "failed to open %s for error output\n", STDERR_FILE);
     34      exit (1);
     35  }
    2636
    2737  rl_readline_name = opihi_name;
  • trunk/Ohana/src/opihi/pcontrol/test/machines.sh

    r26411 r28158  
    66  host add pikake
    77  host add pikake
    8 
    98  host add ipp022
    109  host add ipp022
     
    2625end
    2726
     27macro load.hosts.zombie
     28  parameters connect = 2.0
     29
     30  host add pikake
     31  host add pikake
     32  host add ipp022
     33  host add ipp022
     34  host add ipp022
     35  host add ipp022
     36
     37  machines
     38end
     39
     40macro load.jobs.zombie
     41  job sleep 10
     42  job sleep 10
     43  job sleep 10
     44  job sleep 10
     45  job sleep 10
     46  job sleep 10
     47  job sleep 10
     48  job sleep 10
     49end
     50
Note: See TracChangeset for help on using the changeset viewer.