IPP Software Navigation Tools IPP Links Communication Pan-STARRS Links

Changeset 25872


Ignore:
Timestamp:
Oct 18, 2009, 10:29:11 AM (17 years ago)
Author:
eugene
Message:

updated pcontrol to restart pclients that have been connected for too long

Location:
trunk/Ohana/src/opihi
Files:
9 edited

Legend:

Unmodified
Added
Removed
  • trunk/Ohana/src/opihi/include/pcontrol.h

    r21379 r25872  
    6060  PCONTROL_RESP_KILL_JOB,
    6161  PCONTROL_RESP_STOP_HOST,
     62  PCONTROL_RESP_DOWN_HOST,
    6263} HostResp;
    6364
     
    129130  int         pid;
    130131  HostStat    stack;
    131   Ptime       lasttry;
    132   Ptime       nexttry;
    133132  IDtype      HostID;
    134133  IOBuffer    comms_buffer;
    135134  char       *response;
    136135  HostResp    response_state;
     136  Ptime       last_start_try; // last (UNIX) time we attempted to connect to this host (0 on success)
     137  Ptime       next_start_try; // next (UNIX) time we should attempt to connect to this host (0 on success)
     138  Ptime       connect_time; // (UNIX) time we connected to this host
    137139  struct Job *job;
    138140} Host;
     
    210212
    211213int StartHost (Host *host);
     214int CheckResetHost (Host *host);
    212215int CheckIdleHost (Host *host, int Stage);
    213216int CheckDoneJob (Job *job, Host *host);
     
    253256/*** StopHosts.c ***/
    254257void   DownHost (Host *host);
     258int    DownHosts (void);
     259int    DownHostResponse (Host *host);
    255260void   OffHost (Host *host);
    256 int    DownHosts (void);
    257 int    StopHost (Host *host);
     261int    StopHost (Host *host, int mode);
    258262int    StopHostResponse (Host *host);
    259263int    HarvestHost (int pid);
  • trunk/Ohana/src/opihi/pcontrol/CheckHost.c

    r18098 r25872  
    1414  if (host[0].markoff) {
    1515    host[0].markoff = FALSE;
    16     StopHost (host);
     16    StopHost (host, PCONTROL_HOST_OFF);
    1717    return (TRUE);
    1818  }
  • trunk/Ohana/src/opihi/pcontrol/CheckIdleHost.c

    r23330 r25872  
    11# include "pcontrol.h"
    22
     3// The connection to the remote host is only allowed to live for MAX_CONNECT_TIME seconds.  We
     4// disconnect and reconnect if a remote host has been connected for too long.  This is a
     5// (temporary?) work-around for the problem that the remote pclient job tends to grow too large
     6// over time.
     7
     8# define MAX_CONNECT_TIME 36000.0
     9
    310static FILE *logfile = NULL;
     11
     12  /* if this host has been connected for too long, disconnect (will automatically reconnect) */
     13int CheckResetHost (Host *host) {
     14
     15  struct timeval now;
     16  float dtime;
     17
     18  /* if this host has been connected for too long, disconnect (will automatically reconnect) */
     19  gettimeofday (&now, (void *) NULL);
     20  dtime = DTIME (now, host[0].connect_time);
     21  if (dtime > MAX_CONNECT_TIME) {
     22      if (VerboseMode()) gprint (GP_ERR, "disconnect from %s\n", host[0].hostname);
     23      StopHost (host, PCONTROL_HOST_DOWN);
     24      return (TRUE);
     25  }
     26  return FALSE;
     27}
    428
    529/* the supplied host is not on a stack: it cannot be taken by the other thread */
     
    2145  if (host[0].markoff) {
    2246    host[0].markoff = FALSE;
    23     StopHost (host);
     47    StopHost (host, PCONTROL_HOST_OFF);
    2448    return (TRUE);
    2549  }
    2650   
     51  /* check if host has been connected for too long */
     52  if (CheckResetHost (host)) {
     53    return (TRUE);
     54  }
     55
    2756  /* search the JOB_PENDING stack for an appropriate job */
    2857  stack = GetJobStack (PCONTROL_JOB_PENDING);
  • trunk/Ohana/src/opihi/pcontrol/CheckRespHost.c

    r17476 r25872  
    3232      host[0].response = NULL;
    3333
    34       // host has shutdown; harvest the defunct process
     34      // if we want the host to be shut down, accept the result
     35      if (host[0].response_state == PCONTROL_RESP_DOWN_HOST) {
     36        if (DEBUG) fprintf (stderr, "PCONTROL_RESP_DOWN_HOST\n");
     37        DownHostResponse (host);
     38        return TRUE;
     39      }
     40      if (host[0].response_state == PCONTROL_RESP_STOP_HOST) {
     41        if (DEBUG) fprintf (stderr, "PCONTROL_RESP_STOP_HOST\n");
     42        StopHostResponse (host);
     43        return TRUE;
     44      }
     45
     46      // host has unexpectedly shut down; harvest the defunct process
    3547      HarvestHost (host[0].pid);
    3648      PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
     
    4759
    4860    case PCLIENT_GOOD:
    49       if (VerboseMode()) gprint (GP_ERR, "message received (CheckRespHost)\n"); 
    5061      break;
    5162
     
    8091      break;
    8192
     93    case PCONTROL_RESP_DOWN_HOST:
     94      if (DEBUG) fprintf (stderr, "PCONTROL_RESP_DOWN_HOST\n");
     95      status = DownHostResponse (host);
     96      break;
     97
    8298    case PCONTROL_RESP_STOP_HOST:
    8399      if (DEBUG) fprintf (stderr, "PCONTROL_RESP_STOP_HOST\n");
  • trunk/Ohana/src/opihi/pcontrol/CheckSystem.c

    r21379 r25872  
    357357      return (TRUE);
    358358    }
    359     dtime = DTIME (host[0].nexttry, start);
     359    dtime = DTIME (host[0].next_start_try, start);
    360360    if (dtime > 0) {
    361361      PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
     
    383383  float dtime;
    384384
    385   /* check if there are any pending jobs, otherwise skip step */
     385  /* check if there are any pending jobs */
    386386  stack = GetJobStack (PCONTROL_JOB_PENDING);
    387   if (!stack[0].Nobject) return (0);
     387
     388  /* if there are no pending jobs and we are not in STAGE_NEEDHOST, skip test */
     389  if (!stack[0].Nobject && (Stage != PCONTROL_JOB_STAGE_NEEDHOST)) return (0);
     390
     391  /* if there are no pending jobs, check for hosts that need to be reset */
     392  if (!stack[0].Nobject) {
     393    /* cycle through IDLE hosts */
     394    stack = GetHostStack (PCONTROL_HOST_IDLE);
     395    Nobject = stack[0].Nobject;
     396    for (i = 0; i < Nobject; i++) {
     397      host = PullStackByLocation (stack, STACK_TOP);
     398      if (host == NULL) break;
     399      if (CheckResetHost (host)) {
     400        return (1);
     401      }
     402      PutHost (host, PCONTROL_HOST_IDLE, STACK_BOTTOM);
     403    }
     404    return (0);
     405  }
    388406
    389407  /* Loop through objects on the stack, no more than once. see note above */
  • trunk/Ohana/src/opihi/pcontrol/HostOps.c

    r19124 r25872  
    202202  host[0].HostID      = NextHostID();
    203203
    204   host[0].lasttry.tv_sec  = 0;
    205   host[0].lasttry.tv_usec = 0;
    206   host[0].nexttry.tv_sec  = 0;
    207   host[0].nexttry.tv_usec = 0;
     204  host[0].last_start_try.tv_sec  = 0;
     205  host[0].last_start_try.tv_usec = 0;
     206  host[0].next_start_try.tv_sec  = 0;
     207  host[0].next_start_try.tv_usec = 0;
    208208
    209209  InitIOBuffer (&host[0].comms_buffer, 0x100);
  • trunk/Ohana/src/opihi/pcontrol/StartHost.c

    r12840 r25872  
    2121    if (VerboseMode()) gprint (GP_ERR, "failure to start %s\n", host[0].hostname);
    2222    gettimeofday (&now, (void *) NULL);
    23     if (ZTIME(host[0].nexttry) || ZTIME(host[0].lasttry)) {
     23    if (ZTIME(host[0].next_start_try) || ZTIME(host[0].last_start_try)) {
    2424      /* reset retry period if either is zero */
    2525      delta = RETRY_BASE;
    2626    } else {
    27       delta = 2*DTIME (host[0].nexttry, host[0].lasttry);
     27      delta = MAX(1.0, 2*DTIME (host[0].next_start_try, host[0].last_start_try));
    2828    }
    29     host[0].nexttry.tv_sec  = now.tv_sec  + delta;
    30     host[0].nexttry.tv_usec = now.tv_usec;
    31     host[0].lasttry.tv_sec  = now.tv_sec;
    32     host[0].lasttry.tv_usec = now.tv_usec;
     29    host[0].next_start_try.tv_sec  = now.tv_sec  + delta;
     30    host[0].next_start_try.tv_usec = now.tv_usec;
     31    host[0].last_start_try.tv_sec  = now.tv_sec;
     32    host[0].last_start_try.tv_usec = now.tv_usec;
    3333    PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
    3434    return (FALSE);
    3535  }
    36   host[0].nexttry.tv_sec  = 0;
    37   host[0].nexttry.tv_usec = 0;
    38   host[0].lasttry.tv_sec  = 0;
    39   host[0].lasttry.tv_usec = 0;
     36  host[0].next_start_try.tv_sec  = 0;
     37  host[0].next_start_try.tv_usec = 0;
     38  host[0].last_start_try.tv_sec  = 0;
     39  host[0].last_start_try.tv_usec = 0;
     40
     41  // set the connection time
     42  gettimeofday (&host[0].connect_time, (void *) NULL);
    4043
    4144  host[0].stdin_fd  = stdio[0];
  • trunk/Ohana/src/opihi/pcontrol/StopHosts.c

    r18098 r25872  
    6969}
    7070
    71 int StopHost (Host *host) {
     71int StopHost (Host *host, int mode) {
    7272
    7373  int       status;
    7474
    75   status = PclientCommand (host, "exit", "Goodbye", PCONTROL_RESP_STOP_HOST);
     75  switch (mode) {
     76    case PCONTROL_HOST_DOWN:
     77      status = PclientCommand (host, "exit", "Goodbye", PCONTROL_RESP_DOWN_HOST);
     78      break;
     79    case PCONTROL_HOST_OFF:
     80      status = PclientCommand (host, "exit", "Goodbye", PCONTROL_RESP_STOP_HOST);
     81      break;
     82    default:
     83      ABORT ("programming error: invalid StopHost mode");
     84  }
    7685
    7786  /* check on success of pclient command */
    7887  switch (status) {
    7988    case PCLIENT_DOWN:
    80       // XXX this is the desired result in any case, so ignore it
    81       break;
     89      PutHost (host, PCONTROL_HOST_RESP, STACK_BOTTOM);
     90      return (TRUE);
    8291
    8392    case PCLIENT_GOOD:
     
    96105
    97106  OffHost (host);
     107  HarvestHost (host[0].pid);
     108  return (TRUE);
     109}
     110
     111int DownHostResponse (Host *host) {
     112
     113  DownHost (host);
    98114  HarvestHost (host[0].pid);
    99115  return (TRUE);
     
    134150     
    135151    case 0:
    136       gprint (GP_ERR, "child did not exit??");
    137       abort ();
    138       /** put back in IDLE state? **/
     152      gprint (GP_ERR, "HarvestHost: child with connection to remote host failed to exit: may be hung");
    139153      break;
    140154
  • trunk/Ohana/src/opihi/pcontrol/host.c

    r19124 r25872  
    5353    }
    5454    /* reset time, place back on ALLHOSTS stack */
    55     host[0].nexttry.tv_sec  = 0;
    56     host[0].nexttry.tv_usec = 0;
    57     host[0].lasttry.tv_sec  = 0;
    58     host[0].lasttry.tv_usec = 0;
     55    host[0].next_start_try.tv_sec  = 0;
     56    host[0].next_start_try.tv_usec = 0;
     57    host[0].last_start_try.tv_sec  = 0;
     58    host[0].last_start_try.tv_usec = 0;
    5959    PushStack (AllHosts, STACK_BOTTOM, host, host[0].HostID, host[0].hostname);
    6060    return (TRUE);
Note: See TracChangeset for help on using the changeset viewer.