IPP Software | Navigation Tools | IPP Links | Communication | Pan-STARRS Links

Changeset 8546


Ignore:
Timestamp:
Aug 23, 2006, 5:31:09 PM (20 years ago)
Author:
eugene
Message:

cleanup of timeouts and thread-safety issues

Location:
trunk/Ohana/src/opihi/pcontrol
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • trunk/Ohana/src/opihi/pcontrol/CheckPoint.c

    r8424 r8546  
    3030    }
    3131    pthread_mutex_unlock (&client);
    32     usleep (10000);
     32    usleep (10000); // wait for client thread to set lock
    3333  }
    3434  // put in a timeout?  (client thread not spinning...)
  • trunk/Ohana/src/opihi/pcontrol/CheckSystem.c

    r8424 r8546  
    5555void *CheckSystem_Threaded (void *data) {
    5656
    57   struct timeval now;
    58   float dtime;
     57  int Njobchecks, Nhostchecks, Nlivechecks;
     58
     59  Nlivechecks = 0;
    5960
    6061  gprintInit ();
     
    6667    // don't run the system checks if RunSystem is FALSE
    6768    if (!RunSystem) {
    68       usleep (50000);
     69      usleep (50000); // idle if RunSystem is FALSE
    6970      continue;
    7071    }
    7172
     73    Njobchecks = 0;
     74    Nhostchecks = 0;
     75
    7276    // we want to give each block a maximum allowed time
    73     CheckIdleHosts(0.020); /* submit a new job */
    74 
    75     CheckBusyJobs(0.020);  /* get job status */
    76     CheckDoneJobs(0.020);  /* harvest job stdout/stderr */
    77     CheckKillJobs(0.020);  /* harvest job stdout/stderr */
    78 
    79     CheckDoneHosts(0.020); /* reset the host */
    80     CheckDownHosts(0.100); /* launch the host */
     77    Nhostchecks += CheckIdleHosts(0.020); /* submit a new job */
     78
     79    Njobchecks  += CheckBusyJobs(0.020);  /* get job status */
     80    Njobchecks  += CheckDoneJobs(0.020);  /* harvest job stdout/stderr */
     81    Njobchecks  += CheckKillJobs(0.020);  /* harvest job stdout/stderr */
     82
     83    Nhostchecks += CheckDoneHosts(0.020); /* reset the host */
     84    Nhostchecks += CheckDownHosts(0.100); /* launch the host */
    8185
    8286    /* always allow at least one test */
     
    8488       CheckDoneJobs must depend on the size of the output buffer */
    8589
    86     gettimeofday (&now, (void *) NULL);
    87     dtime = DTIME (now, lastlive);
    88     if (dtime > 1.0) {
     90    // there is nothing on the stacks.  test the hosts and wait a bit
     91    if (!Njobchecks && !Nhostchecks) {
    8992      CheckLiveHosts(0.040);
    90       lastlive = now;
     93      usleep (100000); // idle if no jobs are waiting
    9194    }
    9295
     
    145148  }
    146149  if (DEBUG && (Nobject > 0)) gprint (GP_ERR, "checked %d of %d jobs\n", i, Nobject);
    147   return (TRUE);
     150  return (i);
    148151}
    149152
     
    182185  }
    183186  if (DEBUG && (Nobject > 0)) gprint (GP_ERR, "checked %d of %d jobs\n", i, Nobject);
    184   return (TRUE);
     187  return (i);
    185188}
    186189
     
    219222  }
    220223  if (DEBUG && (Nobject > 0)) gprint (GP_ERR, "checked %d of %d jobs\n", i, Nobject);
    221   return (TRUE);
     224  return (i);
    222225}
    223226
     
    245248  }
    246249  if (DEBUG) gprint (GP_ERR, "checked %d hosts\n", i);
    247   return (TRUE);
     250  return (i);
    248251}
    249252
     
    281284  }
    282285  if (DEBUG) gprint (GP_ERR, "checked %d hosts\n", i);
    283   return (TRUE);
     286  return (i);
    284287}
    285288
     
    294297  /* check if there are any pending jobs, otherwise skip step */
    295298  stack = GetJobStack (PCONTROL_JOB_PENDING);
    296   if (!stack[0].Nobject) return (TRUE);
     299  if (!stack[0].Nobject) return (0);
    297300
    298301  /* Loop through objects on the stack, no more than once. see note above */
     
    311314  }
    312315  if (DEBUG) gprint (GP_ERR, "checked %d hosts\n", i);
    313   return (TRUE);
     316  return (i);
    314317}
    315318
  • trunk/Ohana/src/opihi/pcontrol/Makefile

    r8424 r8546  
    2323LFLAGS  =       $(LIBS) $(LIBS2) $(LIBS1)
    2424
    25 # mana user commands and support functions ########################
     25# to build the non-threaded version, remove -lpthread and comment out
     26# the THREADED line in include/pcontrol.h
     27
     28# pcontrol user commands and support functions ########################
    2629
    2730funcs = \
  • trunk/Ohana/src/opihi/pcontrol/PclientCommand.c

    r8424 r8546  
    2020
    2121  /* is pipe still open? */
    22   if ((status == -1) && (errno == EPIPE)) return (PCLIENT_DOWN);
     22  if ((status == -1) && (errno == EPIPE)) {
     23    // gprint (GP_ERR, "pclient read gives pipe error for %s\n", command);
     24    return (PCLIENT_DOWN);
     25  }
    2326 
    2427  /* watch for response - wait up to 1 second */
     
    3033    if (status == -1) nanosleep (&request, &remain);
    3134  }
    32   if (status ==  0) return (PCLIENT_DOWN);
     35  if (status ==  0) {
     36    // gprint (GP_ERR, "pclient read returns 0 for %s\n", command);
     37    return (PCLIENT_DOWN);
     38  }
    3339  if (status == -1) return (PCLIENT_HUNG);
    3440  /* gprint (GP_ERR, "buffer.buffer: %s\n", buffer[0].buffer); */
  • trunk/Ohana/src/opihi/pcontrol/StartHost.c

    r8424 r8546  
    1414  if (VarConfig ("SHELL", "%s", shell)     == NULL) strcpy (shell, "pclient");
    1515
    16   gprint (GP_ERR, "starting host within thread %d\n", pthread_self());
     16  if (VerboseMode()) gprint (GP_ERR, "starting host within thread %d\n", pthread_self());
    1717
    1818  pid = rconnect (command, host[0].hostname, shell, stdio);
  • trunk/Ohana/src/opihi/pcontrol/StopHosts.c

    r8424 r8546  
    1717}
    1818
     19/* for use by shutdown: force machines which are up to go down
     20   wait for a little while for the client thread to take care
     21   of them
     22*/
     23   
    1924int DownHosts () {
    2025
     26  int i, Nobject, Nwait;
    2127  Stack *stack;
    2228  Host  *host;
    2329
     30  SetCheckPoint (); // ensure we can find the specified host
    2431  stack = GetHostStack (PCONTROL_HOST_IDLE);
    25   while ((host = PullStackByLocation (stack, STACK_BOTTOM)) != NULL) {
    26     StopHost (host);
    27     DownHost (host);
     32  Nobject = stack[0].Nobject;
     33  for (i = 0; i < Nobject; i++) {
     34    host = PullStackByLocation (stack, STACK_TOP);
     35    if (host == NULL) continue;
     36    host[0].markoff = TRUE;
     37    PutHost (host, PCONTROL_HOST_IDLE, STACK_BOTTOM);
    2838  }
    2939
    3040  stack = GetHostStack (PCONTROL_HOST_BUSY);
    31   while ((host = PullStackByLocation (stack, STACK_BOTTOM)) != NULL) {
    32     StopHost (host);
    33     DownHost (host);
     41  Nobject = stack[0].Nobject;
     42  for (i = 0; i < Nobject; i++) {
     43    host = PullStackByLocation (stack, STACK_TOP);
     44    if (host == NULL) continue;
     45    host[0].markoff = TRUE;
     46    PutHost (host, PCONTROL_HOST_IDLE, STACK_BOTTOM);
     47  }
     48  ClearCheckPoint ();
     49
     50  Nwait = 0;
     51  stack = GetHostStack (PCONTROL_HOST_IDLE);
     52  gprint (GP_ERR, "waiting for clients to exit");
     53  while ((Nwait < 15) && stack[0].Nobject) {
     54    gprint (GP_ERR, ".");
     55    usleep (100000); // wait for clients to exit
     56    Nwait++;
     57  }
     58  gprint (GP_ERR, "\n");
     59  if (stack[0].Nobject) {
     60    gprint (GP_ERR, "trouble shutting down all pclient instances: %d still alive\n", stack[0].Nobject);
     61  } else {
     62    gprint (GP_ERR, "done\n");
    3463  }
    3564  return (TRUE);
     
    6998  int i, result, waitstatus;
    7099
    71   gprint (GP_ERR, "harvesting within thread %d\n", pthread_self());
    72   gprint (GP_ERR, "child process %d is down, wait for exit status\n", pid);
     100  // gprint (GP_ERR, "harvesting within thread %d\n", pthread_self());
     101  // gprint (GP_ERR, "child process %d is down, wait for exit status\n", pid);
    73102 
    74103  // Loop a few times waiting for child to exit
     
    76105    result = waitpid (pid, &waitstatus, WNOHANG);
    77106    if ((result == -1) && (errno == ECHILD)) {
    78       usleep (10000);
     107      usleep (10000); // wait for child to exit
    79108      continue;
    80109    } else {
     
    109138     
    110139      if (WIFEXITED(waitstatus)) {
    111         gprint (GP_ERR, "child exited with status %d\n", WEXITSTATUS(waitstatus));
     140        if (VerboseMode()) gprint (GP_ERR, "child exited with status %d\n", WEXITSTATUS(waitstatus));
    112141      }
    113142      if (WIFSIGNALED(waitstatus)) {
    114         gprint (GP_ERR, "child crashed with status %d\n", WTERMSIG(waitstatus));
     143        if (VerboseMode()) gprint (GP_ERR, "child crashed with status %d\n", WTERMSIG(waitstatus));
    115144      }
    116145      if (WIFSTOPPED(waitstatus)) {
    117         gprint (GP_ERR, "waitpid returns 'stopped': programming error\n");
     146        if (VerboseMode()) gprint (GP_ERR, "waitpid returns 'stopped': programming error\n");
    118147        exit (1);
    119148      }
  • trunk/Ohana/src/opihi/pcontrol/pcontrol.c

    r8424 r8546  
    2525  InitHostStacks ();
    2626
     27  /* set global signal masks (these apply to all threads launched below) */
     28  signal (SIGPIPE, gotsignal);
     29  signal (SIGTSTP, gotsignal);
     30  signal (SIGTTIN, gotsignal);
     31
    2732  rl_readline_name = opihi_name;
    2833  rl_attempted_completion_function = command_completer;
    2934# ifdef THREADED
     35  SetRunSystem (TRUE);
    3036  pthread_create (&clientsThread, NULL, &CheckSystem_Threaded, NULL);
    3137  rl_event_hook = NULL;
     
    4551
    4652  /* ignore the history file.  to change this, see, eg, mana.c */
    47   signal (SIGPIPE, gotsignal);
    48   signal (SIGTSTP, gotsignal);
    49   signal (SIGTTIN, gotsignal);
    5053  return;
    5154}
  • trunk/Ohana/src/opihi/pcontrol/rconnect.c

    r7917 r8546  
    8181    status = ReadtoIOBuffer (&buffer, stdout_fd[0]);
    8282    p = memstr (buffer.buffer, "CONNECTED", buffer.Nbuffer);
    83     usleep (10000);
     83    usleep (10000); // wait for client to be connected
    8484  }
    8585  if (status == 0) goto connect_error;
Note: See TracChangeset for help on using the changeset viewer.