IPP Software Navigation Tools IPP Links Communication Pan-STARRS Links

Changeset 21379


Ignore:
Timestamp:
Feb 6, 2009, 10:37:19 AM (17 years ago)
Author:
eugene
Message:

report pending / process time with status; cycle over all hosts for each of NEED, WANT, ANY, OLDWANT; wait up to 10 sec for WANT before giving up

Location:
trunk/Ohana/src/opihi
Files:
1 added
6 edited

Legend:

Unmodified
Added
Removed
  • trunk/Ohana/src/opihi/include/pcontrol.h

    r21153 r21379  
    2525  PCONTROL_JOB_NEEDHOST,
    2626} JobMode;
     27
     28/** job mode check stages **/
     29typedef enum {
     30  PCONTROL_JOB_STAGE_ANYHOST,
     31  PCONTROL_JOB_STAGE_WANTHOST,
     32  PCONTROL_JOB_STAGE_NEEDHOST,
     33  PCONTROL_JOB_STAGE_OLDWANT,
     34} JobCheckStage;
    2735
    2836/** job thread options values **/
     
    180188int   CheckDoneHosts (float delay);
    181189int   CheckDownHosts (float delay);
    182 int   CheckIdleHosts (float delay);
     190int   CheckIdleHosts (float delay, int Stage);
    183191int   CheckLiveHosts (float delay);
    184192int   SetRunSystem (int state);
     
    202210
    203211int StartHost (Host *host);
    204 int CheckIdleHost (Host *host);
     212int CheckIdleHost (Host *host, int Stage);
    205213int CheckDoneJob (Job *job, Host *host);
    206214int GetJobOutput (char *command, Host *host, JobOutput *output);
  • trunk/Ohana/src/opihi/pcontrol/CheckIdleHost.c

    r20324 r21379  
    44
    55/* the supplied host is not on a stack: it cannot be taken by the other thread */
    6 int CheckIdleHost (Host *host) {
     6int CheckIdleHost (Host *host, int Stage) {
    77
    88  int i;
    99  Stack *stack;
    1010  Job *job;
     11  struct timeval now;
     12  float dtime;
    1113
    1214  if (logfile == NULL) {
     
    2830 
    2931  /* look for first NEEDHOST matching this host */
    30   for (i = 0; i < stack[0].Nobject; i++) {
     32  for (i = 0; (Stage == PCONTROL_JOB_STAGE_NEEDHOST) && (i < stack[0].Nobject); i++) {
    3133    job = (Job *) stack[0].object[i];
    3234    if (job[0].mode != PCONTROL_JOB_NEEDHOST) continue;
     
    4850
    4951  /* no NEEDHOST entry, look for first WANTHOST matching this host */
    50   for (i = 0; i < stack[0].Nobject; i++) {
     52  for (i = 0; (Stage == PCONTROL_JOB_STAGE_WANTHOST) && (i < stack[0].Nobject); i++) {
    5153    job = (Job *) stack[0].object[i];
    5254    if (job[0].mode != PCONTROL_JOB_WANTHOST) continue;
     
    6870
    6971  /* no WANTHOST entry, look for first ANYHOST matching this host */
    70   for (i = 0; i < stack[0].Nobject; i++) {
     72  for (i = 0; (Stage == PCONTROL_JOB_STAGE_ANYHOST) && (i < stack[0].Nobject); i++) {
    7173    job = (Job *) stack[0].object[i];
    7274    if (job[0].mode != PCONTROL_JOB_ANYHOST) continue;
     
    8688
    8789  /* no ANYHOST entry, look for first WANTHOST with old time */
    88   /* XXX perhaps I should add this to the conditions for ANYHOST instead of
    89      running a separate loop?  ie, WANTHOST && time > X == ANYHOST */
    90   for (i = 0; i < stack[0].Nobject; i++) {
     90  for (i = 0; (Stage == PCONTROL_JOB_STAGE_OLDWANT) && (i < stack[0].Nobject); i++) {
    9191    job = (Job *) stack[0].object[i];
    9292    if (job[0].mode != PCONTROL_JOB_WANTHOST) continue;
    93     // XXX test the job age and skip if too young
     93
     94    // allow WANT jobs to wait up to 10.0 sec for the host to be free before giving up
     95    gettimeofday (&now, (void *) NULL);
     96    dtime = DTIME (now, job[0].start);
     97    if (dtime > 10.0) continue;
    9498
    9599    if (logfile) fprintf (logfile, "start wanthost(2) %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
  • trunk/Ohana/src/opihi/pcontrol/CheckSystem.c

    r18098 r21379  
    2222
    2323  /* we want to give each block a maximum allowed time */
    24   CheckIdleHosts(0.020); /* submit a new job */
     24  CheckIdleHosts(0.015, PCONTROL_JOB_STAGE_NEEDHOST); /* submit a new job */
     25  CheckIdleHosts(0.015, PCONTROL_JOB_STAGE_WANTHOST); /* submit a new job */
     26  CheckIdleHosts(0.015, PCONTROL_JOB_STAGE_ANYHOST);  /* submit a new job */
     27  CheckIdleHosts(0.015, PCONTROL_JOB_STAGE_OLDWANT);  /* submit a new job */
    2528
    2629  CheckBusyJobs(0.020);  /* get job status */
     
    109112    if (RunLevel == PCONTROL_RUN_ALL) {
    110113      // we want to give each block a maximum allowed time
    111       Nhostchecks += CheckIdleHosts(0.020); /* submit a new job (PCLIENT) */
     114      Nhostchecks += CheckIdleHosts(0.015, PCONTROL_JOB_STAGE_NEEDHOST); /* submit a new job (PCLIENT) */
     115      Nhostchecks += CheckIdleHosts(0.015, PCONTROL_JOB_STAGE_WANTHOST); /* submit a new job (PCLIENT) */
     116      Nhostchecks += CheckIdleHosts(0.015, PCONTROL_JOB_STAGE_ANYHOST); /* submit a new job (PCLIENT) */
     117      Nhostchecks += CheckIdleHosts(0.015, PCONTROL_JOB_STAGE_OLDWANT); /* submit a new job (PCLIENT) */
    112118      TestCheckPoint ();
    113119    }
     
    367373}
    368374
    369 int CheckIdleHosts (float MaxDelay) {
     375// if we have any IDLE hosts, check if there are jobs to be launched
     376// for each pass, we only check one type of job: stage = NEED, WANT, ANY, OLDWANT
     377int CheckIdleHosts (float MaxDelay, int Stage) {
    370378
    371379  struct timeval start, stop;
     
    389397    host = PullStackByLocation (stack, STACK_TOP);
    390398    if (host == NULL) break;
    391     CheckIdleHost (host);
    392     gettimeofday (&stop, (void *) NULL);
    393     dtime = DTIME (stop, start);
    394   }
     399    CheckIdleHost (host, Stage);
     400    gettimeofday (&stop, (void *) NULL);
     401    dtime = DTIME (stop, start);
     402  }
     403
    395404  if (DEBUG) gprint (GP_ERR, "checked %d hosts\n", i);
    396405  return (i);
  • trunk/Ohana/src/opihi/pcontrol/JobOps.c

    r20047 r21379  
    231231  PutJob (job, PCONTROL_JOB_PENDING, STACK_BOTTOM);
    232232
     233  // until the job is launched, we use 'start' to time how long the job is waiting on the queue
     234  gettimeofday (&job[0].start, (void *) NULL);
     235
    233236  if (VerboseMode()) gprint (GP_ERR, "added new job\n");
    234237  return (JobID);
  • trunk/Ohana/src/opihi/pcontrol/check.c

    r20047 r21379  
    33int check (int argc, char **argv) {
    44
     5  int N, Save;
    56  int JobID, HostID;
    67
     
    89  Job *job = NULL;
    910  Host *host = NULL;
     11
     12  Save = FALSE;
     13  if ((N = get_argument (argc, argv, "-save"))) {
     14    remove_argument (N, &argc, argv);
     15    Save = TRUE;
     16  }
    1017
    1118  if (argc != 3) {
     
    3946        gprint (GP_LOG, "HOSTNAME NONE\n");
    4047    }
     48
     49    if (Save) {
     50        set_str_variable ("JOB_STATUS", GetJobStackName(job[0].stack));
     51        set_int_variable ("JOB_EXITST", job[0].exit_status);
     52        set_int_variable ("JOB_STDOUT_SIZE", job[0].stdout.size);
     53        set_int_variable ("JOB_STDERR_SIZE", job[0].stderr.size);
     54        set_variable ("JOB_DTIME", job[0].dtime);
     55        set_str_variable ("JOB_HOSTNAME", job[0].hostname);
     56        if (job[0].realhost) {
     57            set_str_variable ("JOB_REALHOST", job[0].realhost);
     58        } else {
     59            set_str_variable ("JOB_REALHOST", "NONE");
     60        }
     61    }
     62
    4163    PushStack (stack, STACK_BOTTOM, job, job[0].JobID, job[0].argv[0]);
    4264    return (TRUE);
     
    5375    }
    5476    gprint (GP_LOG, "host %s\n", GetHostStackName(host[0].stack));
     77
     78    if (Save) {
     79        set_str_variable ("HOST_STATE", GetHostStackName(host[0].stack));
     80    }
     81
    5582    PushStack (stack, STACK_BOTTOM, host, host[0].HostID, host[0].hostname);
    5683    return (TRUE);
  • trunk/Ohana/src/opihi/pcontrol/status.c

    r18098 r21379  
    1717  Stack *stack;
    1818  Job *job;
     19  struct timeval now;
     20  float dtime;
    1921
    2022  stack = GetJobStack (Nstack);
     
    3436    }
    3537    gprint (GP_LOG, "%7s  ", GetJobStackName (job[0].state));
     38
     39    switch (job[0].state) {
     40        // for active jobs or pending jobs, print time since start (or create in the case of pending)
     41      case PCONTROL_JOB_PENDING:
     42      case PCONTROL_JOB_BUSY:
     43      case PCONTROL_JOB_RESP:
     44      case PCONTROL_JOB_HUNG:
     45        gettimeofday (&now, (void *) NULL);
     46        dtime = DTIME (now, job[0].start);
     47        gprint (GP_LOG, "%8.2f ", dtime);
     48        break;
     49
     50        // for finished jobs (and any other state), print the elapsed time between start and stop
     51      case PCONTROL_JOB_DONE:
     52      case PCONTROL_JOB_KILL:
     53      case PCONTROL_JOB_EXIT:
     54      case PCONTROL_JOB_CRASH:
     55      default:
     56        dtime = DTIME (job[0].stop, job[0].start);
     57        gprint (GP_LOG, "%8.2f ", dtime);
     58        break;
     59    }
     60
    3661    for (j = 0; j < job[0].argc; j++) {
    3762      gprint (GP_LOG, "%s ", job[0].argv[j]);
Note: See TracChangeset for help on using the changeset viewer.