IPP Software Navigation Tools IPP Links Communication Pan-STARRS Links

Changeset 26411


Ignore:
Timestamp:
Dec 15, 2009, 3:42:06 PM (16 years ago)
Author:
eugene
Message:

add tracking of uniquely-named machines and feature to limit the number of unwanted-host jobs (and additional pantasks / pcontrol commands)

Location:
trunk/Ohana/src/opihi
Files:
8 added
11 edited

Legend:

Unmodified
Added
Removed
  • trunk/Ohana/src/opihi/include/pcontrol.h

    r25872 r26411  
    9898  int          requested;
    9999} JobOutput;
     100
     101/* A machine has a unique name and may have multiple Hosts (each of which can run a single job)
     102   We use this to track aspects of the analysis per machine, e.g., to limit the number of jobs
     103   desired on a single machine */
     104typedef struct {
     105  char *name;
     106  int Nhosts;                 // how many hosts are selected for this machine (whatever state)
     107  int NjobsRealhost;
     108  int NjobsWanthost;
     109} Machine;
    100110
    101111/* data to define a job */
     
    140150} Host;
    141151
     152/* the Jobs and Hosts are managed in a set of Stacks which define their state */
    142153typedef struct {
    143154  void **object;
     
    280291void   LinkJobAndHost (Job *job, Host *host);
    281292
     293/*** MachineOps.c ***/
     294void InitMachines ();
     295void FreeMachines ();
     296Machine *FindMachineByName (char *name);
     297Machine *AddMachine (char *name);
     298int DelMachine (char *name);
     299int AddMachineHost (Host *host);
     300int DelMachineHost (Host *host);
     301int AddMachineJob (Host *host, Job *job);
     302int DelMachineJob (Host *host, Job *job);
     303int PrintMachines ();
     304int CheckMachineJobs (Host *host, Job *job);
     305int GetMaxUnwantedHostJobs (void);
     306void SetMaxUnwantedHostJobs (int value);
     307
     308float GetMaxConnectTime (void);
     309void SetMaxConnectTime (float value);
     310float GetMaxWantHostWait (void);
     311void SetMaxWantHostWait (float value);
     312
    282313void pcontrol_exit (int n);
    283314
  • trunk/Ohana/src/opihi/pantasks/Makefile

    r23530 r26411  
    7373$(SRC)/controller_status.$(ARCH).o \
    7474$(SRC)/controller_jobstack.$(ARCH).o \
     75$(SRC)/controller_hoststack.$(ARCH).o \
     76$(SRC)/controller_machines.$(ARCH).o \
     77$(SRC)/controller_parameters.$(ARCH).o \
     78$(SRC)/controller_version.$(ARCH).o \
    7579$(SRC)/controller_verbose.$(ARCH).o \
    7680$(SRC)/controller_run.$(ARCH).o \
  • trunk/Ohana/src/opihi/pantasks/controller.c

    r23530 r26411  
    11# include "pantasks.h"
    22
    3 int controller_host     PROTO((int, char **));
    4 int controller_exit     PROTO((int, char **));
    5 int controller_status   PROTO((int, char **));
    6 int controller_jobstack PROTO((int, char **));
    7 int controller_verbose  PROTO((int, char **));
    8 int controller_run      PROTO((int, char **));
    9 int controller_stop     PROTO((int, char **));
    10 int controller_check    PROTO((int, char **));
    11 int controller_output   PROTO((int, char **));
    12 int controller_pulse    PROTO((int, char **));
     3int controller_host       PROTO((int, char **));
     4int controller_exit       PROTO((int, char **));
     5int controller_status     PROTO((int, char **));
     6int controller_jobstack   PROTO((int, char **));
     7int controller_hoststack  PROTO((int, char **));
     8int controller_machines   PROTO((int, char **));
     9int controller_parameters PROTO((int, char **));
     10int controller_verbose    PROTO((int, char **));
     11int controller_version    PROTO((int, char **));
     12int controller_run        PROTO((int, char **));
     13int controller_stop       PROTO((int, char **));
     14int controller_check      PROTO((int, char **));
     15int controller_output     PROTO((int, char **));
     16int controller_pulse      PROTO((int, char **));
    1317
    1418static Command controller_cmds[] = {
    15   {1, "exit",     controller_exit,     "shutdown controller"},
    16   {1, "host",     controller_host,     "define host for controller"},
    17   {1, "check",    controller_check,    "check controller host/job"},
    18   {1, "run",      controller_run,      "start controller operation / set run levels"},
    19   {1, "stop",     controller_run,      "stop controller (no disconnect)"},
    20   {1, "status",   controller_status,   "check controller status"},
    21   {1, "jobstack", controller_jobstack, "check controller status"},
    22   {1, "verbose",  controller_verbose,  "set controller verbosity"},
    23   {1, "output",   controller_output,   "print controller output"},
    24   {1, "pulse",    controller_pulse,    "set controller pulse"},
     19  {1, "check",     controller_check,      "check controller host/job"},
     20  // {1, "delete",    controller_delete,   "delete job on controller"},
     21  {1, "exit",      controller_exit,       "shutdown controller"},
     22  {1, "host",      controller_host,       "define host for controller"},
     23  {1, "hoststack", controller_hoststack,  "define host for controller"},
     24  // {1, "job",       controller_job,      "add jobs to controller"},
     25  {1, "jobstack",  controller_jobstack,   "check controller status"},
     26  {1, "machines",  controller_machines,   "print controller machine status"},
     27  {1, "parameters",controller_parameters, "modify controller parameters"},
     28  {1, "output",    controller_output,     "print controller output"},
     29  {1, "run",       controller_run,        "start controller operation / set run levels"},
     30  {1, "status",    controller_status,     "check controller status"},
     31  {1, "stop",      controller_run,        "stop controller (no disconnect)"},
     32  {1, "verbose",   controller_verbose,    "set controller verbosity"},
     33  {1, "version",   controller_version,    "show controller version"},
     34  {1, "pulse",     controller_pulse,      "set controller pulse"},
    2535};
    2636
  • trunk/Ohana/src/opihi/pcontrol/CheckDoneJob.c

    r20066 r26411  
    2828
    2929  if ((status1 == PCLIENT_DOWN) || (status2 == PCLIENT_DOWN)) {
     30
     31    // decrement the machine job-host counters
     32    DelMachineJob (host, job);
     33
    3034    // unlink host & job
    3135    if (DEBUG || VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname);
     
    5357  }
    5458
     59  // decrement the machine job-host counters
     60  DelMachineJob (host, job);
     61
    5562  /* job's state is either EXIT or CRASH (verify?) */
    5663  // unlink host & job
  • trunk/Ohana/src/opihi/pcontrol/CheckIdleHost.c

    r25872 r26411  
    11# include "pcontrol.h"
    22
    3 // The connection to the remote host is only allow to live for MAX_CONNECT_TIME seconds.  We
    4 // disconnect and reconnect if a remote host has been connected for too long.  This is a
    5 // (temporary?) work-around for the problem that the remote pclient job tends to grow too large
    6 // over time.
     3// The connection to the remote host is only allowed to live for MAX_CONNECT_TIME seconds.
     4// We disconnect and reconnect if a remote host has been connected for too long.  This is
     5// a (temporary?) work-around for the problem that the remote pclient job tends to grow
     6// too large over time.
    77
    8 # define MAX_CONNECT_TIME 36000.0
    9 
     8static float MAX_WANTHOST_WAIT = 10.0;
     9static float MAX_CONNECT_TIME = 36000.0;
    1010static FILE *logfile = NULL;
    1111
     
    7070
    7171    // if (logfile) fprintf (logfile, "start needhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
     72    AddMachineJob (host, job);
    7273
    7374    /* take the job off the stack and unlock the stack */
     
    9091
    9192    // if (logfile) fprintf (logfile, "start wanthost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
     93    AddMachineJob (host, job);
    9294
    9395    /* take the job off the stack and unlock the stack */
     
    103105    if (job[0].mode != PCONTROL_JOB_ANYHOST) continue;
    104106
     107    if (!CheckMachineJobs (host, job)) continue;
     108
    105109    /* we have found an appropriate job; link it to the host and send to StartJob */
    106110    job[0].host = (struct Host *) host;
     
    108112
    109113    // if (logfile) fprintf (logfile, "start  anyhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
     114    AddMachineJob (host, job);
    110115
    111116    /* take the job off the stack and unlock the stack */
     
    124129    gettimeofday (&now, (void *) NULL);
    125130    dtime = DTIME (now, job[0].start);
    126     if (dtime < 10.0) continue;
     131    if (dtime < MAX_WANTHOST_WAIT) continue;
     132
     133    if (!CheckMachineJobs (host, job)) continue;
    127134
    128135    if (logfile) fprintf (logfile, "start wanthost(2) %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]);
     136    AddMachineJob (host, job);
    129137
    130138    /* we have found an appropriate job; link it to the host and send to StartJob */
     
    145153}
    146154
     155void SetMaxWantHostWait (float value) {
     156
     157  MAX_WANTHOST_WAIT = value;
     158  return;
     159}
     160
     161float GetMaxWantHostWait (void) {
     162
     163  return MAX_WANTHOST_WAIT;
     164}
     165
     166void SetMaxConnectTime (float value) {
     167
     168  MAX_CONNECT_TIME = value;
     169  return;
     170}
     171
     172float GetMaxConnectTime (void) {
     173
     174  return MAX_CONNECT_TIME;
     175}
     176
    147177/** note : host and job popped off IDLE and PENDING stacks,
    148178    unless no job is available **/
  • trunk/Ohana/src/opihi/pcontrol/CheckSystem.c

    r25872 r26411  
    122122    if (!Njobchecks && !Nhostchecks && (RunLevel != PCONTROL_RUN_NONE)) {
    123123      CheckLiveHosts(0.040);
     124      // fprintf (stderr, "sleep a bit\n");
    124125      usleep (100000); // idle if no jobs are waiting
    125126    } else {
    126127      // if we only have busy jobs, pause a moment before trying again
    127128      if (!Ndonejobs) {
     129        // fprintf (stderr, "sleep a bit\n");
    128130        usleep (100000);
    129131      }
  • trunk/Ohana/src/opihi/pcontrol/HostOps.c

    r25872 r26411  
    214214  host[0].job      = NULL;
    215215
     216  AddMachineHost (host);
     217
    216218  PutHost (host, PCONTROL_HOST_ALLHOSTS, STACK_BOTTOM);
    217219  PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
     
    225227  copy = PullStackByID (HostPool_AllHosts, host[0].HostID);
    226228  ASSERT (copy == host, "programming error: ALLHOSTS entry does not match");
     229
     230  DelMachineHost (host);
    227231
    228232  FreeIOBuffer (&host[0].comms_buffer);
  • trunk/Ohana/src/opihi/pcontrol/KillJob.c

    r17477 r26411  
    1919      job[0].host = NULL;
    2020      host[0].job = NULL;
     21
     22      // decrement the machine job-host counters
     23      DelMachineJob (host, job);
     24
    2125      HarvestHost (host[0].pid);
    2226      PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM);
     
    7478      job[0].host = NULL;
    7579      host[0].job = NULL;
     80
     81      // decrement the machine job-host counters
     82      DelMachineJob (host, job);
     83
    7684      PutHost (host, PCONTROL_HOST_IDLE, STACK_BOTTOM);
    7785      PutJob (job, PCONTROL_JOB_CRASH, STACK_BOTTOM);
  • trunk/Ohana/src/opihi/pcontrol/Makefile

    r17475 r26411  
    3737$(SRC)/IDops.$(ARCH).o \
    3838$(SRC)/JobOps.$(ARCH).o \
     39$(SRC)/MachineOps.$(ARCH).o \
    3940$(SRC)/StackOps.$(ARCH).o \
    4041$(SRC)/PclientCommand.$(ARCH).o \
     
    5354$(SRC)/kill.$(ARCH).o \
    5455$(SRC)/pulse.$(ARCH).o \
     56$(SRC)/parameters.$(ARCH).o \
    5557$(SRC)/run.$(ARCH).o \
     58$(SRC)/machines.$(ARCH).o \
    5659$(SRC)/status.$(ARCH).o \
    5760$(SRC)/stdout.$(ARCH).o \
  • trunk/Ohana/src/opihi/pcontrol/host.c

    r25872 r26411  
    9696usage:
    9797  gprint (GP_LOG, "USAGE: host (command) (hostname)\n");
    98   gprint (GP_ERR, "  valid commands: add, on, retry, check, off, delete\n");
    99   gprint (GP_ERR, "  -threads Nthreads is optional for 'add'\n");
     98  gprint (GP_LOG, "  valid commands: add, on, retry, check, off, delete\n");
     99  gprint (GP_LOG, "  -threads Nthreads is optional for 'add'\n");
    100100  return (FALSE);
    101101}
  • trunk/Ohana/src/opihi/pcontrol/init.c

    r16460 r26411  
    88int jobstack    PROTO((int, char **));
    99int kill_pc     PROTO((int, char **));
     10int machines    PROTO((int, char **));
     11int parameters  PROTO((int, char **));
     12int run         PROTO((int, char **));
    1013int status      PROTO((int, char **));
    11 int run         PROTO((int, char **));
    1214int stderr_pc   PROTO((int, char **));
    1315int stdout_pc   PROTO((int, char **));
     
    1921
    2022static Command cmds[] = { 
    21   {1, "host",      host,      "add / delete / modify host"},
    22   {1, "hoststack", hoststack, "list hosts for a single stack"},
    23   {1, "status",    status,    "get system status"},
    24   {1, "stop",      run,       "stop controller processing"},
    25   {1, "run",       run,       "set controller runlevel"},
    26   {1, "verbose",   verbose,   "set the verbose mode for job"},
    27   {1, "version",   version,   "show version information"},
    28   {1, "job",       job,       "add job"},
    29   {1, "jobstack",  jobstack,  "list jobs for a single stack"},
    30   {1, "check",     check,     "get job or host status"},
    31   {1, "delete",    delete,    "delete job"},
    32   {1, "kill",      kill_pc,   "kill job"},
    33   {1, "stderr",    stderr_pc, "get stderr buffer for job"},
    34   {1, "stdout",    stdout_pc, "get stdout buffer for job"},
    35 # ifndef THREADED
    36   {1, "pulse",     pulse,     "set system pulse"},
     23  {1, "check",      check,      "get job or host status"},
     24  {1, "delete",     delete,     "delete job"},
     25  {1, "host",       host,       "add / delete / modify host"},
     26  {1, "hoststack",  hoststack,  "list hosts for a single stack"},
     27  {1, "job",        job,        "add job"},
     28  {1, "jobstack",   jobstack,   "list jobs for a single stack"},
     29  {1, "kill",       kill_pc,    "kill job"},
     30  {1, "machines",   machines,   "list machines"},
     31  {1, "parameters", parameters, "get / set system parameters"},
     32  {1, "run",        run,        "set controller runlevel"},
     33  {1, "status",     status,     "get system status"},
     34  {1, "stderr",     stderr_pc,  "get stderr buffer for job"},
     35  {1, "stdout",     stdout_pc,  "get stdout buffer for job"},
     36  {1, "stop",       run,        "stop controller processing"},
     37  {1, "verbose",    verbose,    "set the verbose mode for job"},
     38  {1, "version",    version,    "show version information"},
     39# ifndef THREADED               
     40  {1, "pulse",      pulse,      "set system pulse"},
    3741# endif
    3842};
     
    4751  InitJobStacks ();
    4852  InitHostStacks ();
     53  InitMachines ();
    4954}
    5055
     
    5257  FreeJobStacks ();
    5358  FreeHostStacks ();
     59  FreeMachines ();
    5460}
Note: See TracChangeset for help on using the changeset viewer.