Changeset 26715
- Timestamp: Jan 28, 2010, 4:43:38 PM (16 years ago)
- Location: branches/eam_branches/20091201/Ohana/src/opihi
- Files: 12 edited, 8 copied
-
include/pcontrol.h (modified) (3 diffs)
-
lib.shell/ListOps.c (modified) (2 diffs)
-
pantasks/Makefile (modified) (1 diff)
-
pantasks/controller.c (modified) (1 diff)
-
pantasks/controller_hoststack.c (copied) (copied from trunk/Ohana/src/opihi/pantasks/controller_hoststack.c )
-
pantasks/controller_machines.c (copied) (copied from trunk/Ohana/src/opihi/pantasks/controller_machines.c )
-
pantasks/controller_parameters.c (copied) (copied from trunk/Ohana/src/opihi/pantasks/controller_parameters.c )
-
pantasks/controller_version.c (copied) (copied from trunk/Ohana/src/opihi/pantasks/controller_version.c )
-
pcontrol/CheckDoneJob.c (modified) (2 diffs)
-
pcontrol/CheckIdleHost.c (modified) (7 diffs)
-
pcontrol/CheckSystem.c (modified) (1 diff)
-
pcontrol/HostOps.c (modified) (2 diffs)
-
pcontrol/KillJob.c (modified) (2 diffs)
-
pcontrol/MachineOps.c (copied) (copied from trunk/Ohana/src/opihi/pcontrol/MachineOps.c )
-
pcontrol/Makefile (modified) (2 diffs)
-
pcontrol/host.c (modified) (1 diff)
-
pcontrol/init.c (modified) (4 diffs)
-
pcontrol/machines.c (copied) (copied from trunk/Ohana/src/opihi/pcontrol/machines.c )
-
pcontrol/parameters.c (copied) (copied from trunk/Ohana/src/opihi/pcontrol/parameters.c )
-
pcontrol/test/machines.sh (copied) (copied from trunk/Ohana/src/opihi/pcontrol/test/machines.sh )
Legend:
- Unmodified
- Added
- Removed
-
branches/eam_branches/20091201/Ohana/src/opihi/include/pcontrol.h
r25872 r26715 98 98 int requested; 99 99 } JobOutput; 100 101 /* A machine has a unique name and may have multiple Hosts (each of which can run a single job) 102 We use this to track aspects of the analysis per machine, to eg, limit the number of jobs 103 desired on a single machine */ 104 typedef struct { 105 char *name; 106 int Nhosts; // how many hosts are selected for this machine (whatever state) 107 int NjobsRealhost; 108 int NjobsWanthost; 109 } Machine; 100 110 101 111 /* data to define a job */ … … 140 150 } Host; 141 151 152 /* the Jobs and Hosts are managed in a set of Stacks which define their state */ 142 153 typedef struct { 143 154 void **object; … … 280 291 void LinkJobAndHost (Job *job, Host *host); 281 292 293 /*** MachineOps.c ***/ 294 void InitMachines (); 295 void FreeMachines (); 296 Machine *FindMachineByName (char *name); 297 Machine *AddMachine (char *name); 298 int DelMachine (char *name); 299 int AddMachineHost (Host *host); 300 int DelMachineHost (Host *host); 301 int AddMachineJob (Host *host, Job *job); 302 int DelMachineJob (Host *host, Job *job); 303 int PrintMachines (); 304 int CheckMachineJobs (Host *host, Job *job); 305 int GetMaxUnwantedHostJobs (void); 306 void SetMaxUnwantedHostJobs (int value); 307 308 float GetMaxConnectTime (void); 309 void SetMaxConnectTime (float value); 310 float GetMaxWantHostWait (void); 311 void SetMaxWantHostWait (float value); 312 282 313 void pcontrol_exit (int n); 283 314 -
branches/eam_branches/20091201/Ohana/src/opihi/lib.shell/ListOps.c
r25965 r26715 3 3 4 4 /*** local static variables used to track the command lists ***/ 5 static List *lists ;/* variable to store the list of all lists */6 static int Nlists ;/* number of currently available lists */5 static List *lists = NULL; /* variable to store the list of all lists */ 6 static int Nlists = 0; /* number of currently available lists */ 7 7 8 8 void InitLists () { … … 16 16 int i, j; 17 17 18 for (i = 0; i < Nlists; i++) { 18 // Nlists is a bit weird: it is the currently highest valid list, not the number of lists 19 // Nlists = 0 is never allocated 20 for (i = 1; i < Nlists + 1; i++) { 19 21 for (j = 0; j < lists[i].Nlines; j++) { 20 22 free (lists[i].line[j]); -
branches/eam_branches/20091201/Ohana/src/opihi/pantasks/Makefile
r23530 r26715 73 73 $(SRC)/controller_status.$(ARCH).o \ 74 74 $(SRC)/controller_jobstack.$(ARCH).o \ 75 $(SRC)/controller_hoststack.$(ARCH).o \ 76 $(SRC)/controller_machines.$(ARCH).o \ 77 $(SRC)/controller_parameters.$(ARCH).o \ 78 $(SRC)/controller_version.$(ARCH).o \ 75 79 $(SRC)/controller_verbose.$(ARCH).o \ 76 80 $(SRC)/controller_run.$(ARCH).o \ -
branches/eam_branches/20091201/Ohana/src/opihi/pantasks/controller.c
r23530 r26715 1 1 # include "pantasks.h" 2 2 3 int controller_host PROTO((int, char **)); 4 int controller_exit PROTO((int, char **)); 5 int controller_status PROTO((int, char **)); 6 int controller_jobstack PROTO((int, char **)); 7 int controller_verbose PROTO((int, char **)); 8 int controller_run PROTO((int, char **)); 9 int controller_stop PROTO((int, char **)); 10 int controller_check PROTO((int, char **)); 11 int controller_output PROTO((int, char **)); 12 int controller_pulse PROTO((int, char **)); 3 int controller_host PROTO((int, char **)); 4 int controller_exit PROTO((int, char **)); 5 int controller_status PROTO((int, char **)); 6 int controller_jobstack PROTO((int, char **)); 7 int controller_hoststack PROTO((int, char **)); 8 int controller_machines PROTO((int, char **)); 9 int controller_parameters PROTO((int, char **)); 10 int controller_verbose PROTO((int, char **)); 11 int controller_version PROTO((int, char **)); 12 int controller_run PROTO((int, char **)); 13 int controller_stop PROTO((int, char **)); 14 int controller_check PROTO((int, char **)); 15 int controller_output PROTO((int, char **)); 16 int controller_pulse PROTO((int, char **)); 13 17 14 18 static Command controller_cmds[] = { 15 {1, "exit", controller_exit, "shutdown controller"}, 16 {1, "host", controller_host, "define host for controller"}, 17 {1, "check", controller_check, "check controller host/job"}, 18 {1, "run", controller_run, "start controller operation / set run levels"}, 19 {1, "stop", controller_run, "stop controller (no disconnect)"}, 20 {1, "status", controller_status, "check controller status"}, 21 {1, "jobstack", controller_jobstack, "check controller status"}, 22 {1, "verbose", controller_verbose, "set controller verbosity"}, 23 {1, "output", controller_output, "print controller output"}, 24 {1, "pulse", controller_pulse, "set controller pulse"}, 19 {1, "check", controller_check, "check controller host/job"}, 20 // {1, "delete", controller_delete, "delete job on 
controller"}, 21 {1, "exit", controller_exit, "shutdown controller"}, 22 {1, "host", controller_host, "define host for controller"}, 23 {1, "hoststack", controller_hoststack, "define host for controller"}, 24 // {1, "job", controller_job, "add jobs to controller"}, 25 {1, "jobstack", controller_jobstack, "check controller status"}, 26 {1, "machines", controller_machines, "print controller machine status"}, 27 {1, "parameters",controller_parameters, "modify controller parameters"}, 28 {1, "output", controller_output, "print controller output"}, 29 {1, "run", controller_run, "start controller operation / set run levels"}, 30 {1, "status", controller_status, "check controller status"}, 31 {1, "stop", controller_run, "stop controller (no disconnect)"}, 32 {1, "verbose", controller_verbose, "set controller verbosity"}, 33 {1, "version", controller_version, "show controller version"}, 34 {1, "pulse", controller_pulse, "set controller pulse"}, 25 35 }; 26 36 -
branches/eam_branches/20091201/Ohana/src/opihi/pcontrol/CheckDoneJob.c
r20066 r26715 28 28 29 29 if ((status1 == PCLIENT_DOWN) || (status2 == PCLIENT_DOWN)) { 30 31 // decrement the machine job-host counters 32 DelMachineJob (host, job); 33 30 34 // unlink host & job 31 35 if (DEBUG || VerboseMode()) gprint (GP_ERR, "host %s is down\n", host[0].hostname); … … 53 57 } 54 58 59 // decrement the machine job-host counters 60 DelMachineJob (host, job); 61 55 62 /* job's state is either EXIT or CRASH (verify?) */ 56 63 // unlink host & job -
branches/eam_branches/20091201/Ohana/src/opihi/pcontrol/CheckIdleHost.c
r25872 r26715 1 1 # include "pcontrol.h" 2 2 3 // The connection to the remote host is only allow to live for MAX_CONNECT_TIME seconds. We4 // disconnect and reconnect if a remote host has been connected for too long. This is a5 // (temporary?) work-around for the problem that the remote pclient job tends to grow too large6 // over time.3 // The connection to the remote host is only allowed to live for MAX_CONNECT_TIME seconds. 4 // We disconnect and reconnect if a remote host has been connected for too long. This is 5 // a (temporary?) work-around for the problem that the remote pclient job tends to grow 6 // too large over time. 7 7 8 # define MAX_CONNECT_TIME 36000.0 9 8 static float MAX_WANTHOST_WAIT = 10.0; 9 static float MAX_CONNECT_TIME = 36000.0; 10 10 static FILE *logfile = NULL; 11 11 … … 70 70 71 71 // if (logfile) fprintf (logfile, "start needhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]); 72 AddMachineJob (host, job); 72 73 73 74 /* take the job off the stack and unlock the stack */ … … 90 91 91 92 // if (logfile) fprintf (logfile, "start wanthost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]); 93 AddMachineJob (host, job); 92 94 93 95 /* take the job off the stack and unlock the stack */ … … 103 105 if (job[0].mode != PCONTROL_JOB_ANYHOST) continue; 104 106 107 if (!CheckMachineJobs (host, job)) continue; 108 105 109 /* we have found an appropriate job; link it to the host and send to StartJob */ 106 110 job[0].host = (struct Host *) host; … … 108 112 109 113 // if (logfile) fprintf (logfile, "start anyhost %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]); 114 AddMachineJob (host, job); 110 115 111 116 /* take the job off the stack and unlock the stack */ … … 124 129 gettimeofday (&now, (void *) NULL); 125 130 dtime = DTIME (now, job[0].start); 126 if (dtime < 10.0) continue; 131 if (dtime < MAX_WANTHOST_WAIT) continue; 132 133 if (!CheckMachineJobs (host, 
job)) continue; 127 134 128 135 if (logfile) fprintf (logfile, "start wanthost(2) %s (job host %s) : %s\n", host[0].hostname, job[0].hostname, job[0].argv[0]); 136 AddMachineJob (host, job); 129 137 130 138 /* we have found an appropriate job; link it to the host and send to StartJob */ … … 145 153 } 146 154 155 void SetMaxWantHostWait (float value) { 156 157 MAX_WANTHOST_WAIT = value; 158 return; 159 } 160 161 float GetMaxWantHostWait (void) { 162 163 return MAX_WANTHOST_WAIT; 164 } 165 166 void SetMaxConnectTime (float value) { 167 168 MAX_CONNECT_TIME = value; 169 return; 170 } 171 172 float GetMaxConnectTime (void) { 173 174 return MAX_CONNECT_TIME; 175 } 176 147 177 /** note : host and job popped off IDLE and PENDING stacks, 148 178 unless no job is available **/ -
branches/eam_branches/20091201/Ohana/src/opihi/pcontrol/CheckSystem.c
r25872 r26715 122 122 if (!Njobchecks && !Nhostchecks && (RunLevel != PCONTROL_RUN_NONE)) { 123 123 CheckLiveHosts(0.040); 124 // fprintf (stderr, "sleep a bit\n"); 124 125 usleep (100000); // idle if no jobs are waiting 125 126 } else { 126 127 // if we only have busy jobs, pause a moment before trying again 127 128 if (!Ndonejobs) { 129 // fprintf (stderr, "sleep a bit\n"); 128 130 usleep (100000); 129 131 } -
branches/eam_branches/20091201/Ohana/src/opihi/pcontrol/HostOps.c
r25872 r26715 214 214 host[0].job = NULL; 215 215 216 AddMachineHost (host); 217 216 218 PutHost (host, PCONTROL_HOST_ALLHOSTS, STACK_BOTTOM); 217 219 PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM); … … 225 227 copy = PullStackByID (HostPool_AllHosts, host[0].HostID); 226 228 ASSERT (copy == host, "programming error: ALLHOSTS entry does not match"); 229 230 DelMachineHost (host); 227 231 228 232 FreeIOBuffer (&host[0].comms_buffer); -
branches/eam_branches/20091201/Ohana/src/opihi/pcontrol/KillJob.c
r17477 r26715 19 19 job[0].host = NULL; 20 20 host[0].job = NULL; 21 22 // decrement the machine job-host counters 23 DelMachineJob (host, job); 24 21 25 HarvestHost (host[0].pid); 22 26 PutHost (host, PCONTROL_HOST_DOWN, STACK_BOTTOM); … … 74 78 job[0].host = NULL; 75 79 host[0].job = NULL; 80 81 // decrement the machine job-host counters 82 DelMachineJob (host, job); 83 76 84 PutHost (host, PCONTROL_HOST_IDLE, STACK_BOTTOM); 77 85 PutJob (job, PCONTROL_JOB_CRASH, STACK_BOTTOM); -
branches/eam_branches/20091201/Ohana/src/opihi/pcontrol/Makefile
r17475 r26715 37 37 $(SRC)/IDops.$(ARCH).o \ 38 38 $(SRC)/JobOps.$(ARCH).o \ 39 $(SRC)/MachineOps.$(ARCH).o \ 39 40 $(SRC)/StackOps.$(ARCH).o \ 40 41 $(SRC)/PclientCommand.$(ARCH).o \ … … 53 54 $(SRC)/kill.$(ARCH).o \ 54 55 $(SRC)/pulse.$(ARCH).o \ 56 $(SRC)/parameters.$(ARCH).o \ 55 57 $(SRC)/run.$(ARCH).o \ 58 $(SRC)/machines.$(ARCH).o \ 56 59 $(SRC)/status.$(ARCH).o \ 57 60 $(SRC)/stdout.$(ARCH).o \ -
branches/eam_branches/20091201/Ohana/src/opihi/pcontrol/host.c
r25872 r26715 96 96 usage: 97 97 gprint (GP_LOG, "USAGE: host (command) (hostname)\n"); 98 gprint (GP_ ERR, " valid commands: add, on, retry, check, off, delete\n");99 gprint (GP_ ERR, " -threads Nthreads is optional for 'add'\n");98 gprint (GP_LOG, " valid commands: add, on, retry, check, off, delete\n"); 99 gprint (GP_LOG, " -threads Nthreads is optional for 'add'\n"); 100 100 return (FALSE); 101 101 } -
branches/eam_branches/20091201/Ohana/src/opihi/pcontrol/init.c
r16460 r26715 8 8 int jobstack PROTO((int, char **)); 9 9 int kill_pc PROTO((int, char **)); 10 int machines PROTO((int, char **)); 11 int parameters PROTO((int, char **)); 12 int run PROTO((int, char **)); 10 13 int status PROTO((int, char **)); 11 int run PROTO((int, char **));12 14 int stderr_pc PROTO((int, char **)); 13 15 int stdout_pc PROTO((int, char **)); … … 19 21 20 22 static Command cmds[] = { 21 {1, "host", host, "add / delete / modify host"}, 22 {1, "hoststack", hoststack, "list hosts for a single stack"}, 23 {1, "status", status, "get system status"}, 24 {1, "stop", run, "stop controller processing"}, 25 {1, "run", run, "set controller runlevel"}, 26 {1, "verbose", verbose, "set the verbose mode for job"}, 27 {1, "version", version, "show version information"}, 28 {1, "job", job, "add job"}, 29 {1, "jobstack", jobstack, "list jobs for a single stack"}, 30 {1, "check", check, "get job or host status"}, 31 {1, "delete", delete, "delete job"}, 32 {1, "kill", kill_pc, "kill job"}, 33 {1, "stderr", stderr_pc, "get stderr buffer for job"}, 34 {1, "stdout", stdout_pc, "get stdout buffer for job"}, 35 # ifndef THREADED 36 {1, "pulse", pulse, "set system pulse"}, 23 {1, "check", check, "get job or host status"}, 24 {1, "delete", delete, "delete job"}, 25 {1, "host", host, "add / delete / modify host"}, 26 {1, "hoststack", hoststack, "list hosts for a single stack"}, 27 {1, "job", job, "add job"}, 28 {1, "jobstack", jobstack, "list jobs for a single stack"}, 29 {1, "kill", kill_pc, "kill job"}, 30 {1, "machines", machines, "list machines"}, 31 {1, "parameters", parameters, "get / set system parameters"}, 32 {1, "run", run, "set controller runlevel"}, 33 {1, "status", status, "get system status"}, 34 {1, "stderr", stderr_pc, "get stderr buffer for job"}, 35 {1, "stdout", stdout_pc, "get stdout buffer for job"}, 36 {1, "stop", run, "stop controller processing"}, 37 {1, "verbose", verbose, "set the verbose mode for job"}, 38 {1, "version", version, "show version 
information"}, 39 # ifndef THREADED 40 {1, "pulse", pulse, "set system pulse"}, 37 41 # endif 38 42 }; … … 47 51 InitJobStacks (); 48 52 InitHostStacks (); 53 InitMachines (); 49 54 } 50 55 … … 52 57 FreeJobStacks (); 53 58 FreeHostStacks (); 59 FreeMachines (); 54 60 }
Note: See TracChangeset for help on using the changeset viewer.
