IPP Software Navigation Tools IPP Links Communication Pan-STARRS Links

Changeset 30567


Ignore:
Timestamp:
Feb 11, 2011, 12:24:59 PM (15 years ago)
Author:
bills
Message:

Improve dependency checking. Stop trying to update components that fault multiple times.
Set the fault to the DO_NOT_REVERT value.

Location:
trunk
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/PS-IPP-PStamp/lib/PS/IPP/PStamp/RequestFile.pm

    r30564 r30567  
    7575# NOTE: these must match the values in pstamp/src/pstamp.h
    7676our $PSTAMP_SUCCESS          = 0;
     77our $PSTAMP_FIRST_ERROR_CODE = 10;
    7778our $PSTAMP_SYSTEM_ERROR     = 10;
    7879our $PSTAMP_NOT_IMPLEMENTED  = 11;
  • trunk/ippTasks/pstamp.pro

    r30485 r30567  
    1818$pstampStopFaulted_DB = 0
    1919
     20# give up on dependents with fault_count >= $PSTAMP_MAX_FAULT_COUNT
     21$PSTAMP_MAX_FAULT_COUNT = 5
     22macro set.max.fault.count
     23    $PSTAMP_MAX_FAULT_COUNT = $1
     24end
     25macro get.max.fault.count
     26    echo maximum fault count: $PSTAMP_MAX_FAULT_COUNT
     27end
     28$POLL_DEP = 500
     29
     30macro set.dependent.poll
     31    $POLL_DEP = $1
     32end
     33macro get.dependent.poll
     34    echo dependent poll limit: $POLL_DEP
     35end
     36
    2037# set PS_DBSERVER if postage stamp database host is not the same as the value for DBSERVER in site.config
    2138# warning: no quotes around the two words. That causes the variable to get passed to pstamptool as one word
     
    137154    end
    138155end
     156
    139157macro pstamp.find.on
    140158    task pstamp.request.find
     
    143161end
    144162macro pstamp.find.off
    145     task pstamp.request.find.load
    146         active false
    147     end
    148 end
     163    task pstamp.request.find
     164        active false
     165    end
     166end
     167
     168macro pstamp.dependent.on
     169    task pstamp.dependent.load
     170        active true
     171    end
     172    task pstamp.dependent.run
     173        active true
     174    end
     175end
     176macro pstamp.dependent.off
     177    task pstamp.dependent.load
     178        active false
     179    end
     180    task pstamp.dependent.run
     181        active false
     182    end
     183end
     184
    149185
    150186macro pstamp.status.on
     
    158194    end
    159195end
     196
    160197macro pstamp.status.set.exec
    161198    task pstamp.save.status
     
    525562            echo pstamp.job.run task.exit $JOB_ID status: $JOB_STATUS
    526563        end
    527         showcommand failure
     564#        showcommand failure
    528565        process_exit pstampJob $options:0 $JOB_STATUS
    529566    end
     
    609646        add_poll_args run
    610647        add_poll_labels run
    611         command $run
     648        command $run -limit $POLL_DEP
    612649    end
    613650
     
    664701        book getword pstampDependent $pageName outdir     -var OUTDIR
    665702        book getword pstampDependent $pageName need_magic -var NEED_MAGIC
     703        book getword pstampDependent $pageName fault_count -var FAULT_COUNT
    666704        book getword pstampDependent $pageName dbname     -var DBNAME
    667705
     
    682720        stderr $MYLOGFILE
    683721
    684         $run = pstamp_checkdependent.pl --dep_id $DEP_ID --stage_id $STAGE_ID --stage $STAGE --component $COMPONENT --imagedb $IMAGEDB --rlabel $RLABEL $NEED_MAGIC
     722        $run = pstamp_checkdependent.pl --dep_id $DEP_ID --stage_id $STAGE_ID --stage $STAGE --component $COMPONENT --imagedb $IMAGEDB --rlabel $RLABEL $NEED_MAGIC --fault_count $FAULT_COUNT --max_fault_count $PSTAMP_MAX_FAULT_COUNT
    685723
    686724        add_standard_args run
     
    705743            echo pstamp.job.run task.exit $DEP_ID status: $JOB_STATUS
    706744        end
    707         showcommand failure
     745#        showcommand failure
    708746        process_exit pstampDependent $options:0 $JOB_STATUS
    709747    end
  • trunk/pstamp/scripts/pstamp_checkdependent.pl

    r30336 r30567  
    44#
    55# Check the status of a pending pstampDependent insuring that
    6 # the dependent processing has been queued and whether it is
    7 # finished or not
     6# any update processing that is needed has been queued, and determine
     7# whether the processing has finished or not
    88
    99use warnings;
     
    2727my $IPP_DIFF_MODE_STACK_STACK = 4;
    2828
    29 my ($dep_id, $stage, $stage_id, $component, $imagedb, $rlabel, $need_magic);
    30 my ($dbname, $dbserver, $verbose, $save_temps, $no_update);
     29my ($dep_id, $stage, $stage_id, $component, $imagedb, $rlabel, $need_magic, $fault_count, $max_fault_count);
     30my ($dbname, $ps_dbserver, $verbose, $save_temps, $no_update);
    3131
    3232GetOptions(
     
    3535    'stage_id=i'    =>  \$stage_id,
    3636    'component=s'   =>  \$component,
    37     'imagedb=s'     =>  \$imagedb,      # dbname for images.
     37    'imagedb=s'     =>  \$imagedb,      # dbname for images lookups.
    3838    'rlabel=s'      =>  \$rlabel,
    3939    'need_magic'    =>  \$need_magic,
     40    'fault_count=i' =>  \$fault_count,
     41    'max_fault_count=i' =>  \$max_fault_count,
    4042    'dbname=s'      =>  \$dbname,       # postage stamp server dbname
    41     'dbserver=s'    =>  \$dbserver,     # postage stamp server dbserver
     43    'dbserver=s'    =>  \$ps_dbserver,  # postage stamp server dbserver
    4244    'verbose'       =>  \$verbose,
    4345    'save-temps'    =>  \$save_temps,
     
    4850    if !(defined $dep_id and defined $stage and defined $stage_id and
    4951        defined $component and defined $imagedb);
     52
     53$max_fault_count = 5 if !$max_fault_count;
     54$fault_count = 0 if !defined $fault_count;
    5055
    5156my $missing_tools;
     
    6166}
    6267
     68my $ipprc = PS::IPP::Config->new();
     69
     70if (!$ps_dbserver) {
     71    $ps_dbserver =  metadataLookupStr($ipprc->{_siteConfig}, 'PS_DBSERVER');
     72}
     73$pstamptool  .= " -dbname $dbname" if $dbname;
     74$pstamptool  .= " -dbserver $ps_dbserver";
     75
    6376# Append imagedb to the ippTools
     77# Note: configured DBSERVER is used for this server
    6478$chiptool    .= " -dbname $imagedb";
    6579$warptool    .= " -dbname $imagedb";
     
    6882$magicdstool .= " -dbname $imagedb";
    6983
    70 my $ipprc = PS::IPP::Config->new();
    71 
    72 if (!$dbserver) {
    73     $dbserver =  metadataLookupStr($ipprc->{_siteConfig}, 'PS_DBSERVER');
    74 }
    75 
    7684
    7785my $tool;
    7886my $cmd;
    7987my $dsRun_state = "";
    80 my $whole_run = ($component eq 'all');
    81 
    82 # XXX: whole_run was a concept that isn't practical. Having one dependent for
    83 # a whole run makes finding errors too hard. We always have a dependent for each
    84 # component.
    85 # XXX: remove the unneeded code
    86 my_die("component = 'all' not supported", $PS_EXIT_PROG_ERROR) if $whole_run;;
    87 
    88 if ($whole_run) {
    89     if ($stage eq "chip") {
    90         $cmd = "$chiptool -listrun -chip_id $stage_id";
    91     } elsif ($stage eq "warp") {
    92         $cmd = "$warptool -listrun -warp_id $stage_id";
    93     } elsif ($stage eq "diff") {
    94         $cmd = "$difftool -listrun -diff_id $stage_id";
    95     } else {
    96         my_die("unexpected stage $stage found", $PS_EXIT_PROG_ERROR);
    97     }
     88
     89if ($stage eq "chip") {
     90    $cmd = "$chiptool -processedimfile -allfiles -chip_id $stage_id -class_id $component";
     91} elsif ($stage eq "warp") {
     92    $cmd = "$warptool -warped -warp_id $stage_id -skycell_id $component";
     93} elsif ($stage eq "diff") {
     94    $cmd = "$difftool -diffskyfile -diff_id $stage_id -skycell_id $component";
    9895} else {
    99     if ($stage eq "chip") {
    100         $cmd = "$chiptool -processedimfile -allfiles -chip_id $stage_id -class_id $component";
    101     } elsif ($stage eq "warp") {
    102         $cmd = "$warptool -warped -warp_id $stage_id -skycell_id $component";
    103     } elsif ($stage eq "diff") {
    104         $cmd = "$difftool -diffskyfile -diff_id $stage_id -skycell_id $component";
    105     } else {
    106         my_die("unexpected stage $stage found", $PS_EXIT_PROG_ERROR);
    107     }
     96    my_die("unexpected stage $stage found", $PS_EXIT_PROG_ERROR);
    10897}
    10998
     
    124113}
    125114my $status = 0;
    126 if (($it->{state} eq 'full') or ($it->{state} eq 'update') and ($whole_run or ($it->{data_state} eq 'full'))
     115if ((($it->{state} eq 'full') or ($it->{state} eq 'update')) and ($it->{data_state} eq 'full')
    127116        and (!$need_magic or $magic_ok or $it->{magicked} > 0)) {
    128117
    129     # This Dependency is satisfied. All done!
     118    # This Dependency is satisfied. All done. Release the pstampJobs
     119    #
    130120    my $command = "$pstamptool -updatedependent -set_state full -dep_id $dep_id";
    131     $command .= " -dbname $dbname" if $dbname;
    132     $command .= " -dbserver $dbserver" if $dbserver;
    133121    if (!$no_update) {
    134122        my ( $success, $error_code, $full_buf, $stdout_buf, $stderr_buf ) =
     
    143131} elsif (($it->{state} eq 'cleaned') or ($it->{state} eq 'update')) {
    144132    #       For warp and diff stages we need to call the 'queue_update' subroutines even if the
    145     #       data_state is update in order to check the earlier stages in the pipeline
    146     #       For example if warpSkyfile is in update state but the chip run that it depends on hasn't
    147     #       been updated we need to go and queue it.
    148 
    149     if (($it->{state} ne 'cleaned') and $it->{fault}) {
    150         my_die("Component faulted on update dep_id: $dep_id",
    151                 $PS_EXIT_SYS_ERROR);
     133    #       data_state is update in order to check the state of inputs in earlier stages in the pipeline
     134    #       For example if warpSkyfile is in update state but the chipRun that it depends on hasn't
     135    #       been updated we need to go and queue the chips for processing.
     136
     137    my $fault = $it->{fault};
     138    if (($fault eq $PSTAMP_GONE) or (($it->{state} eq 'update') and $fault)) {
     139        $fault_count++;
     140        print "$stage $stage_id $component has fault $fault\n";
     141        if ($it->{fault} eq $PSTAMP_GONE) {
     142            faultJobs($PSTAMP_GONE);
     143            exit 0;
     144        } elsif ($fault_count >= $max_fault_count) {
     145            print "$stage $stage_id $component has faulted $fault_count times. Giving up\n";
     146
     147            faultComponent($stage, $stage_id, $component, $PSTAMP_GONE);
     148
     149            # fault the jobs
     150            faultJobs($PSTAMP_GONE);
     151            exit 0;
     152        }
     153
     154        # assume the fault is transient.
     155        my_die("Component faulted on update dep_id: $dep_id", $PS_EXIT_SYS_ERROR);
    152156    }
    153157
    154158    if ($stage eq 'chip') {
    155         # check_states_chip takes an array so that check_states_warp can pass it set of chips
     159        # check_states_chip takes an array so that check_states_warp can pass it a set of chips
    156160        my $chips = [$it];
    157         $status = check_states_chip($it->{chip_id}, $whole_run, $chips, $rlabel, $need_magic);
     161        $status = check_states_chip($it->{chip_id}, $chips, $rlabel, $need_magic);
    158162    } elsif ($stage eq 'warp') {
    159         $status = check_states_warp($it, $whole_run, $rlabel, $need_magic);
     163        $status = check_states_warp($it, $rlabel, $need_magic);
    160164    } elsif ($stage eq 'diff') {
    161         $status = check_states_diff($it, $whole_run, $rlabel, $need_magic);
     165        $status = check_states_diff($it, $rlabel, $need_magic);
    162166    } else {
    163167        my_die("Unexpected stage found $stage", $PS_EXIT_PROG_ERROR);
     168    }
     169    if ($status >= $PSTAMP_FIRST_ERROR_CODE) {
     170        faultJobs($status);
    164171    }
    165172} else {
     
    173180    my $job_fault = 0;
    174181
    175 if (0) {
    176     if ($stage eq 'chip') {
    177         # XXX: There is no need to check this anymore. All magicked chipRuns have abs(burntool_state) >= 13
    178         # If something changes that causes an error, we will figure that out from the chip failure
    179         my $burntool_state = $it->{burntool_state};
    180         # XXX: get the value of 13 from the ppImage recipe
    181         if ($burntool_state and (abs($burntool_state) < 13)) {
    182             print STDERR "chip $it->{chip_id} $it->{class_id} has burntool_state $burntool_state. Not avaiable.\n";
    183             $job_fault = $PSTAMP_NOT_AVAILABLE;
    184         }
    185     }
    186 }
    187182    if ($state eq 'error_cleaned') {
    188183        $job_fault = $PSTAMP_NOT_AVAILABLE;
    189     } elsif (($state =~ /scrub/) or ($state =~ /purge/)) {
    190         # jobs must have changed state since depenency was made
     184    } elsif (($state =~ /scrub/) or ($state =~ /purge/) or ($state eq 'drop')) {
     185        # Component state must have changed since the dependency was inserted.
    191186        print STDERR "Dependency cannot be satisfied\n";
    192187        $job_fault = $PSTAMP_GONE;
     
    195190        my_die ("Unexpected state for ${stage}Run $stage_id $state", $PS_EXIT_PROG_ERROR);
    196191    }
     192
    197193    if (!$job_fault and ($stage eq 'chip')) {
     194        # should only get here with data_state 'full' and perhaps destreaking not done
     195        my_die ("Unexpected state for ${stage}Run $stage_id $state", $PS_EXIT_PROG_ERROR)
     196            if $it->{data_state} ne 'full';
     197
    198198        # chip processing is done, start destreaking.
    199199        my @chips;
     
    201201        $job_fault = check_states_magicDSRun($stage, $stage_id, \@chips, $rlabel, $need_magic, $it->{raw_magicked}, $it->{magic_ds_id}, $it->{dsRun_state});
    202202    }
    203     if ($job_fault) {
    204         faultJobs($state, $stage, $stage_id, $job_fault);
     203
     204    if ($job_fault >= $PSTAMP_FIRST_ERROR_CODE) {
     205        faultJobs($job_fault);
    205206    }
    206207}
     
    211212sub check_states_chip {
    212213    my $chip_id = shift;
    213     my $whole_run = shift;  # if true queue entire run for update
    214     my $metadatas = shift;  # an array of hashes, either from -processedimfile or -listrun
     214    my $metadatas = shift;  # an array of hashes, either from chiptool -processedimfile or warptool -scmap
    215215    my $rlabel = shift;     # if defined a new label for the chipRun
    216216    my $need_magic = shift;
    217 
    218217
    219218    my $dsRun_state;
     
    221220    my @chips;
    222221    my $magic_ds_id;
    223     if (!$whole_run) {
    224         foreach my $chip (@$metadatas) {
    225             $dsRun_state = $chip->{dsRun_state};
    226             $raw_all_magicked &= ($chip->{raw_magicked} > 0);
    227             $magic_ds_id = $chip->{magic_ds_id};
    228 
    229             push @chips, $chip->{class_id};
    230 
    231             my $state = $chip->{state};
    232             my $data_state = $chip->{data_state};
    233             if (($state =~ /error/) or ($state =~ /purged/) or ($state =~ /scrubbed/) or ($state eq 'drop') or
    234                 ($data_state =~ /error/) or ($data_state =~ /purged/) or ($data_state =~ /scrubbed/)) {
    235 
    236                 print "chipRun state is $chip->{chip_id} has state: $state data_state: $data_state cannot update\n";
    237                 faultJobs('stop', undef, undef, $PSTAMP_GONE);
    238 
    239                 return 0;
    240             } elsif (($chip->{data_state} ne 'update') and ($chip->{data_state} ne 'full')) {
    241 
    242                 my $command = "$chiptool -setimfiletoupdate -chip_id $chip_id -class_id $chip->{class_id}";
    243                 $command .= " -set_label $rlabel" if $rlabel;
    244 
    245                 if (!$no_update) {
    246                     my ( $success, $error_code, $full_buf, $stdout_buf, $stderr_buf ) =
    247                                 run(command => $command, verbose => $verbose);
    248                     unless ($success) {
    249                         my_die("failed to queue ${stage}Run $stage_id $component for update", $PS_EXIT_UNKNOWN_ERROR);
    250                     }
    251                 } else {
    252                     print "skipping $command\n";
    253                 }
    254             } elsif ($chip->{fault}) {
    255                 # fault the dependent
    256                 my_die("chip $chip->{chip_id} $chip->{class_id} faulted: $chip->{fault}", $chip->{fault});
    257             } elsif ($chip->{dsFile_fault}) {
    258                 # fault the dependent
    259                 my_die("magicDSFile $chip->{magic_ds_id} $chip->{chip_id} $chip->{class_id} faulted: $chip->{dsFile_fault}", $chip->{dsFile_fault});
    260             }
    261         }
    262     } else {
    263         my $run = $metadatas->[0];
    264         $dsRun_state = $run->{dsRun_state};
    265         $raw_all_magicked = ($run->{raw_magicked} > 0);
    266         my $state = $run->{state};
    267         if (($state =~ /error/) or ($state =~ /purged/) or ($state =~ /scrubbed/)) {
    268             print "chipRun state is $run->{chip_id} is in state $state cannot update\n";
    269             faultJobs('stop', undef, undef, $PSTAMP_GONE);
    270             return 0;
    271         }
    272 
    273         # providing no -class_id arguments changes all imfiles with data_state = 'cleaned' to 'update'
    274         my $command = "$chiptool -setimfiletoupdate -chip_id $chip_id";
    275         $command .= " -set_label $rlabel" if $rlabel;
    276 
    277         if (!$no_update) {
    278             my ( $success, $error_code, $full_buf, $stdout_buf, $stderr_buf ) =
    279                         run(command => $command, verbose => $verbose);
    280             unless ($success) {
    281                 my_die("failed to queue ${stage}Run $stage_id $component for update", $PS_EXIT_UNKNOWN_ERROR);
    282             }
    283         } else {
    284             print "skipping $command\n";
    285         }
    286     }
    287 
    288     my $status = check_states_magicDSRun('chip', $chip_id, \@chips, $rlabel, $need_magic, $raw_all_magicked, $magic_ds_id, $dsRun_state);
    289 
    290     return $status;
    291 }
    292 
    293 sub check_states_warp {
    294     # check status of input chips. If state is not updatable set error code for job
    295 
    296     # if chipProcessedImfile.state is cleaned call check_states_chip
    297 
    298     my $metadata = shift;
    299     my $whole_run = shift;  # if true queue entire run for update
    300     my $rlabel = shift;     # if defined a new label for the chipRun
    301     my $need_magic = shift;
    302 
    303     my $raw_all_magicked = 1; # this gets cleared if any of the inputs aren't destreaked
    304 
    305     my $warp_id = $metadata->{warp_id};
    306     my $skycell_id = $metadata->{skycell_id};
    307     my $state = $metadata->{state};
    308     my $data_state = $metadata->{data_state};
    309     if (($state =~ /error/) or ($state =~ /purged/) or ($state =~ /scrubbed/) or ($state eq 'drop') or
    310          ($data_state =~ /error/) or ($data_state =~ /purged/) or ($data_state =~ /scrubbed/)) {
    311         print STDERR "warpRun $warp_id $skycell_id has state $state $data_state faulting jobs\n";
    312         faultJobs('stop', undef, undef, $PSTAMP_GONE);
    313         exit 0;
    314     }
    315     if (($state eq 'update') and ($metadata->{fault})) {
    316         # fault dependent.
    317         my $fault = $metadata->{fault};
    318         my_die("warp $warp_id $skycell_id faulted: $fault", $fault);
    319     }
    320 
    321     if (!$whole_run) {
    322         my $skycell = $metadata;
    323         my $skycell_id = $skycell->{skycell_id};
    324 
    325         my $command = "$warptool -scmap -warp_id $warp_id -skycell_id $skycell_id";
    326         my $data = runToolAndParse($command, $verbose);
    327         if (!$data or scalar @$data == 0) {
    328             # This happens if the chipProcessedImfile disappears which happened when earlier
    329             # versions of chiptool -revertprocessedimfile didn't check the chipRun.state before
    330             # deleing the row.
    331             # Fault the jobs so that the Request can finish ...
    332             faultJobs('stop', undef, undef, $PSTAMP_GONE);
    333             # ... and fault the dependent so that we have a record of the error
    334             my_die("failed to find warpSkyCelllMap for warpRun $warp_id skycell_id $skycell_id",
    335                 $PSTAMP_GONE);
    336         }
    337 
    338 
    339         my $chips_ready = 1;
    340         my @chipsToUpdate;
    341         my $chip_id;
    342         foreach my $chip (@$data) {
    343             $chip_id = $chip->{chip_id};
    344             if (($chip->{data_state} ne 'full') or ($need_magic and ($chip->{magicked} <= 0))) {
    345                 $chips_ready = 0;
    346                 $chip->{fault} = $chip->{chip_fault};
    347                 push @chipsToUpdate, $chip;
     222
     223    my $queued_update = 0;
     224    foreach my $chip (@$metadatas) {
     225        $dsRun_state = $chip->{dsRun_state};
     226        $raw_all_magicked &= ($chip->{raw_magicked} > 0);
     227        $magic_ds_id = $chip->{magic_ds_id};
     228
     229        push @chips, $chip->{class_id};
     230
     231        my $state = $chip->{state};
     232        my $data_state = $chip->{data_state};
     233        if (($state =~ /error/) or ($state =~ /purged/) or ($state =~ /scrubbed/) or ($state eq 'drop') or
     234            ($data_state =~ /error/) or ($data_state =~ /purged/) or ($data_state =~ /scrubbed/) or ($data_state eq 'drop')) {
     235
     236            print "chipRun state is $chip->{chip_id} has state: $state data_state: $data_state cannot update\n";
     237            my $error_code;
     238            if (($state eq 'error_cleaned') or ($data_state ='error_cleaned')) {
     239                $error_code = $PSTAMP_NOT_AVAILABLE;
    348240            } else {
    349                 # this chip is good to go
    350             }
    351         }
    352 
    353         if ($chips_ready and $skycell->{data_state} ne 'update') {
    354             # the reason we defer setting the warp to update is so that we can handle error conditions at previous
    355             # stages more easily.
    356             my $command = "$warptool -setskyfiletoupdate -warp_id $warp_id -skycell_id $skycell->{skycell_id}";
     241                $error_code = $PSTAMP_GONE;
     242            }
     243           
     244            # caller will fault the jobs
     245            return $error_code;
     246        } elsif (($chip->{data_state} ne 'update') and ($chip->{data_state} ne 'full')) {
     247
     248            # chiptool does more state checking to insure this isn't done prematurely.
     249            my $command = "$chiptool -setimfiletoupdate -chip_id $chip_id -class_id $chip->{class_id}";
    357250            $command .= " -set_label $rlabel" if $rlabel;
    358251
     
    366259                print "skipping $command\n";
    367260            }
    368         } elsif (scalar @chipsToUpdate > 0) {
    369             return check_states_chip($chip_id, 0, \@chipsToUpdate, $rlabel, $need_magic);
    370         }
    371     } else {
    372         my $warpRun = $metadata;
    373         my $command = "$chiptool -listrun -chip_id $warpRun->{chip_id}";
    374         my $data = runToolAndParse($command, $verbose);
    375         my_die("failed to find chipRun $warpRun->{chip_id} for warpRun $warp_id", $PS_EXIT_UNKNOWN_ERROR)
    376             if !$data or scalar @$data != 1;
    377 
    378         my $chipRun = $data->[0];
    379 
    380         my $chipRunState = $chipRun->{state};
    381         if (($chipRunState =~ /purge/) or ($chipRunState =~ /scrub/)) {
    382             print STDERR "warpRun $warp_id depends on chipRun $chipRun->{chip_id} which is in state $chipRunState\n";
    383             faultJobs('stop', 'warp', $warp_id, $PSTAMP_GONE);
    384             return 0;
    385         }
    386         my $warpRunState = $warpRun->{state};
    387         if (($chipRunState eq 'full') and (! $need_magic or ($chipRun->{magicked} > 0)) and ($warpRunState eq 'cleaned')) {
    388             # The inputs and outputs are ready. Queue the warpRun for update.
    389 
    390             # providing no -skycell_id arguments changes all skyfiles with data_state = 'cleaned' to 'update'
    391             my $command = "$warptool -setskyfiletoupdate -warp_id $warp_id";
    392             $command .= " -set_label $rlabel" if $rlabel;
    393 
    394             if (!$no_update) {
    395                 my ( $success, $error_code, $full_buf, $stdout_buf, $stderr_buf ) =
    396                             run(command => $command, verbose => $verbose);
    397                 unless ($success) {
    398                     my_die("failed to queue ${stage}Run $stage_id $component for update", $PS_EXIT_UNKNOWN_ERROR);
    399                 }
    400             } else {
    401                 print "skipping $command\n";
    402             }
    403         } elsif ($chipRunState eq 'cleaned' or
    404             (($chipRun->{state} eq 'full') and ($need_magic and ($chipRun->{magicked} < 0) and ($chipRun->{dsRun_state} ne 'new')))) {
    405             my $data = [$chipRun];
    406             return check_states_chip($chipRun->{chip_id}, 1, $data, $rlabel, $need_magic);
    407         }
    408     }
    409 
    410     # return value may be the return status of script so zero is good
    411     return 0;
    412 }
     261            $queued_update = 1;
     262        } elsif ($chip->{fault}) {
     263            $fault_count++;
     264            my $fault =  $chip->{fault};
     265
     266            if ($fault eq $PSTAMP_GONE) {
     267                # caller will fault jobs
     268                return $PSTAMP_GONE;
     269            } elsif ($fault_count > $max_fault_count) {
     270                print "$stage $stage_id has faulted $fault_count times. Giving up\n";
     271                $fault = $PSTAMP_GONE;
     272                faultComponent('chip', $chip->{chip_id}, $chip->{class_id}, $PSTAMP_GONE);
     273                return $PSTAMP_GONE;
     274            }
     275            # fault the dependent
     276            my_die("chip $chip->{chip_id} $chip->{class_id} faulted: $chip->{fault}", $chip->{fault});
     277        } elsif ($chip->{dsFile_fault} eq $PSTAMP_GONE) {
     278            print STDERR "magicDSFile $chip->{magic_ds_id} $chip->{chip_id} $chip->{class_id} is GONE";
     279            return $PSTAMP_GONE;
     280        } elsif ($chip->{dsFile_fault} and ($chip->{dsFile_data_state} eq 'update')) {
     281            # fault the dependent
     282            my_die("magicDSFile $chip->{magic_ds_id} $chip->{chip_id} $chip->{class_id} faulted: $chip->{dsFile_fault}", $chip->{dsFile_fault});
     283        }
     284    }
     285
     286    my $status = 0;
     287    if (!$queued_update) {
     288        $status = check_states_magicDSRun('chip', $chip_id, \@chips, $rlabel, $need_magic, $raw_all_magicked, $magic_ds_id, $dsRun_state);
     289    }
     290
     291    return $status;
     292}
     293
     294sub check_states_warp {
     295    my $metadata = shift;
     296    my $rlabel = shift;     # if defined a new label for the chipRun
     297    my $need_magic = shift;
     298   
     299    my $exit_status = 0;
     300
     301    my $raw_all_magicked = 1; # this gets cleared if any of the inputs aren't destreaked
     302
     303    my $warp_id = $metadata->{warp_id};
     304    my $skycell_id = $metadata->{skycell_id};
     305    my $state = $metadata->{state};
     306    my $data_state = $metadata->{data_state};
     307
     308    if (($state =~ /error/) or ($state =~ /purged/) or ($state =~ /scrubbed/) or ($state eq 'drop') or
     309         ($data_state =~ /error/) or ($data_state =~ /purged/) or ($data_state =~ /scrubbed/) or ($data_state eq 'drop')) {
     310        print STDERR "warpRun $warp_id $skycell_id has state $state $data_state faulting jobs\n";
     311        my $error_code;
     312        if (($state eq 'error_cleaned') or ($data_state eq 'error_cleaned')) {
     313            $error_code = $PSTAMP_NOT_AVAILABLE;
     314        } else {
     315            $error_code = $PSTAMP_GONE;
     316        }
     317        return $error_code
     318    }
     319    if (($state eq 'update') and ($metadata->{fault})) {
     320        # fault dependent.
     321        my $fault = $metadata->{fault};
     322        print STDERR "warp $warp_id $skycell_id faulted: $fault";
     323        return $fault;
     324    }
     325
     326    my $skycell = $metadata;
     327
     328    # get the list of input chips for this skycell
     329    my $command = "$warptool -scmap -warp_id $warp_id -skycell_id $skycell_id";
     330    my $data = runToolAndParse($command, $verbose);
     331    if (!$data or scalar @$data == 0) {
     332        # This happens if the chipProcessedImfile disappears which happened when earlier
     333        # versions of chiptool -revertprocessedimfile didn't check the chipRun.state before
     334        # deleting the row.
     335        print STDERR "failed to find warpSkyCelllMap for warpRun $warp_id skycell_id $skycell_id";
     336        return $PSTAMP_GONE;
     337    }
     338
     339    my $chips_ready = 1;
     340    my @chipsToUpdate;
     341    my $chip_id;
     342    foreach my $chip (@$data) {
     343        $chip_id = $chip->{chip_id};
     344        if (($chip->{data_state} ne 'full') or ($need_magic and ($chip->{magicked} <= 0))) {
     345            $chips_ready = 0;
     346            $chip->{fault} = $chip->{chip_fault};
     347            push @chipsToUpdate, $chip;
     348        } else {
     349            # this chip is done
     350        }
     351    }
     352
     353    if ($chips_ready and $skycell->{data_state} ne 'update') {
     354        # the reason we defer setting the warp to update is so that we can handle error conditions at previous
     355        # stages more easily.
     356        my $command = "$warptool -setskyfiletoupdate -warp_id $warp_id -skycell_id $skycell->{skycell_id}";
     357        $command .= " -set_label $rlabel" if $rlabel;
     358
     359        if (!$no_update) {
     360            my ( $success, $error_code, $full_buf, $stdout_buf, $stderr_buf ) =
     361                        run(command => $command, verbose => $verbose);
     362            unless ($success) {
     363                my_die("failed to queue ${stage}Run $stage_id $component for update", $PS_EXIT_UNKNOWN_ERROR);
     364            }
     365        } else {
     366            print "skipping $command\n";
     367        }
     368    } elsif (scalar @chipsToUpdate > 0) {
     369        my $fault = check_states_chip($chip_id, \@chipsToUpdate, $rlabel, $need_magic);
     370        if ($fault) {
     371            if ($fault eq $PSTAMP_GONE) {
     372                # chip or dsfile that this skycell depends on has faulted in a way that is not recoverable
     373                # fault the skycell
     374                faultComponent('warp', $warp_id, $skycell->{skycell_id}, $PSTAMP_GONE);
     375            }
     376            $exit_status = $fault;
     377        }
     378    }
     379
     380    # return value may be used as the return status of script so we use zero as success
     381    return $exit_status;
     382}
     383
    413384sub check_states_diff {
    414385    my $metadata = shift;
    415     my $whole_run = shift;  # if true queue entire run for update
    416386    my $rlabel = shift;     # if defined a new label for the chipRun
    417387    my $need_magic = shift;
     
    419389    my $diff_id   = $metadata->{diff_id};
    420390    my $diff_mode = $metadata->{diff_mode};
    421     if (!$whole_run) {
    422         my $skycell = $metadata;
    423         my $skycell_id = $skycell->{skycell_id};
    424 
    425         if ($diff_mode == $IPP_DIFF_MODE_WARP_STACK ) {
    426             # check the state of the template stack
    427             my $command = "$stacktool -sumskyfile -stack_id $skycell->{stack2}";
    428             my $stack = runToolAndParseExpectOne($command, $verbose);
    429             my_die("failed to find stackSumSkyfile for stack_id $skycell->{stack2}", $PS_EXIT_UNKNOWN_ERROR) if !$stack;
    430 
    431             if ($stack->{state} ne 'full') {
    432                 print STDERR "template stack for diffRun $diff_id $skycell_id is not in full state faulting jobs\n";
    433                 # this faults all jobs depending on this dep_id
    434                 faultJobs('stop', 'diff', $diff_id, $PSTAMP_GONE);
    435                 return $PSTAMP_GONE;
    436             }
    437 
    438             # now check the warp
    439             $command = "$warptool -warped -warp_id $skycell->{warp1} -skycell_id $skycell_id";
    440             my $warp = runToolAndParseExpectOne($command, $verbose);
    441             my_die("failed to find warpSkyfile for warpRun $skycell->{warp1} skycell_id $skycell_id", $PS_EXIT_UNKNOWN_ERROR) if !$warp;
    442 
    443             if ($warp->{data_state} ne 'full') {
    444                 return check_states_warp($warp, 0, $rlabel, $need_magic);
    445             }
    446             # warps are ready fall through and queue the diff update
    447         } elsif ($diff_mode eq $IPP_DIFF_MODE_WARP_WARP) {
    448             my $command = "$warptool -warped -warp_id $skycell->{warp1} -skycell_id $skycell_id";
    449             my $warp1 = runToolAndParseExpectOne($command, $verbose);
    450             my_die("failed to find warpSkyfile for warpRun $skycell->{warp1} skycell_id $skycell_id", $PS_EXIT_UNKNOWN_ERROR) if !$warp1;
    451 
    452             my $warps_ready = 1;
    453             my $status = 0;
    454             if ($warp1->{data_state} ne 'full') {
    455                 $warps_ready = 0;
    456                 $status = check_states_warp($warp1, 0, $rlabel, $need_magic);
    457                 if ($status) {
    458                     return $status;
     391    my $skycell = $metadata;
     392    my $skycell_id = $skycell->{skycell_id};
     393
     394    if ($diff_mode == $IPP_DIFF_MODE_WARP_STACK ) {
     395        # check the state of the template stack
     396        my $command = "$stacktool -sumskyfile -stack_id $skycell->{stack2}";
     397        my $stack = runToolAndParseExpectOne($command, $verbose);
     398        my_die("failed to find stackSumSkyfile for stack_id $skycell->{stack2}", $PS_EXIT_UNKNOWN_ERROR) if !$stack;
     399
     400        if ($stack->{state} ne 'full') {
     401            print STDERR "template stack for diffRun $diff_id $skycell_id is not in full state faulting jobs\n";
     402            faultComponent('diff', $diff_id, $skycell_id, $PSTAMP_GONE);
     403            return $PSTAMP_GONE;
     404        }
     405
     406        # now check the warp
     407        $command = "$warptool -warped -warp_id $skycell->{warp1} -skycell_id $skycell_id";
     408        my $warp = runToolAndParseExpectOne($command, $verbose);
     409        my_die("failed to find warpSkyfile for warpRun $skycell->{warp1} skycell_id $skycell_id", $PS_EXIT_UNKNOWN_ERROR) if !$warp;
     410
     411        if ($warp->{data_state} ne 'full') {
     412            my $warp_status = check_states_warp($warp, 0, $rlabel, $need_magic);
     413            if ($warp_status eq $PSTAMP_GONE) {
     414                faultComponent('diff', $diff_id, $skycell_id, $PSTAMP_GONE);
     415            }
     416            return $warp_status;
     417        }
     418        # warps are ready fall through and queue the diff update
     419    } elsif ($diff_mode eq $IPP_DIFF_MODE_WARP_WARP) {
     420        my $command = "$warptool -warped -warp_id $skycell->{warp1} -skycell_id $skycell_id";
     421        my $warp1 = runToolAndParseExpectOne($command, $verbose);
     422        my_die("failed to find warpSkyfile for warpRun $skycell->{warp1} skycell_id $skycell_id", $PS_EXIT_UNKNOWN_ERROR) if !$warp1;
     423
     424        my $warps_ready = 1;
     425        my $warp_status = 0;
     426        if ($warp1->{data_state} ne 'full') {
     427            $warps_ready = 0;
     428            $warp_status = check_states_warp($warp1, 0, $rlabel, $need_magic);
     429            if ($warp_status) {
     430                if ($warp_status eq $PSTAMP_GONE) {
     431                    faultComponent('diff', $diff_id, $skycell_id, $PSTAMP_GONE);
    459432                }
    460             }
    461             $command = "$warptool -warped -warp_id $skycell->{warp2} -skycell_id $skycell_id";
    462             my $warp2 = runToolAndParseExpectOne($command, $verbose);
    463             my_die("failed to find warpSkyfile for warpRun $skycell->{warp2} skycell_id $skycell_id", $PS_EXIT_UNKNOWN_ERROR) if !$warp2;
    464 
    465             if ($warp2->{data_state} ne 'full') {
    466                 $warps_ready = 0;
    467                 $status = check_states_warp($warp2, 0, $rlabel, $need_magic);
    468             }
    469 
    470             if (!$warps_ready) {
    471                 # don't queue the diff update yet
    472                 return $status;
    473             }
    474 
    475         } elsif ($diff_mode == $IPP_DIFF_MODE_STACK_STACK ) {
    476             # check the state of the input stack
    477             my $command = "$stacktool -sumskyfile -stack_id $skycell->{stack2}";
    478             my $stack1 = runToolAndParseExpectOne($command, $verbose);
    479             my_die("failed to find stackSumSkyfile for stack_id $skycell->{stack2}", $PS_EXIT_UNKNOWN_ERROR) if !$stack1;
    480 
    481             if ($stack1->{state} ne 'full') {
    482                 print STDERR "input stack for diffRun $diff_id $skycell_id is not in full state faulting jobs\n";
    483                 # this faults all jobs depending on this dep_id
    484                 faultJobs('stop', 'diff', $diff_id, $PSTAMP_GONE);
    485                 return $PSTAMP_GONE;
    486             }
    487             # check the state of the template stack
    488             $command = "$stacktool -sumskyfile -stack_id $skycell->{stack2}";
    489             my $stack2 = runToolAndParseExpectOne($command, $verbose);
    490             my_die("failed to find stackSumSkyfile for stack_id $skycell->{stack2}", $PS_EXIT_UNKNOWN_ERROR) if !$stack2;
    491 
    492             if ($stack2->{state} ne 'full') {
    493                 print STDERR "template stack for diffRun $diff_id $skycell_id is not in full state faulting jobs\n";
    494                 # this faults all jobs depending on this dep_id
    495                 faultJobs('stop', 'diff', $diff_id, $PSTAMP_GONE);
    496                 return $PSTAMP_GONE;
    497             }
    498 
    499             # inputs are ready fall through and queue the diff update
    500         } elsif ($diff_mode == $IPP_DIFF_MODE_STACK_WARP ) {
    501             # check the state of the input stack
    502             my $command = "$stacktool -sumskyfile -stack_id $skycell->{stack1}";
    503             my $stack = runToolAndParseExpectOne($command, $verbose);
    504             my_die("failed to find stackSumSkyfile for stack_id $skycell->{stack1}", $PS_EXIT_UNKNOWN_ERROR) if !$stack;
    505 
    506             if ($stack->{state} ne 'full') {
    507                 print STDERR "input stack for diffRun $diff_id $skycell_id is not in full state faulting jobs\n";
    508                 # this faults all jobs depending on this dep_id
    509                 faultJobs('stop', 'diff', $diff_id, $PSTAMP_GONE);
    510                 return $PSTAMP_GONE;
    511             }
    512 
    513             # now check the template warp
    514             $command = "$warptool -warped -warp_id $skycell->{warp2} -skycell_id $skycell_id";
    515             my $warp = runToolAndParseExpectOne($command, $verbose);
    516             my_die("failed to find warpSkyfile for warpRun $skycell->{warp2} skycell_id $skycell_id", $PS_EXIT_UNKNOWN_ERROR) if !$warp;
    517 
    518             if ($warp->{data_state} ne 'full') {
    519                 return check_states_warp($warp, 0, $rlabel, $need_magic);
    520             }
    521             # warps are ready fall through and queue the diff update
     433                return $warp_status;
     434            }
     435        }
     436        $command = "$warptool -warped -warp_id $skycell->{warp2} -skycell_id $skycell_id";
     437        my $warp2 = runToolAndParseExpectOne($command, $verbose);
     438        my_die("failed to find warpSkyfile for warpRun $skycell->{warp2} skycell_id $skycell_id", $PS_EXIT_UNKNOWN_ERROR) if !$warp2;
     439
     440        if ($warp2->{data_state} ne 'full') {
     441            $warps_ready = 0;
     442            $warp_status = check_states_warp($warp2, 0, $rlabel, $need_magic);
     443            if ($warp_status eq $PSTAMP_GONE) {
     444                faultComponent('diff', $diff_id, $skycell_id, $PSTAMP_GONE);
     445            }
     446        }
     447
     448        if (!$warps_ready) {
     449            # don't queue the diff update yet
     450            return $warp_status;
     451        }
     452        # inputs are ready fall through and queue the diff update
     453
     454    } elsif ($diff_mode == $IPP_DIFF_MODE_STACK_STACK ) {
     455        # check the state of the input stack
     456        my $command = "$stacktool -sumskyfile -stack_id $skycell->{stack2}";
     457        my $stack1 = runToolAndParseExpectOne($command, $verbose);
     458        my_die("failed to find stackSumSkyfile for stack_id $skycell->{stack2}", $PS_EXIT_UNKNOWN_ERROR) if !$stack1;
     459
     460        if ($stack1->{state} ne 'full') {
     461            print STDERR "input stack for diffRun $diff_id $skycell_id is not in full state faulting jobs\n";
     462            faultComponent('diff', $diff_id, $skycell_id, $PSTAMP_GONE);
     463            return $PSTAMP_GONE;
     464        }
     465        # check the state of the template stack
     466        $command = "$stacktool -sumskyfile -stack_id $skycell->{stack2}";
     467        my $stack2 = runToolAndParseExpectOne($command, $verbose);
     468        my_die("failed to find stackSumSkyfile for stack_id $skycell->{stack2}", $PS_EXIT_UNKNOWN_ERROR) if !$stack2;
     469
     470        if ($stack2->{state} ne 'full') {
     471            print STDERR "template stack for diffRun $diff_id $skycell_id is not in full state faulting jobs\n";
     472            faultComponent('diff', $diff_id, $skycell_id, $PSTAMP_GONE);
     473            return $PSTAMP_GONE;
     474        }
     475
     476        # inputs are ready fall through and queue the diff update
     477    } elsif ($diff_mode == $IPP_DIFF_MODE_STACK_WARP ) {
     478        # check the state of the input stack
     479        my $command = "$stacktool -sumskyfile -stack_id $skycell->{stack1}";
     480        my $stack = runToolAndParseExpectOne($command, $verbose);
     481        my_die("failed to find stackSumSkyfile for stack_id $skycell->{stack1}", $PS_EXIT_UNKNOWN_ERROR) if !$stack;
     482
     483        if ($stack->{state} ne 'full') {
     484            print STDERR "input stack for diffRun $diff_id $skycell_id is not in full state faulting jobs\n";
     485            faultComponent('diff', $diff_id, $skycell_id, $PSTAMP_GONE);
     486            return $PSTAMP_GONE;
     487        }
     488
     489        # now check the template warp
     490        $command = "$warptool -warped -warp_id $skycell->{warp2} -skycell_id $skycell_id";
     491        my $warp = runToolAndParseExpectOne($command, $verbose);
     492        my_die("failed to find warpSkyfile for warpRun $skycell->{warp2} skycell_id $skycell_id", $PS_EXIT_UNKNOWN_ERROR) if !$warp;
     493
     494        if ($warp->{data_state} ne 'full') {
     495            my $warp_status = check_states_warp($warp, 0, $rlabel, $need_magic);
     496            if ($warp_status eq $PSTAMP_GONE) {
     497                faultComponent('diff', $diff_id, $skycell_id, $PSTAMP_GONE);
     498            }
     499            return $warp_status;
     500        }
     501        # warps are ready fall through and queue the diff update
     502    } else {
     503        my_die("unexpected diff_mode found: $diff_mode", $PS_EXIT_PROG_ERROR);
     504    }
     505
     506    if ($skycell->{data_state} ne 'update') {
     507        my $command = "$difftool -setskyfiletoupdate -diff_id $diff_id -skycell_id $skycell_id";
     508        $command .= " -set_label $rlabel" if $rlabel;
     509
     510        if (!$no_update) {
     511            my ( $success, $error_code, $full_buf, $stdout_buf, $stderr_buf ) =
     512                        run(command => $command, verbose => $verbose);
     513            unless ($success) {
     514                my_die("failed to queue ${stage}Run $stage_id $component for update", $PS_EXIT_UNKNOWN_ERROR);
     515            }
    522516        } else {
    523             my_die("unexpected diff_mode found: $diff_mode", $PS_EXIT_PROG_ERROR);
    524         }
    525 
    526         if ($skycell->{data_state} ne 'update') {
    527             my $command = "$difftool -setskyfiletoupdate -diff_id $diff_id -skycell_id $skycell_id";
    528             $command .= " -set_label $rlabel" if $rlabel;
    529 
    530             if (!$no_update) {
    531                 my ( $success, $error_code, $full_buf, $stdout_buf, $stderr_buf ) =
    532                             run(command => $command, verbose => $verbose);
    533                 unless ($success) {
    534                     my_die("failed to queue ${stage}Run $stage_id $component for update", $PS_EXIT_UNKNOWN_ERROR);
    535                 }
    536             } else {
    537                 print "skipping $command\n";
    538             }
    539         }
    540     } else {
    541         # XXX todo whole_run
    542         return $PS_EXIT_PROG_ERROR;
     517            print "skipping $command\n";
     518        }
    543519    }
    544520
     
    633609}
    634610
    635 sub faultJobs {
    636     my ($state, $stage, $stage_id, $job_fault) = @_;
    637 
    638     my $command = "$pstamptool -updatejob -set_state stop -set_fault $job_fault -dep_id $dep_id";
    639     $command .= " -dbname $dbname" if $dbname;
    640     $command .= " -dbserver $dbserver" if $dbserver;
    641     if (!$no_update) {
    642         my ( $success, $error_code, $full_buf, $stdout_buf, $stderr_buf ) =
    643                     run(command => $command, verbose => $verbose);
    644         unless ($success) {
    645             my_die("failed to set pstampJob.fault for dep_id: $dep_id",
    646                 $PS_EXIT_UNKNOWN_ERROR);
    647         }
    648     } else {
    649         print "skipping $command\n";
    650     }
    651 }
    652 
     611# Check the data_state of the magicDSFile associated with this component (currently only chip stage is supported)
     612# Returns zero on success.
     613# returns the PSTAMP fault code to use for the jobs if an unrecoverable error is detected
     614# And faults the dependent if transient errors occur
    653615sub check_states_magicDSRun {
    654616    my $stage = shift;
     
    661623    my $dsRun_state = shift;
    662624
    663     # XXX: this code assumes that destreaking is handled at the chip stage
     625    # XXX: this code assumes that for update destreaking is only performed for chip stage
    664626    my_die ("check_states_magicDSRun only implemented for chip stage", $PS_EXIT_PROG_ERROR) if $stage ne 'chip';
    665627
     
    675637    if ($need_magic and !$input_magicked) {
    676638        if (!defined($dsRun_state) or ($dsRun_state eq 'NULL')) {
     639            # it is arguably a programming error if we get here
    677640            print "No magicDSRun for chipRun $stage_id and magic is required\n";
    678             faultJobs('stop', undef, undef, $PSTAMP_NOT_DESTREAKED);
     641            return $PSTAMP_NOT_DESTREAKED;
    679642        } elsif (($dsRun_state eq 'cleaned') or ($dsRun_state eq 'update')) {
    680643            foreach my $c (@$components) {
     
    684647                    my_die("failed to find magicDSFile for ${stage}Run $stage_id $c", $PS_EXIT_UNKNOWN_ERROR);
    685648                }
    686                 if ($dsfile->{fault} > 0) {
     649                if ($dsfile->{fault} eq $PSTAMP_GONE) {
     650                    print "magicDSFile has fault $PSTAMP_GONE\n";
     651                    return $PSTAMP_GONE;
     652                }
     653                #  destreak faults get cleared when the component is set to be updated
     654                if (($dsfile->{data_state} eq 'update') and ($dsfile->{fault} > 0)) {
     655                    $fault_count++;
     656                    if ($fault_count > $max_fault_count) {
     657                        print "Destreak file $magic_ds_id $component for $stage $stage_id has faulted $fault_count times. Giving up\n";
     658                        faultComponent('destreak', $magic_ds_id, $component, $PSTAMP_GONE);
     659                        return $PSTAMP_GONE;
     660                    }
     661                    # Assume fault is transient
    687662                    my_die("faulted magicDSFile for ${stage}Run $stage_id $c fault: $dsfile->{fault}",
    688663                        $PS_EXIT_UNKNOWN_ERROR);
     
    690665                if ($dsfile->{data_state} eq 'cleaned') {
    691666                    $command = "$magicdstool -setfiletoupdate -magic_ds_id $magic_ds_id -component $c";
    692                     # XXX: get the recoveryroot from a config file (it isn't actually used except to check whether it is in nebulous)
     667                    # XXX: get the recoveryroot from a config file
     668                    # (It isn't actually used except to check whether it is a nebulous path)
    693669                    $command .= " -set_recoveryroot neb://any/gpc1/destreak/recover";
    694670                    $command .= " -set_label $rlabel" if $rlabel;
     
    708684            }
    709685        } elsif ($dsRun_state eq 'failed_revert') {
     686            # XXX: revert failures are rarely fixed. give up but say it's just not available not GONE
    710687            print "magicDSRun.state = $dsRun_state for chipRun $stage_id is in state failed_revert cannot update\n";
    711             faultJobs('stop', undef, undef, $PSTAMP_NOT_AVAILABLE);
     688            return $PSTAMP_NOT_AVAILABLE;
    712689        } else {
    713690            print "magicDSRun.state = $dsRun_state for chipRun $stage_id";
     
    716693        }
    717694    }
     695
    718696    return 0;
     697}
     698
     699sub faultJobs {
     700    my ($job_fault) = @_;
     701
     702    my $command = "$pstamptool -updatejob -set_state stop -set_fault $job_fault -dep_id $dep_id";
     703    if (!$no_update) {
     704        my ( $success, $error_code, $full_buf, $stdout_buf, $stderr_buf ) =
     705                    run(command => $command, verbose => $verbose);
     706        unless ($success) {
     707            my_die("failed to set pstampJob.fault for dep_id: $dep_id",
     708                $PS_EXIT_UNKNOWN_ERROR);
     709        }
     710    } else {
     711        print "skipping $command\n";
     712    }
     713}
     714
     715sub faultComponent {
     716    my ($stage, $stage_id, $component, $fault) = @_;
     717
     718    my $command;
     719    if ($stage eq 'chip') {
     720        $command = "$chiptool -updateprocessedimfile -chip_id $stage_id -class_id $component";
     721    } elsif ($stage eq 'warp') {
     722        $command = "$warptool -updateskyfile -warp_id $stage_id -skycell_id $component";
     723    } elsif ($stage eq 'diff') {
     724        $command = "$difftool -updatediffskyfile -diff_id $stage_id -skycell_id $component";
     725    } elsif ($stage eq 'destreak') {
     726        $command = "$magicdstool -updatedestreakedfile -magic_ds_id $stage_id -component $component";
     727    } else {
     728        my_die("unexpected stage $stage found", $PS_EXIT_PROG_ERROR);
     729    }
     730
     731    $command .= " -fault $fault";
     732
     733    if (!$no_update) {
     734        my ( $success, $error_code, $full_buf, $stdout_buf, $stderr_buf ) =
     735                    run(command => $command, verbose => $verbose);
     736        unless ($success) {
     737            carp "$cmd failed";
     738        }
     739    } else {
     740        print "skipping $command\n";
     741    }
    719742}
    720743
     
    726749
    727750    my $command = "$pstamptool -updatedependent -set_fault $fault -dep_id $dep_id";
    728     $command .= " -dbname $dbname" if $dbname;
    729     $command .= " -dbserver $dbserver" if $dbserver;
    730751    if (!$no_update) {
    731752        my ( $success, $error_code, $full_buf, $stdout_buf, $stderr_buf ) =
Note: See TracChangeset for help on using the changeset viewer.