IPP Software Navigation Tools IPP Links Communication Pan-STARRS Links

Ignore:
Timestamp:
Jan 15, 2010, 3:01:31 PM (16 years ago)
Author:
eugene
Message:

handle situations where the data range is absurd due to a smallish number of insane outliers

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/psLib/src/math/psStats.c

    r25884 r26615  
    749749    // Iterate to get the best bin size; an iteration limit is enforced at the bottom of the loop.
    750750    for (int iterate = 1; iterate > 0; iterate++) {
    751         psTrace(TRACE, 6,
    752                 "-------------------- Iterating on Bin size.  Iteration number %d --------------------\n",
    753                 iterate);
     751        psTrace(TRACE, 6, "-------------------- Iterating on Bin size.  Iteration number %d --------------------\n", iterate);
    754752
    755753        // Get the minimum and maximum values
     
    791789        psTrace(TRACE, 6, "Initial robust bin size is %.2f\n", binSize);
    792790
    793         // ADD step 0: Construct the histogram with the specified bin size.  NOTE: we can not specify the bin
    794         // size precisely since the argument to psHistogramAlloc() is the number of bins, not the binSize.  If
    795         // we get here, we know that binSize != 0.0.
    796         long numBins = (max - min) / binSize; // Number of bins
     791        // ADD step 0: Construct the histogram with the specified bin size.  NOTE: we can
     792        // not specify the bin size precisely since the argument to psHistogramAlloc() is
     793        // the number of bins, not the binSize.  If we get here, we know that binSize !=
     794        // 0.0.  We can also have a floating-point round-off error such that the last bin
     795        // of the histogram does not correspond exactly with the value of 'max'.  Let's be
     796        // a bit generous and extend the histogram by two bins in either direction
     797        long numBins = 4 + (max - min) / binSize; // Number of bins
    797798        psTrace(TRACE, 6, "Numbins is %ld\n", numBins);
    798799        psTrace(TRACE, 6, "Creating a robust histogram from data range (%.2f - %.2f)\n", min, max);
    799800        // Generate the histogram
    800         histogram = psHistogramAlloc(min, max, numBins);
     801        histogram = psHistogramAlloc(min - 2.0*binSize, max + 2.0*binSize, numBins);
    801802        // XXXXX we need to consider this step if errors -> variance
    802803        if (!psVectorHistogram(histogram, myVector, errors, mask, maskVal)) {
     
    807808            psFree(statsMinMax);
    808809            psFree(mask);
    809 
    810810            return false;
    811811        }
     
    814814            PS_VECTOR_PRINT_F32(histogram->nums);
    815815        }
     816
     817        // perversity check: if most of the values land in a single bin, then we probably
     818        // have a perverse case (eg, small number of points at extremely large / small
     819        // values; nearly bi-modal distribution).  if so, keep only points within 5? 10?
     820        // bins of that excess bin:
     821        int nMaxBin = 0;
     822        int iMaxBin = 0;
     823        for (long i = 1; i < histogram->nums->n; i++) {
     824            if (histogram->nums->data.F32[i] > nMaxBin) {
     825                nMaxBin = histogram->nums->data.F32[i];
     826                iMaxBin = i;
     827            }
     828        }
     829        if (nMaxBin > numValid / 2) {
     830            float minKeep = histogram->bounds->data.F32[iMaxBin] - 10*binSize;
     831            float maxKeep = histogram->bounds->data.F32[iMaxBin + 1] + 10*binSize;
     832            int nInvalid = 0;
     833            for (long i = 0; i < myVector->n; i++) {
     834                // skip the already-masked values
     835                if (mask->data.PS_TYPE_VECTOR_MASK_DATA[i] & maskVal) continue;
     836                bool invalid = false;
     837                invalid |= (myVector->data.F32[i] < minKeep);
     838                invalid |= (myVector->data.F32[i] > maxKeep);
     839                invalid |= (!isfinite(myVector->data.F32[i]));
     840                if (!invalid) continue;
     841                mask->data.PS_TYPE_VECTOR_MASK_DATA[i] = maskVal;
     842                nInvalid ++;
     843            }
     844            psTrace(TRACE, 6, "data is concentrated in a single bin, masking %d extreme outliers and retrying\n", nInvalid);
     845            psFree(histogram);
     846            psFree(cumulative);
     847            continue;
     848        }
    816849
    817850        // ADD step 1: Convert the specific histogram to a cumulative histogram
     
    10071040    stats->robustN50 = N50;
    10081041    psTrace(TRACE, 6, "The robustN50 is %ld.\n", N50);
     1042    psTrace(TRACE, 6, "The robust median and stdev are %f, %f\n", stats->robustMedian, stats->robustStdev);
    10091043
    10101044    // Clean up
Note: See TracChangeset for help on using the changeset viewer.