IPP Software Navigation Tools IPP Links Communication Pan-STARRS Links

Ignore:
Timestamp:
Apr 4, 2008, 12:44:56 PM (18 years ago)
Author:
Paul Price
Message:

Optimising psImageConvolveFFT. Made a couple of optimisations: 1. Moved code into psImageFFT.c, so I can use FFTW directly. FFTW, through its 'advanced interface' allows two FFTs to be performed at the same time and 'the resulting plans can often be faster than calling FFTW multiple times for the individual transforms' (FFTW 3.1.2 manual, p30). 2. The convolved image needs to be normalised because FFTW doesn't take out the sqrt(N) factors in the FFT. Instead of multiplying the entire convolved image, we can multiply the much smaller number of pixels in the kernel. Tested this code with tap_psImageConvolve2 and it seems to work. Running convolutionBench unoptimised on mithrandir produces times that are comparable to what I got a while back running optimised. Running it optimized on alala produces speed-ups ranging from 25 to 350%.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/psLib/src/fft/psImageFFT.c

    r11716 r17320  
    66/// @author Robert DeSonia, MHPCC
    77///
    8 /// @version $Revision: 1.23 $ $Name: not supported by cvs2svn $
    9 /// @date $Date: 2007-02-09 00:22:55 $
     8/// @version $Revision: 1.24 $ $Name: not supported by cvs2svn $
     9/// @date $Date: 2008-04-04 22:44:56 $
    1010///
    1111/// Copyright 2004-2005 Maui High Performance Computing Center, University of Hawaii
     
    2727#include "psConstants.h"
    2828#include "psImageStructManip.h"
     29#include "psImageConvolve.h"
    2930#include "psImageFFT.h"
    3031
     
    191192    return real;
    192193}
    193 
    194194
    195195
     
    274274    return true;
    275275}
     276
     277
     278psImage *psImageConvolveFFT(psImage *out, const psImage *in, const psImage *mask, psMaskType maskVal,
     279                            const psKernel *kernel)
     280{
     281    PS_ASSERT_IMAGE_NON_NULL(in, NULL);
     282    PS_ASSERT_IMAGE_TYPE(in, PS_TYPE_F32, NULL);
     283    PS_ASSERT_KERNEL_NON_NULL(kernel, NULL);
     284    if (mask) {
     285        PS_ASSERT_IMAGE_NON_NULL(mask, NULL);
     286        PS_ASSERT_IMAGE_TYPE(mask, PS_TYPE_MASK, NULL);
     287        PS_ASSERT_IMAGES_SIZE_EQUAL(mask, in, NULL);
     288    }
     289
     290    int numCols = in->numCols, numRows = in->numRows; // Size of image
     291    int xMin = kernel->xMin, xMax = kernel->xMax, yMin = kernel->yMin, yMax = kernel->yMax; // Kernel sizes
     292
     293    // Need to pad the input image to protect from wrap-around effects
     294    if (xMax - xMin > numCols || yMax - yMin > numRows) {
     295        // Cannot pad the image if the kernel is larger.
     296        psError(PS_ERR_BAD_PARAMETER_SIZE, true,
     297                _("Kernel cannot extend further than input image size (%dx%d vs %dx%d)."),
     298                xMax, yMax, numCols, numRows);
     299        return NULL;
     300    }
     301    int paddedCols = numCols + PS_MAX(-xMin, xMax); // Number of columns in padded image
     302    int paddedRows = numRows + PS_MAX(-yMin, yMax); // Number of rows in padded image
     303    int numPadded = paddedCols * paddedRows; // Number of pixels in padded image
     304
     305    // Create data array containing the padded image and padded kernel
     306    psF32 *data = fftwf_malloc(2 * numPadded * PSELEMTYPE_SIZEOF(PS_TYPE_F32)); // Data for FFTW
     307    psF32 *dataPtr = data;              // Pointer into FFTW data
     308    psF32 **imageData = in->data.F32;   // Pointer into image data
     309
     310    // Image part of data array
     311    size_t goodBytes = numCols * PSELEMTYPE_SIZEOF(PS_TYPE_F32); // Number of bytes per image row
     312    size_t padBytes = (paddedCols - numCols) * PSELEMTYPE_SIZEOF(PS_TYPE_F32); // Number of bytes to pad
     313    for (int y = 0; y < numRows; y++, dataPtr += paddedCols, imageData++) {
     314        memcpy(dataPtr, *imageData, goodBytes);
     315        memset(dataPtr + numCols, 0, padBytes);
     316    }
     317    memset(dataPtr, 0, (paddedRows - numRows) * paddedCols * PSELEMTYPE_SIZEOF(PS_TYPE_F32));
     318
     319#if 0
     320    {
     321        // Use this for inspecting the result of copying the image
     322        psImage *test = psImageAlloc(paddedCols, paddedRows, PS_TYPE_F32);
     323        psFree(test->p_rawDataBuffer);
     324        test->p_rawDataBuffer = data;
     325        test->data.V[0] = test->p_rawDataBuffer;
     326        for (int y = 1; y < paddedRows; y++) {
     327            test->data.V[y] = (psPtr)((int8_t *)test->data.V[y - 1] +
     328                                      paddedCols * PSELEMTYPE_SIZEOF(PS_TYPE_F32));
     329        }
     330        // View image here
     331        test->p_rawDataBuffer = NULL;
     332        psFree(test);
     333    }
     334#endif
     335
     336    // Kernel part of data array
     337    dataPtr = data + numPadded;         // Reset to kernel image location
     338    float norm = 1.0 / (float)(paddedRows * paddedCols); // Normalisation to correct for FFT
     339    // We could generate the padded kernel image using memcpy, but by going pixel by pixel we can apply the
     340    // normalisation that corrects for the FFT renormalisation.  By applying it to the kernel here, we save
     341    // applying it to the entire output image.
     342    int xNegMin = PS_MIN(-1, xMin), xNegMax = PS_MIN(-1, xMax); // Min and max for x when negative
     343    int xPosMin = PS_MAX(0, xMin), xPosMax = PS_MAX(0, xMax); // Min and max for x when positive
     344    int yNegMin = PS_MIN(-1, yMin), yNegMax = PS_MIN(-1, yMax); // Min and max for x when negative
     345    int yPosMin = PS_MAX(0, yMin), yPosMax = PS_MAX(0, yMax); // Min and max for x when positive
     346    int blankCols = xNegMin + paddedCols - xPosMax - 1; // Number of columns between kernel extrema
     347    int blankRows = (yNegMin + paddedRows - yPosMax - 1) * paddedCols; // Rows between kernel extrema
     348    size_t blankColBytes = blankCols * PSELEMTYPE_SIZEOF(PS_TYPE_F32); // Number of bytes in blankCols
     349    for (int y = yPosMin; y <= yPosMax; y++) {
     350        // y is positive
     351        for (int x = xPosMin; x <= xPosMax; x++, dataPtr++) {
     352            // x is positive
     353            *dataPtr = kernel->kernel[y][x] * norm;
     354        }
     355        // Columns between kernel extrema
     356        memset(dataPtr, 0, blankColBytes);
     357        dataPtr += blankCols;
     358        for (int x = xNegMin; x <= xNegMax; x++, dataPtr++) {
     359            // x is negative
     360            *dataPtr = kernel->kernel[y][x] * norm;
     361        }
     362    }
     363    // Rows between kernel extrema
     364    memset(dataPtr, 0, blankRows * PSELEMTYPE_SIZEOF(PS_TYPE_F32));
     365    dataPtr += blankRows;
     366    for (int y = yNegMin; y <= yNegMax; y++) {
     367        // y is negative
     368        for (int x = xPosMin; x <= xPosMax; x++, dataPtr++) {
     369            // x is positive
     370            *dataPtr = kernel->kernel[y][x] * norm;
     371        }
     372        // Columns between kernel extrema
     373        memset(dataPtr, 0, blankColBytes);
     374        dataPtr += blankCols;
     375        for (int x = xNegMin; x <= xNegMax; x++, dataPtr++) {
     376            // x is negative
     377            *dataPtr = kernel->kernel[y][x] * norm;
     378        }
     379    }
     380
     381#if 0
     382    {
     383        // Use this for inspecting the result of copying the kernel
     384        psImage *test = psImageAlloc(paddedCols, paddedRows, PS_TYPE_F32);
     385        psFree(test->p_rawDataBuffer);
     386        test->p_rawDataBuffer = &data[numPadded];
     387        test->data.V[0] = test->p_rawDataBuffer;
     388        for (int y = 1; y < paddedRows; y++) {
     389            test->data.V[y] = (psPtr)((int8_t *)test->data.V[y - 1] +
     390                                      paddedCols * PSELEMTYPE_SIZEOF(PS_TYPE_F32));
     391        }
     392        // View image here
     393        test->p_rawDataBuffer = NULL;
     394        psFree(test);
     395    }
     396#endif
     397
     398    // Mask bad pixels (which may be NANs), lest they infect everything
     399    if (mask && maskVal) {
     400        for (int y = 0; y < numRows; y++) {
     401            for (int x = 0; x < numCols; x++) {
     402                if (mask->data.PS_TYPE_MASK_DATA[y][x] & maskVal) {
     403                    data[x + paddedCols * y] = 0;
     404                }
     405            }
     406        }
     407    }
     408
     409    // Do the forward FFT
     410    // Note that the FFT images have different size from the input
     411    fftwf_complex *fft = fftwf_malloc(2 * (paddedCols/2 + 1) * paddedRows * sizeof(fftwf_complex)); // FFT
     412    int size[] = { paddedCols, paddedRows }; // Size of transforms
     413    int fftCols = paddedCols/2 + 1, fftRows = paddedRows; // Size of FFT images
     414    int fftPixels = fftCols * fftRows;  // Number of pixels in FFT image
     415    fftwf_plan forward = fftwf_plan_many_dft_r2c(2, size, 2, data, NULL, 1, paddedCols * paddedRows,
     416                                                 fft, NULL, 1, fftPixels, FFTW_PLAN_RIGOR);
     417    fftwf_execute(forward);
     418    fftwf_destroy_plan(forward);
     419
     420    // Multiply the two transforms
     421    for (int i = 0, j = fftPixels; i < fftPixels; i++, j++) {
     422        // (a + bi) * (c + di) = (ac - bd) + (bc + ad)i
     423#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
     424        // C99 complex support
     425        fft[i] *= fft[j];
     426#else
     427        // FFTW's backup complex support
     428        float imageReal = fft[i][0], imageImag = fft[i][1];
     429        float kernelReal = fft[j][0], kernelImag = fft[j][1];
     430        fft[i][0] = imageReal * kernelReal - imageImag * kernelImag;
     431        fft[i][1] = imageImag * kernelReal + imageReal * kernelImag;
     432#endif
     433    }
     434
     435    // Do the backward FFT
     436    fftwf_plan backward = fftwf_plan_dft_c2r_2d(paddedRows, paddedCols, fft, data, FFTW_PLAN_RIGOR);
     437    fftwf_execute(backward);
     438    fftwf_destroy_plan(backward);
     439    fftwf_free(fft);
     440
     441    // Copy into the target, without the padding
     442    out = psImageRecycle(out, numCols, numRows, PS_TYPE_F32);
     443    psF32 **outData = out->data.F32;    // Pointer into output
     444    dataPtr = data;                     // Reset to start
     445    for (int y = 0; y < numRows; y++, outData++, dataPtr += paddedCols) {
     446        memcpy(*outData, dataPtr, goodBytes);
     447    }
     448    //    fftwf_free(data);
     449
     450    return out;
     451}
Note: See TracChangeset for help on using the changeset viewer.