Index: trunk/psLib/src/fft/psImageFFT.c
===================================================================
--- trunk/psLib/src/fft/psImageFFT.c	(revision 29014)
+++ trunk/psLib/src/fft/psImageFFT.c	(revision 29592)
@@ -492,4 +492,35 @@
     FFTW_UNLOCK;
 
+#if 0
+    {
+        // Use this for inspecting the result of the fft XXX : K & I are backwards...
+        psImage *testKr = psImageAlloc(fftCols, fftRows, PS_TYPE_F32);
+        psImage *testKi = psImageAlloc(fftCols, fftRows, PS_TYPE_F32);
+        psImage *testIr = psImageAlloc(fftCols, fftRows, PS_TYPE_F32);
+        psImage *testIi = psImageAlloc(fftCols, fftRows, PS_TYPE_F32);
+        for (int iy = 0; iy < fftRows; iy++) {
+	    for (int ix = 0; ix < fftCols; ix++) {
+#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+		testKr->data.F32[iy][ix] = creal(fft[ix + iy*fftCols]);
+		testKi->data.F32[iy][ix] = cimag(fft[ix + iy*fftCols]);
+		testIr->data.F32[iy][ix] = creal(fft[ix + iy*fftCols + fftPixels]);
+		testIi->data.F32[iy][ix] = cimag(fft[ix + iy*fftCols + fftPixels]);
+#else
+		testKr->data.F32[iy][ix] = fft[ix + iy*fftCols][0];
+		testKi->data.F32[iy][ix] = fft[ix + iy*fftCols][1];
+		testIr->data.F32[iy][ix] = fft[ix + iy*fftCols + fftPixels][0];
+		testIi->data.F32[iy][ix] = fft[ix + iy*fftCols + fftPixels][1];
+#endif
+	    }
+        }
+	fprintf (stderr, "generated test images\n");
+	fprintf (stderr, "please inspect\n");
+        psFree(testKr);
+        psFree(testKi);
+        psFree(testIr);
+        psFree(testIi);
+    }
+#endif
+
     // Multiply the two transforms
     for (int i = 0, j = fftPixels; i < fftPixels; i++, j++) {
@@ -534,41 +565,14 @@
 }
 
-void psKernelFFTFree(psKernelFFT *kernel) {
-
-    if (!kernel->fft) return;
-    
-    bool threaded = psThreadPoolSize(); // Are we running threaded?
-
-    FFTW_LOCK;
-    fftwf_free(kernel->fft);
-    FFTW_UNLOCK;
-
-    return;
-}
-
-psKernelFFT *psKernelFFTAlloc(const psKernel *input) {
-
-    psKernelFFT *kernel = psAlloc(sizeof(psKernelFFT));
-    psMemSetDeallocator(kernel, (psFreeFunc)psKernelFFTFree);
-
-    kernel->fft = NULL;
-
-    kernel->xMin = input->xMin;
-    kernel->xMax = input->xMax;
-    kernel->yMin = input->yMin;
-    kernel->yMax = input->yMax;
-
-    kernel->paddedCols = 0;
-    kernel->paddedRows = 0;
-
-    return kernel;
-}
-
-// generate the FFTed kernel image appropriate to be used for convolution of the given input image
-psKernelFFT *psImageConvolveKernelInit(const psImage *in, const psKernel *kernel)
+psImage *psImageConvolveFFTwithWindow(psImage *out, const psImage *in, const psImage *mask, psImageMaskType maskVal, const psKernel *kernel)
 {
     PS_ASSERT_IMAGE_NON_NULL(in, NULL);
     PS_ASSERT_IMAGE_TYPE(in, PS_TYPE_F32, NULL);
     PS_ASSERT_KERNEL_NON_NULL(kernel, NULL);
+    if (mask) {
+        PS_ASSERT_IMAGE_NON_NULL(mask, NULL);
+        PS_ASSERT_IMAGE_TYPE(mask, PS_TYPE_IMAGE_MASK, NULL);
+        PS_ASSERT_IMAGES_SIZE_EQUAL(mask, in, NULL);
+    }
 
     bool threaded = psThreadPoolSize(); // Are we running threaded?
@@ -577,5 +581,5 @@
     int xMin = kernel->xMin, xMax = kernel->xMax, yMin = kernel->yMin, yMax = kernel->yMax; // Kernel sizes
 
-    // Need to pad the kernel (and later the input image) to protect from wrap-around effects
+    // Need to pad the input image to protect from wrap-around effects
     if (xMax - xMin > numCols || yMax - yMin > numRows) {
         // Cannot pad the image if the kernel is larger.
@@ -607,18 +611,43 @@
     int numPadded = paddedCols * paddedRows; // Number of pixels in padded image
 
-    // Create data array containing the padded kernel
-    FFTW_LOCK;
-    psF32 *data = fftwf_malloc(numPadded * PSELEMTYPE_SIZEOF(PS_TYPE_F32)); // Data for FFTW
-    FFTW_UNLOCK;
-
-    // Generate the padded kernel image.  We could generate the padded kernel image using
-    // memcpy, but by going pixel by pixel we can apply the normalisation that corrects for the
-    // FFT renormalisation.  By applying it to the kernel here, we save applying it to the
-    // output image(s).
-
-    // copy kernel into data array
+    // Create data array containing the padded image and padded kernel
+    FFTW_LOCK;
+    psF32 *data = fftwf_malloc(2 * numPadded * PSELEMTYPE_SIZEOF(PS_TYPE_F32)); // Data for FFTW
+    FFTW_UNLOCK;
     psF32 *dataPtr = data;              // Pointer into FFTW data
+    psF32 **imageData = in->data.F32;   // Pointer into image data
+
+    // Image part of data array
+    size_t goodBytes = numCols * PSELEMTYPE_SIZEOF(PS_TYPE_F32); // Number of bytes per image row
+    size_t padBytes = (paddedCols - numCols) * PSELEMTYPE_SIZEOF(PS_TYPE_F32); // Number of bytes to pad
+    for (int y = 0; y < numRows; y++, dataPtr += paddedCols, imageData++) {
+        memcpy(dataPtr, *imageData, goodBytes);
+        memset(dataPtr + numCols, 0, padBytes);
+    }
+    memset(dataPtr, 0, (paddedRows - numRows) * paddedCols * PSELEMTYPE_SIZEOF(PS_TYPE_F32));
+
+#if 0
+    {
+        // Use this for inspecting the result of copying the image
+        psImage *test = psImageAlloc(paddedCols, paddedRows, PS_TYPE_F32);
+        psFree(test->p_rawDataBuffer);
+        test->p_rawDataBuffer = data;
+        test->data.V[0] = test->p_rawDataBuffer;
+        for (int y = 1; y < paddedRows; y++) {
+            test->data.V[y] = (psPtr)((int8_t *)test->data.V[y - 1] +
+                                      paddedCols * PSELEMTYPE_SIZEOF(PS_TYPE_F32));
+        }
+        // View image here
+        test->p_rawDataBuffer = NULL;
+        psFree(test);
+    }
+#endif
+
+    // Kernel part of data array
+    dataPtr = data + numPadded;         // Reset to kernel image location
     float norm = 1.0 / (float)(paddedRows * paddedCols); // Normalisation to correct for FFT
-
+    // We could generate the padded kernel image using memcpy, but by going pixel by pixel we can apply the
+    // normalisation that corrects for the FFT renormalisation.  By applying it to the kernel here, we save
+    // applying it to the entire output image.
     int xNegMin = PS_MIN(-1, xMin), xNegMax = PS_MIN(-1, xMax); // Min and max for x when negative
     int xPosMin = PS_MAX(0, xMin), xPosMax = PS_MAX(0, xMax); // Min and max for x when positive
@@ -665,4 +694,281 @@
         psImage *test = psImageAlloc(paddedCols, paddedRows, PS_TYPE_F32);
         psFree(test->p_rawDataBuffer);
+        test->p_rawDataBuffer = &data[numPadded];
+        test->data.V[0] = test->p_rawDataBuffer;
+        for (int y = 1; y < paddedRows; y++) {
+            test->data.V[y] = (psPtr)((int8_t *)test->data.V[y - 1] +
+                                      paddedCols * PSELEMTYPE_SIZEOF(PS_TYPE_F32));
+        }
+        // View image here
+        test->p_rawDataBuffer = NULL;
+        psFree(test);
+    }
+#endif
+
+    // Mask bad pixels (which may be NANs), lest they infect everything
+    if (mask && maskVal) {
+        for (int y = 0; y < numRows; y++) {
+            for (int x = 0; x < numCols; x++) {
+                if (mask->data.PS_TYPE_IMAGE_MASK_DATA[y][x] & maskVal) {
+                    data[x + paddedCols * y] = 0;
+                }
+            }
+        }
+    }
+
+    // Do the forward FFT
+    // Note that the FFT images have different size from the input
+    FFTW_LOCK;
+    fftwf_complex *fft = fftwf_malloc(2 * (paddedCols/2 + 1) * paddedRows * sizeof(fftwf_complex)); // FFT
+    FFTW_UNLOCK;
+    int size[] = { paddedRows, paddedCols }; // Size of transforms
+    int fftCols = paddedCols/2 + 1, fftRows = paddedRows; // Size of FFT images
+    int fftPixels = fftCols * fftRows;  // Number of pixels in FFT image
+
+    FFTW_LOCK;
+    fftwf_plan forward = fftwf_plan_many_dft_r2c(2, size, 2, data, NULL, 1, paddedCols * paddedRows,
+                                                 fft, NULL, 1, fftPixels, FFTW_PLAN_RIGOR);
+    FFTW_UNLOCK;
+
+    fftwf_execute(forward);
+
+    FFTW_LOCK;
+    fftwf_destroy_plan(forward);
+    FFTW_UNLOCK;
+
+    // apply a window function:
+    psVector *xWindow = psVectorAlloc(fftCols, PS_TYPE_F32);
+    psVector *yWindow = psVectorAlloc(fftRows, PS_TYPE_F32);
+
+    // not sure what window function to use.  trying a simple Gaussian.
+    // In the x-direction, the window goes from ~1 @ x = 0 to something low at x = fftCols
+    // In the y direction, the window goes to a small value at y = fftRows / 2, and back up
+    for (int i = 0; i < fftCols; i++) {
+	float xNorm = 2.0 * i / (float) fftCols; // sigma = 0.5
+	xWindow->data.F32[i] = exp(-0.5*xNorm*xNorm);
+    }
+    for (int i = 0; i < fftRows / 2 + 1; i++) {
+	float xNorm = 2.0 * i / (float) (fftRows / 2.0); // sigma = 0.5
+	yWindow->data.F32[i] = exp(-0.5*xNorm*xNorm);
+	yWindow->data.F32[fftRows - 1 - i] = yWindow->data.F32[i];
+    }
+
+    for (int iy = 0; iy < fftRows; iy++) {
+	float yValue = yWindow->data.F32[iy];
+	for (int ix = 0; ix < fftCols; ix++) {
+	    float xValue = xWindow->data.F32[ix];
+#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+	    fft[ix + iy*fftCols + fftPixels] *= xValue * yValue; // kernel element (real + I*imaginary)
+#else
+	    fft[ix + iy*fftCols + fftPixels][0] *= xValue * yValue;
+	    fft[ix + iy*fftCols + fftPixels][1] *= xValue * yValue;
+#endif
+	}
+    }
+
+#if 0
+    {
+        // Use this for inspecting the result of the fft XXX : K & I are backwards...
+        psImage *testKr = psImageAlloc(fftCols, fftRows, PS_TYPE_F32);
+        psImage *testKi = psImageAlloc(fftCols, fftRows, PS_TYPE_F32);
+        psImage *testIr = psImageAlloc(fftCols, fftRows, PS_TYPE_F32);
+        psImage *testIi = psImageAlloc(fftCols, fftRows, PS_TYPE_F32);
+        for (int iy = 0; iy < fftRows; iy++) {
+	    for (int ix = 0; ix < fftCols; ix++) {
+#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+		testIr->data.F32[iy][ix] = creal(fft[ix + iy*fftCols]);
+		testIi->data.F32[iy][ix] = cimag(fft[ix + iy*fftCols]);
+		testKr->data.F32[iy][ix] = creal(fft[ix + iy*fftCols + fftPixels]);
+		testKi->data.F32[iy][ix] = cimag(fft[ix + iy*fftCols + fftPixels]);
+#else
+		testIr->data.F32[iy][ix] = fft[ix + iy*fftCols][0];
+		testIi->data.F32[iy][ix] = fft[ix + iy*fftCols][1];
+		testKr->data.F32[iy][ix] = fft[ix + iy*fftCols + fftPixels][0];
+		testKi->data.F32[iy][ix] = fft[ix + iy*fftCols + fftPixels][1];
+#endif
+	    }
+        }
+	fprintf (stderr, "generated test images\n");
+	fprintf (stderr, "please inspect\n");
+        psFree(testKr);
+        psFree(testKi);
+        psFree(testIr);
+        psFree(testIi);
+    }
+#endif
+
+    // Multiply the two transforms
+    for (int i = 0, j = fftPixels; i < fftPixels; i++, j++) {
+        // (a + bi) * (c + di) = (ac - bd) + (bc + ad)i
+#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+        // C99 complex support
+        fft[i] *= fft[j];
+#else
+        // FFTW's backup complex support
+        float imageReal = fft[i][0], imageImag = fft[i][1];
+        float kernelReal = fft[j][0], kernelImag = fft[j][1];
+        fft[i][0] = imageReal * kernelReal - imageImag * kernelImag;
+        fft[i][1] = imageImag * kernelReal + imageReal * kernelImag;
+#endif
+    }
+
+    // Do the backward FFT
+    FFTW_LOCK;
+    fftwf_plan backward = fftwf_plan_dft_c2r_2d(paddedRows, paddedCols, fft, data, FFTW_PLAN_RIGOR);
+    FFTW_UNLOCK;
+
+    fftwf_execute(backward);
+
+    FFTW_LOCK;
+    fftwf_destroy_plan(backward);
+    fftwf_free(fft);
+    FFTW_UNLOCK;
+
+    // Copy into the target, without the padding
+    out = psImageRecycle(out, numCols, numRows, PS_TYPE_F32);
+    psF32 **outData = out->data.F32;    // Pointer into output
+    dataPtr = data;                     // Reset to start
+    for (int y = 0; y < numRows; y++, outData++, dataPtr += paddedCols) {
+        memcpy(*outData, dataPtr, goodBytes);
+    }
+
+    FFTW_LOCK;
+    fftwf_free(data);
+    FFTW_UNLOCK;
+
+    return out;
+}
+
+void psKernelFFTFree(psKernelFFT *kernel) {
+
+    if (!kernel->fft) return;
+    
+    bool threaded = psThreadPoolSize(); // Are we running threaded?
+
+    FFTW_LOCK;
+    fftwf_free(kernel->fft);
+    FFTW_UNLOCK;
+
+    return;
+}
+
+psKernelFFT *psKernelFFTAlloc(const psKernel *input) {
+
+    psKernelFFT *kernel = psAlloc(sizeof(psKernelFFT));
+    psMemSetDeallocator(kernel, (psFreeFunc)psKernelFFTFree);
+
+    kernel->fft = NULL;
+
+    kernel->xMin = input->xMin;
+    kernel->xMax = input->xMax;
+    kernel->yMin = input->yMin;
+    kernel->yMax = input->yMax;
+
+    kernel->paddedCols = 0;
+    kernel->paddedRows = 0;
+
+    return kernel;
+}
+
+// generate the FFTed kernel image appropriate to be used for convolution of the given input image
+psKernelFFT *psImageConvolveKernelInit(const psImage *in, const psKernel *kernel)
+{
+    PS_ASSERT_IMAGE_NON_NULL(in, NULL);
+    PS_ASSERT_IMAGE_TYPE(in, PS_TYPE_F32, NULL);
+    PS_ASSERT_KERNEL_NON_NULL(kernel, NULL);
+
+    bool threaded = psThreadPoolSize(); // Are we running threaded?
+
+    int numCols = in->numCols, numRows = in->numRows; // Size of image
+    int xMin = kernel->xMin, xMax = kernel->xMax, yMin = kernel->yMin, yMax = kernel->yMax; // Kernel sizes
+
+    // Need to pad the kernel (and later the input image) to protect from wrap-around effects
+    if (xMax - xMin > numCols || yMax - yMin > numRows) {
+        // Cannot pad the image if the kernel is larger.
+        psError(PS_ERR_BAD_PARAMETER_SIZE, true,
+                _("Kernel cannot extend further than input image size (%dx%d vs %dx%d)."),
+                xMax, yMax, numCols, numRows);
+        return NULL;
+    }
+
+    int paddedCols = numCols + PS_MAX(-xMin, xMax); // Number of columns in padded image
+    int paddedRows = numRows + PS_MAX(-yMin, yMax); // Number of rows in padded image
+
+#if CONVOLVE_FFT_BINARY_SIZE
+    // Make the size an integer power of two
+    {
+        int twoCols, twoRows;           // Size that is a factor of two
+        for (twoCols = 1; twoCols <= paddedCols && twoCols < INT_MAX - 1; twoCols <<= 1); // No action
+        for (twoRows = 1; twoRows <= paddedRows && twoRows < INT_MAX - 1; twoRows <<= 1); // No action
+        if (paddedCols > twoCols || paddedRows > twoRows) {
+            psError(PS_ERR_BAD_PARAMETER_SIZE, true, "Unable to scale size (%dx%d) up to factor of two",
+                    paddedCols, paddedRows);
+            return NULL;
+        }
+        paddedCols = twoCols;
+        paddedRows = twoRows;
+    }
+#endif
+
+    int numPadded = paddedCols * paddedRows; // Number of pixels in padded image
+
+    // Create data array containing the padded kernel
+    FFTW_LOCK;
+    psF32 *data = fftwf_malloc(numPadded * PSELEMTYPE_SIZEOF(PS_TYPE_F32)); // Data for FFTW
+    FFTW_UNLOCK;
+
+    // Generate the padded kernel image.  We could generate the padded kernel image using
+    // memcpy, but by going pixel by pixel we can apply the normalisation that corrects for the
+    // FFT renormalisation.  By applying it to the kernel here, we save applying it to the
+    // output image(s).
+
+    // copy kernel into data array
+    psF32 *dataPtr = data;              // Pointer into FFTW data
+    float norm = 1.0 / (float)(paddedRows * paddedCols); // Normalisation to correct for FFT
+
+    int xNegMin = PS_MIN(-1, xMin), xNegMax = PS_MIN(-1, xMax); // Min and max for x when negative
+    int xPosMin = PS_MAX(0, xMin), xPosMax = PS_MAX(0, xMax); // Min and max for x when positive
+    int yNegMin = PS_MIN(-1, yMin), yNegMax = PS_MIN(-1, yMax); // Min and max for x when negative
+    int yPosMin = PS_MAX(0, yMin), yPosMax = PS_MAX(0, yMax); // Min and max for x when positive
+    int blankCols = xNegMin + paddedCols - xPosMax - 1; // Number of columns between kernel extrema
+    int blankRows = (yNegMin + paddedRows - yPosMax - 1) * paddedCols; // Rows between kernel extrema
+    size_t blankColBytes = blankCols * PSELEMTYPE_SIZEOF(PS_TYPE_F32); // Number of bytes in blankCols
+    for (int y = yPosMin; y <= yPosMax; y++) {
+        // y is positive
+        for (int x = xPosMin; x <= xPosMax; x++, dataPtr++) {
+            // x is positive
+            *dataPtr = kernel->kernel[y][x] * norm;
+        }
+        // Columns between kernel extrema
+        memset(dataPtr, 0, blankColBytes);
+        dataPtr += blankCols;
+        for (int x = xNegMin; x <= xNegMax; x++, dataPtr++) {
+            // x is negative
+            *dataPtr = kernel->kernel[y][x] * norm;
+        }
+    }
+    // Rows between kernel extrema
+    memset(dataPtr, 0, blankRows * PSELEMTYPE_SIZEOF(PS_TYPE_F32));
+    dataPtr += blankRows;
+    for (int y = yNegMin; y <= yNegMax; y++) {
+        // y is negative
+        for (int x = xPosMin; x <= xPosMax; x++, dataPtr++) {
+            // x is positive
+            *dataPtr = kernel->kernel[y][x] * norm;
+        }
+        // Columns between kernel extrema
+        memset(dataPtr, 0, blankColBytes);
+        dataPtr += blankCols;
+        for (int x = xNegMin; x <= xNegMax; x++, dataPtr++) {
+            // x is negative
+            *dataPtr = kernel->kernel[y][x] * norm;
+        }
+    }
+
+#if 0
+    {
+        // Use this for inspecting the result of copying the kernel
+        psImage *test = psImageAlloc(paddedCols, paddedRows, PS_TYPE_F32);
+        psFree(test->p_rawDataBuffer);
         test->p_rawDataBuffer = data;
         test->data.V[0] = test->p_rawDataBuffer;