Quantcast
Channel: Intel® Software - Intel® Many Integrated Core Architecture (Intel MIC Architecture)
Viewing all articles
Browse latest Browse all 1789

How to get peak performance in FFT

$
0
0

Hi,

I want to get peak performance using batch FFT. Below is the sample code that I have written. I compiled it on the host using the command icc -mkl fftcheck.cpp -o fftchecknew -L/opt/intel/mkl/lib/intel64. I have also set the environment variables listed below.

export  MIC_ENV_PREFIX=MIC

export  MIC_KMP_AFFINITY=scatter,granularity=fine

export  MIC_OMP_NUM_THREADS=240

export MIC_USE_2MB_BUFFERS=64K

./fftchecknew 512 1024 1024

output

1024 512 627.922913

Here the number of operations = 2.5 * M * N * log2(M*N) * numberOfTransforms, where M = 1024, N = 1024, and numberOfTransforms = 512.

So GFLOPS = operations / time = 26843545600 / (627.93 * 10^6) ≈ 42.7 GFLOPS, where 627.93 is the time per batch in milliseconds. I have tried all the optimizations suggested in http://software.intel.com/en-us/articles/tuning-the-intel-mkl-dft-functi....

But there they are using a different FFT function and getting 100+ GFLOPS. Is that the reason for the lower GFLOPS? Can someone please tell me how to improve performance?

#pragma offload_attribute(push, target(mic))

#include <mkl.h>

#include<immintrin.h>

#pragma offload_attribute(pop)

#include <sys/time.h>

#include <errno.h>

#include <limits.h>

#include <math.h>

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include <omp.h>

/* Number of DFTI descriptors kept in hand1; main loops over this many when
 * creating and freeing descriptors. */
#define NUM_THREADS 1

/* First data buffer handed to DftiComputeForward; main allocates it inside an
 * offload region. NOTE(review): with DFTI_REAL_REAL storage this is presumably
 * the real part of the batched input - confirm against the MKL docs. */
__declspec(target(mic)) float* sxdreal;

/* Second data buffer handed to DftiComputeForward, allocated alongside
 * sxdreal. NOTE(review): presumably the imaginary part - confirm. */
__declspec(target(mic)) float* sxdimag;

/* DFTI descriptor handles, one per NUM_THREADS slot; created, committed and
 * freed by main inside offload regions. */
__declspec(target(mic)) DFTI_DESCRIPTOR_HANDLE hand1[NUM_THREADS];

int main(int argc, char* argv[])

{

int transforms = atoi(argv[1]);

int rows = atoi(argv[2]);

int cols = atoi(argv[3]);

#pragma offload target(mic) in(transforms, rows,cols) nocopy(sxdreal,sxdimag,hand1)

    {

        sxdreal = (float*)_mm_malloc(sizeof(float)*transforms*rows*(cols),64);

        sxdimag = (float*)_mm_malloc(sizeof(float)*transforms*rows*(cols),64);

        MKL_LONG status;

        MKL_LONG N1[2]; N1[0] = rows; N1[1] = cols;

        for(int i=0; i<NUM_THREADS;i++)

        {

            status = DftiCreateDescriptor(&hand1[i], DFTI_SINGLE, DFTI_COMPLEX, 2,N1);

            status = DftiSetValue(hand1[i], DFTI_COMPLEX_STORAGE, DFTI_REAL_REAL);

            status = DftiSetValue(hand1[i], DFTI_NUMBER_OF_TRANSFORMS, transforms);

            status = DftiSetValue(hand1[i], DFTI_INPUT_DISTANCE, rows*(cols));

            if (0 != status) printf("failed\n");

            status = DftiCommitDescriptor(hand1[i]);

        }

    }

    #pragma offload target(mic)  nocopy(sxdreal,sxdimag,hand1)

        {

        MKL_LONG status;

        status = DftiComputeForward(hand1[0],sxdreal, sxdimag);

        }

    

    double time = clock();

    timeval m_timer;

    gettimeofday(&m_timer, NULL);

#pragma offload target(mic)  nocopy(sxdreal,sxdimag,hand1)

        {

            MKL_LONG status;

            int thread = 0;//omp_get_thread_num();

            for(int i=0; i< 10; i++)

            status = DftiComputeForward(hand1[thread],sxdreal, sxdimag);

            if (0 != status) printf("failed\n");

        }

        

    timeval now;

    gettimeofday(&now,NULL);

    double secs = now.tv_sec-m_timer.tv_sec;

    double usecs = now.tv_usec-m_timer.tv_usec;

    if (usecs<0)

    {

    secs--;

    usecs+=1000000;

    }

    float diff = secs*1000+usecs/1000+0.5;

    double remain = clock()-time;

    printf("%d %d %f\n",rows, transforms,diff/10);

    

#pragma offload target(mic) nocopy(hand1)

    {

        

        for(int i=0 ; i< NUM_THREADS; i++)

            DftiFreeDescriptor(&hand1[i]);

    }



}


Viewing all articles
Browse latest Browse all 1789

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>