Hi,
I want to get peak performance from a batched FFT. The sample code I have written is given below. I compiled it on the host with the command icc -mkl fftcheck.cpp -o fftchecknew -L/opt/intel/mkl/lib/intel64, and I set the environment variables listed below.
export MIC_ENV_PREFIX=MIC
export MIC_KMP_AFFINITY=scatter,granularity=fine
export MIC_OMP_NUM_THREADS=240
export MIC_USE_2MB_BUFFERS=64K.
./fftchecknew 512 1024 1024.
output
1024 512 627.922913
Here the number of operations is $2.5 \cdot M \cdot N \cdot \log_2(M \cdot N) \cdot \text{numberOfTransforms}$, with M = 1024, N = 1024 and numberOfTransforms = 512, i.e. 26843545600 operations.
The reported time is 627.93 ms per batch, so GFLOPS = operations / time = 26843545600 / (627.93 × 10^6) ≈ 42.7 GFLOPS. I have tried all the optimizations suggested in http://software.intel.com/en-us/articles/tuning-the-intel-mkl-dft-functi....
but there they are using a different FFT function and getting 100+ GFLOPS. Is that the reason I am getting fewer GFLOPS? Can someone please tell me how to improve performance?
#pragma offload_attribute(push, target(mic))
#include <mkl.h>
#include<immintrin.h>
#pragma offload_attribute(pop)
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <omp.h>
/* Number of DFTI descriptors held in hand1 (main only ever computes with hand1[0]). */
#define NUM_THREADS 1
/*
 * Split-complex data buffers: sxdreal holds the real parts and sxdimag the
 * imaginary parts. Both are declared for the coprocessor target and are
 * allocated inside an offload region in main, never on the host.
 */
__declspec(target(mic)) float* sxdreal;
__declspec(target(mic)) float* sxdimag;
/* FFT descriptors, created and committed inside an offload region in main. */
__declspec(target(mic)) DFTI_DESCRIPTOR_HANDLE hand1[NUM_THREADS];
int main(int argc, char* argv[])
{
int transforms = atoi(argv[1]);
int rows = atoi(argv[2]);
int cols = atoi(argv[3]);
#pragma offload target(mic) in(transforms, rows,cols) nocopy(sxdreal,sxdimag,hand1)
{
sxdreal = (float*)_mm_malloc(sizeof(float)*transforms*rows*(cols),64);
sxdimag = (float*)_mm_malloc(sizeof(float)*transforms*rows*(cols),64);
MKL_LONG status;
MKL_LONG N1[2]; N1[0] = rows; N1[1] = cols;
for(int i=0; i<NUM_THREADS;i++)
{
status = DftiCreateDescriptor(&hand1[i], DFTI_SINGLE, DFTI_COMPLEX, 2,N1);
status = DftiSetValue(hand1[i], DFTI_COMPLEX_STORAGE, DFTI_REAL_REAL);
status = DftiSetValue(hand1[i], DFTI_NUMBER_OF_TRANSFORMS, transforms);
status = DftiSetValue(hand1[i], DFTI_INPUT_DISTANCE, rows*(cols));
if (0 != status) printf("failed\n");
status = DftiCommitDescriptor(hand1[i]);
}
}
#pragma offload target(mic) nocopy(sxdreal,sxdimag,hand1)
{
MKL_LONG status;
status = DftiComputeForward(hand1[0],sxdreal, sxdimag);
}
double time = clock();
timeval m_timer;
gettimeofday(&m_timer, NULL);
#pragma offload target(mic) nocopy(sxdreal,sxdimag,hand1)
{
MKL_LONG status;
int thread = 0;//omp_get_thread_num();
for(int i=0; i< 10; i++)
status = DftiComputeForward(hand1[thread],sxdreal, sxdimag);
if (0 != status) printf("failed\n");
}
timeval now;
gettimeofday(&now,NULL);
double secs = now.tv_sec-m_timer.tv_sec;
double usecs = now.tv_usec-m_timer.tv_usec;
if (usecs<0)
{
secs--;
usecs+=1000000;
}
float diff = secs*1000+usecs/1000+0.5;
double remain = clock()-time;
printf("%d %d %f\n",rows, transforms,diff/10);
#pragma offload target(mic) nocopy(hand1)
{
for(int i=0 ; i< NUM_THREADS; i++)
DftiFreeDescriptor(&hand1[i]);
}
}