Hey all,
I'm currently analyzing a fully vectorized and parallelized program with VTune. It is written with intrinsics and OpenMP pragmas. The VTune General Exploration analysis showed that the program has problems loading data into registers. I have attached a screenshot of my VTune analysis showing the relevant hardware counters (vt1.png). The program reads two large float arrays (allocated with mmap and 2MB page size) in sequential order, then performs many alignr and fmadd instructions, and stores the results in non-sequential order into a third array (the stores account for less than 1% of the latency impact). The code is sketched below.
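For completeness, this is roughly how the arrays are allocated; it is only a sketch (the real code differs in details such as error handling and NUMA placement), the point is the MAP_HUGETLB mapping for the 2MB pages:

// Minimal allocation sketch, assuming Linux with preallocated 2MB huge pages.
#include <sys/mman.h>
#include <cstddef>

static float * alloc_huge( std::size_t n_floats )
{
    const std::size_t page  = 2UL * 1024 * 1024;                       // 2MB page size
    const std::size_t bytes = ( n_floats * sizeof(float) + page - 1 ) / page * page;
    void * p = mmap( nullptr, bytes, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0 );
    return ( p == MAP_FAILED ) ? nullptr : static_cast<float *>( p );
}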
omp_set_num_threads(240);

#pragma omp parallel for schedule(static)
for ( std::size_t itr = 0; itr < MAX; ++itr )   // MAX is large, so each HW thread gets far more than 512 iterations
{
    __m512 A0_, A1_, A2_, A3_, A4_, A5_;
    __m512 B0_, B1_, B2_, B3_, B4_, B5_, B6_, B7_, B8_;
    __m512 B9_, B10_, B11_, B12_, B13_, B14_, B15_, B16_, B17_;

    // L2 A prefetch for the next iteration
    _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 0*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 1*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 2*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 3*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 4*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 5*16 ], _MM_HINT_T1 );

    // L1 A prefetch
    _mm_prefetch( (const char *)&A[ 16*6 * itr + 0*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&A[ 16*6 * itr + 1*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&A[ 16*6 * itr + 2*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&A[ 16*6 * itr + 3*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&A[ 16*6 * itr + 4*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&A[ 16*6 * itr + 5*16 ], _MM_HINT_T0 );

    const std::size_t imem_A = 16*6 * itr;
    A0_ = _mm512_load_ps( A + imem_A + 0*16 );
    A1_ = _mm512_load_ps( A + imem_A + 1*16 );
    A2_ = _mm512_load_ps( A + imem_A + 2*16 );
    A3_ = _mm512_load_ps( A + imem_A + 3*16 );
    A4_ = _mm512_load_ps( A + imem_A + 4*16 );
    A5_ = _mm512_load_ps( A + imem_A + 5*16 );

    // L1 B prefetch
    _mm_prefetch( (const char *)&B[ 16*18 * itr +  0*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr +  1*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr +  2*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr +  3*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr +  4*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr +  5*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr +  6*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr +  7*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr +  8*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr +  9*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr + 10*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr + 11*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr + 12*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr + 13*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr + 14*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr + 15*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr + 16*16 ], _MM_HINT_T0 );
    _mm_prefetch( (const char *)&B[ 16*18 * itr + 17*16 ], _MM_HINT_T0 );

    const std::size_t imem_B = 16*18 * itr;
    B0_  = _mm512_load_ps( B + imem_B +  0*16 );
    B1_  = _mm512_load_ps( B + imem_B +  1*16 );
    B2_  = _mm512_load_ps( B + imem_B +  2*16 );
    B3_  = _mm512_load_ps( B + imem_B +  3*16 );
    B4_  = _mm512_load_ps( B + imem_B +  4*16 );
    B5_  = _mm512_load_ps( B + imem_B +  5*16 );
    B6_  = _mm512_load_ps( B + imem_B +  6*16 );
    B7_  = _mm512_load_ps( B + imem_B +  7*16 );
    B8_  = _mm512_load_ps( B + imem_B +  8*16 );
    B9_  = _mm512_load_ps( B + imem_B +  9*16 );
    B10_ = _mm512_load_ps( B + imem_B + 10*16 );
    B11_ = _mm512_load_ps( B + imem_B + 11*16 );
    B12_ = _mm512_load_ps( B + imem_B + 12*16 );
    B13_ = _mm512_load_ps( B + imem_B + 13*16 );
    B14_ = _mm512_load_ps( B + imem_B + 14*16 );
    B15_ = _mm512_load_ps( B + imem_B + 15*16 );
    B16_ = _mm512_load_ps( B + imem_B + 16*16 );
    B17_ = _mm512_load_ps( B + imem_B + 17*16 );

    /////////////////////////////////////////////////////////////
    // do 24 _mm512_mask_alignr_epi32 involving all __m512 variables
    /////////////////////////////////////////////////////////////

    // L2 B prefetch for the next iteration
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) +  0*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) +  1*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) +  2*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) +  3*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) +  4*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) +  5*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) +  6*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) +  7*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) +  8*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) +  9*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 10*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 11*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 12*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 13*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 14*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 15*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 16*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 17*16 ], _MM_HINT_T1 );

    ///////////////////////////////////////////////////////////////////////////////////////
    // do 36 _mm512_fnmadd_ps, _mm512_fmadd_ps, _mm512_fmsub_ps involving all __m512 variables
    // results saved in __m512 C0_, C1_, C2_, C3_, C4_, C5_;
    ///////////////////////////////////////////////////////////////////////////////////////

    const std::size_t imem_store = 16*6 * store_order[ itr ];
    _mm512_storenrngo_ps( C + imem_store + 0*16, C0_ );
    _mm512_storenrngo_ps( C + imem_store + 1*16, C1_ );
    _mm512_storenrngo_ps( C + imem_store + 2*16, C2_ );
    _mm512_storenrngo_ps( C + imem_store + 3*16, C3_ );
    _mm512_storenrngo_ps( C + imem_store + 4*16, C4_ );
    _mm512_storenrngo_ps( C + imem_store + 5*16, C5_ );
}
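To put numbers on the loads: one __m512 of 16 floats is 64 bytes, i.e. exactly one cache line, so each iteration reads 6 lines of A, 18 lines of B and writes 6 lines of C, and B is by far the biggest stream. Just to make that arithmetic explicit:

// Per-iteration footprint (my own back-of-the-envelope numbers, derived from
// the loads/stores in the loop above, assuming 64-byte cache lines).
enum : std::size_t {
    LINE_BYTES  = 64,
    A_BYTES_ITR = 6  * LINE_BYTES,   //  384 B read from A
    B_BYTES_ITR = 18 * LINE_BYTES,   // 1152 B read from B
    C_BYTES_ITR = 6  * LINE_BYTES,   //  384 B written to C
    TOTAL_ITR   = A_BYTES_ITR + B_BYTES_ITR + C_BYTES_ITR   // 1920 B per iteration
};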
I have also tried many different L1 and L2 prefetch distances (2, 3, 4... loop iterations before the data is needed). The program reaches its "best" performance with the prefetches sketched above; with them it runs about 100% faster than without prefetching. The VTune analysis showed that loading data from array B (the load of B is the line marked in vt1.png) has a major latency impact due to L1 and L2 cache misses.
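In case it matters, a sketch of what such a distance experiment looks like for the B stream; PF_L2_DIST and PF_L1_DIST are just illustrative names I'm using here (the code above corresponds to PF_L2_DIST = 1, PF_L1_DIST = 0):

// Hypothetical prefetch-distance knobs for experimenting, placed inside the loop.
const std::size_t PF_L2_DIST = 4;   // iterations ahead for the L2 prefetch
const std::size_t PF_L1_DIST = 1;   // iterations ahead for the L1 prefetch

for ( int line = 0; line < 18; ++line )
{
    _mm_prefetch( (const char *)&B[ 16*18 * (itr + PF_L2_DIST) + line*16 ], _MM_HINT_T1 );
    _mm_prefetch( (const char *)&B[ 16*18 * (itr + PF_L1_DIST) + line*16 ], _MM_HINT_T0 );
}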
I really can't understand why the program has problems loading the data into registers at all. It only performs sequential memory accesses...
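For what it's worth, the sequential traffic can be translated into an aggregate bandwidth demand like this (a sketch; MAX and the measured wall-clock time come from the actual run, and the 1920 B per iteration are the footprint numbers from above):

// Rough aggregate-bandwidth estimate over all threads.
double loop_bandwidth_GBps( std::size_t max_itr, double seconds )
{
    return ( 1920.0 * max_itr ) / seconds / 1e9;
}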
I have already read the tuning suggestions in the article below, but they didn't help me:
http://software.intel.com/en-us/articles/optimization-and-performance-tu...
Where is my mistake?
Hope you can help,
Patrick