Quantcast
Channel: Intel® Software - Intel® Many Integrated Core Architecture (Intel MIC Architecture)
Viewing all articles
Browse latest Browse all 1789

L2_DATA_WRITE_MISS_MEM_FILL during sequential memory access

$
0
0

Hey all,

I'm currently analyzing a fully vectorized and parallelized program with VTune. It is written with intrinsics and OpenMP pragmas. The VTune general exploration showed that the program has some problems while loading data into a register. I have attached a screenshot of my VTune analysis, which shows the relevant hardware counters (vt1.png). The program accesses two large float arrays (allocated with mmap and 2MB page size) in sequential order, then performs a lot of alignr and fmadd instructions and stores the result in non-sequential order into another array (the stores have less than 1% latency impact). The code is sketched below.


// Sketch of the hot loop. Per iteration it reads 6 vectors (6*16 floats) from A
// and 18 vectors (18*16 floats) from B at offsets that advance linearly with
// itr, combines them, and writes 6 result vectors to C at the slot selected by
// store_order[itr].
omp_set_num_threads(240);
#pragma omp parallel for schedule(static)
for ( std::size_t itr = 0; itr < MAX; ++itr ) {

        //MAX is large .. so each HW thread has far more than 512 itr 

        __m512 A0_, A1_, A2_, A3_, A4_, A5_;
        __m512 B0_, B1_, B2_, B3_, B4_, B5_, B6_, B7_, B8_;
        // B15_ was missing from the original declaration list although it is loaded below.
        __m512 B9_, B10_, B11_, B12_, B13_, B14_, B15_, B16_, B17_;

        //L2 A prefetch for next iteration
        _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 0*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 1*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 2*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 3*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 4*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&A[ 16*6 * (itr+1) + 5*16 ], _MM_HINT_T1 );

        //L1 A prefetch
        _mm_prefetch( (const char *)&A[ 16*6 * itr + 0*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&A[ 16*6 * itr + 1*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&A[ 16*6 * itr + 2*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&A[ 16*6 * itr + 3*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&A[ 16*6 * itr + 4*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&A[ 16*6 * itr + 5*16 ], _MM_HINT_T0 );

        // Loop-local so every OpenMP thread works on its own copy; an index
        // declared outside the parallel loop would be shared between threads.
        const std::size_t imem_A = 16*6 * itr;
        A0_ = _mm512_load_ps( A + imem_A + 0*16 );
        A1_ = _mm512_load_ps( A + imem_A + 1*16 );
        A2_ = _mm512_load_ps( A + imem_A + 2*16 );
        A3_ = _mm512_load_ps( A + imem_A + 3*16 );
        A4_ = _mm512_load_ps( A + imem_A + 4*16 );
        A5_ = _mm512_load_ps( A + imem_A + 5*16 );

        //L1 B prefetch (all 18 vectors that are loaded below; the original
        //sketch skipped the 11*16 line)
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 0*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 1*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 2*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 3*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 4*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 5*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 6*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 7*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 8*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 9*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 10*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 11*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 12*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 13*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 14*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 15*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 16*16 ], _MM_HINT_T0 );
        _mm_prefetch( (const char *)&B[ 16*18 * itr + 17*16 ], _MM_HINT_T0 );

        const std::size_t imem_B = 16*18 * itr;
        B0_  = _mm512_load_ps( B + imem_B + 0*16 );
        B1_  = _mm512_load_ps( B + imem_B + 1*16 );
        B2_  = _mm512_load_ps( B + imem_B + 2*16 );
        B3_  = _mm512_load_ps( B + imem_B + 3*16 );
        B4_  = _mm512_load_ps( B + imem_B + 4*16 );
        B5_  = _mm512_load_ps( B + imem_B + 5*16 );
        B6_  = _mm512_load_ps( B + imem_B + 6*16 );
        B7_  = _mm512_load_ps( B + imem_B + 7*16 );
        B8_  = _mm512_load_ps( B + imem_B + 8*16 );
        B9_  = _mm512_load_ps( B + imem_B + 9*16 );
        B10_ = _mm512_load_ps( B + imem_B + 10*16 );
        B11_ = _mm512_load_ps( B + imem_B + 11*16 );
        B12_ = _mm512_load_ps( B + imem_B + 12*16 );
        B13_ = _mm512_load_ps( B + imem_B + 13*16 );
        B14_ = _mm512_load_ps( B + imem_B + 14*16 );
        B15_ = _mm512_load_ps( B + imem_B + 15*16 );
        B16_ = _mm512_load_ps( B + imem_B + 16*16 );
        B17_ = _mm512_load_ps( B + imem_B + 17*16 );


        /////////////////////////////////////////////////////////////
        // do 24 _mm512_mask_alignr_epi32 involving all __m512 variables
        /////////////////////////////////////////////////////////////


        //L2 B prefetch for next iteration
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 0*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 1*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 2*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 3*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 4*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 5*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 6*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 7*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 8*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 9*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 10*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 11*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 12*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 13*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 14*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 15*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 16*16 ], _MM_HINT_T1 );
        _mm_prefetch( (const char *)&B[ 16*18 * (itr+1) + 17*16 ], _MM_HINT_T1 );


        ///////////////////////////////////////////////////////////////////////////////////////
        // do 36 _mm512_fnmadd_ps, _mm512_fmadd_ps, _mm512_fmsub_ps involving all __m512 variables
        //  results saved in __m512 C0_, C1_, C2_, C3_, C4_, C5_;
        ///////////////////////////////////////////////////////////////////////////////////////


        // Destination slot comes from store_order, so consecutive iterations
        // do not necessarily write to consecutive locations in C.
        const auto imem_store = 16*6 * store_order[ itr ];
        _mm512_storenrngo_ps( C + imem_store + 0*16, C0_ );
        _mm512_storenrngo_ps( C + imem_store + 1*16, C1_ );
        _mm512_storenrngo_ps( C + imem_store + 2*16, C2_ );
        _mm512_storenrngo_ps( C + imem_store + 3*16, C3_ );
        _mm512_storenrngo_ps( C + imem_store + 4*16, C4_ );
        _mm512_storenrngo_ps( C + imem_store + 5*16, C5_ );
}

I have also tried a lot of different L1 and L2 prefetch distances (2, 3, 4, ... loop iterations before the data is needed). My program reaches its "best" performance with the prefetches sketched above; it is 100% faster than without prefetching. The VTune analysis showed that loading data from array B (B = link in vt1.png) has a major latency impact due to L1 and L2 cache misses.

I really can't understand why the program has problems with loading the data into a register at all. It just performs a sequential memory access...

I have already read the tuning suggestions in the following article, but they did not help me.

http://software.intel.com/en-us/articles/optimization-and-performance-tu...

Where is my mistake? 

 

Hope you can help,

Patrick

AttachmentSize
Downloadvt1_0.png747.53 KB

Viewing all articles
Browse latest Browse all 1789

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>