(a)
     ; Vector loop: each iteration adds 4 SP elements of x and y, read one
     ; element (4 bytes) ahead of the position in x where the sums are stored.
     xor     eax, eax          ; reset loop counter (byte offset into x and y)
 L:  movups  xmm0, _x[eax+4]   ; load 4 SP elements from x into xmm0 register (unaligned load)
     movups  xmm1, _y[eax+4]   ; load 4 SP elements from y into xmm1 register (unaligned load)
     addps   xmm0, xmm1        ; add 4 SP elements; the sums are left in xmm0
     movaps  _x[eax], xmm0     ; store the 4 sums from xmm0 register into x (aligned store)
     add     eax, 16           ; advance by 16 bytes = 4 SP elements
     cmp     eax, 336
     jl      L                 ; looping logic (iterates 21 times: eax = 0, 16, ..., 320)

(b)
     for (i = 1; i < N; i++) {              for (i = 1; i < N; i++) {   
S10:    a[i] = b[i-1];                S11:    b[i] = c[i] * 2.0;        
S11:    b[i] = c[i] * 2.0;     <->    S10:    a[i] = b[i-1];    
     }                                     }

(c) 
     short u[N], v[N]; int w;
     ...
     for (w = 0, i = 0; i < N; i++) {
S12:     w = w + u[i] * v[i];
    }

(d)
; Vectorized multiply-accumulate loop: processes 8 shorts of u and v per
; iteration and keeps 4 integer partial sums in xmm0. Combining the 4 partial
; sums into a single result is not part of this listing.
        xor       eax, eax         ; reset loop counter (byte offset into u and v)
        pxor      xmm0, xmm0       ; initialize accumulator xmm0 to | 0  0  0  0 |

L:      movdqa    xmm1, _u[eax]    ; load 8 shorts from u into xmm1
        pmaddwd   xmm1, _v[eax]    ;  and multiply/add with 8 shorts from v
        paddd     xmm0, xmm1       ;   and accumulate the 4 resulting integers into
                                   ;    the 4 partial sums in accumulator xmm0
        add       eax, 16          ; advance by 16 bytes = 8 shorts
        cmp       eax, 2*N         ; 2*N = total bytes to process (2 bytes per short)
        jl        L                ; looping logic (iterates N/8 times)

Example 3: More auto-vectorization.

Back to Article