(a)
; Vectorized single-precision add, 4 floats per pass. eax is a byte offset
; into _x and _y. Both inputs are read 4 bytes (one float) past the offset
; the result is written to, i.e. x[k] = x[k+1] + y[k+1] for each element k.
        xor     eax, eax                ; eax = byte offset, starts at 0
L:      movups  xmm0, _x[eax+4]         ; xmm0 = 4 SP elements of x (unaligned load, +4 bytes)
        movups  xmm1, _y[eax+4]         ; xmm1 = 4 SP elements of y (unaligned load, +4 bytes)
        addps   xmm0, xmm1              ; xmm0 = 4 element-wise sums
        movaps  _x[eax], xmm0           ; store the sums from xmm0 into x (movaps needs a 16-byte aligned address)
        add     eax, 16                 ; advance by 16 bytes = 4 SP elements
        cmp     eax, 336
        jl      L                       ; loop while eax < 336: 336/16 = 21 iterations
(b)
/* Two versions of the same loop printed side by side.
 * Left column:  original statement order (S10, then S11).
 * Right column: the same two statements exchanged (S11, then S10);
 *               the "<->" marker denotes that exchange.
 * S10 reads b[i-1] while S11 writes b[i], so within one iteration the two
 * statements touch different elements of b; the value S10 reads is the one
 * S11 wrote in the previous iteration. */
for (i = 1; i < N; i++) { for (i = 1; i < N; i++) {
S10: a[i] = b[i-1]; S11: b[i] = c[i] * 2.0;
S11: b[i] = c[i] * 2.0; <-> S10: a[i] = b[i-1];
} }
(c)
short u[N], v[N]; int w;
...
for (w = 0, i = 0; i < N; i++) {
S12: w = w + u[i] * v[i];
}
(d)
; Dot-product loop over 16-bit elements of _u and _v. eax is a byte offset;
; xmm0 holds four 32-bit partial sums. Combining those four partial sums
; into one scalar is not part of this listing.
        xor     eax, eax                ; reset loop counter (byte offset into _u and _v)
        pxor    xmm0, xmm0              ; initialize accumulator xmm0 to | 0 0 0 0 |
L:      movdqa  xmm1, _u[eax]           ; load 8 shorts from u into xmm1
        pmaddwd xmm1, _v[eax]           ; multiply with 8 shorts from v and add adjacent pairs of products
        paddd   xmm0, xmm1              ; accumulate the 4 resulting integers into the 4 partial sums in xmm0
        add     eax, 16                 ; advance by 16 bytes = 8 shorts
        cmp     eax, 2*N                ; 2*N = byte length of N 16-bit elements
        jl      L                       ; looping logic (iterates N/8 times when N is a multiple of 8)
Example 3: More auto-vectorization.
Back to Article