Listing 3
/*
 * ScalarProduct - dot product of two float arrays using x86 SSE.
 *
 * a1, a2: arrays of at least n floats each. They are loaded with movups,
 *         so no particular alignment is required.
 * n:      number of elements; values <= 0 yield 0.
 *
 * Returns the sum of a1[i] * a2[i] for i in [0, n).
 *
 * For n >= 8 the bulk of the input is consumed eight floats per iteration
 * into four parallel partial sums which are added together at the end; the
 * remaining (n % 8) elements are handled by a scalar loop.  Because of the
 * lane-wise accumulation the rounding can differ from a strictly sequential
 * summation.
 *
 * Requires GCC-compatible extended asm and an SSE-capable x86 target.
 */
float ScalarProduct( float* a1, float* a2, int n )
{
    /* Four packed single-precision lanes (GCC/Clang vector extension). */
    typedef float v4sf __attribute__ ((vector_size(16)));

    float ans[4] __attribute__ ((aligned(16)));
    int i;

    if( n >= 8 )
    {
        /*
         * The accumulator is a C object handed to the asm as a "+x"
         * (SSE register) operand.  Register contents are not guaranteed
         * to survive between separate asm statements, so the running sum
         * must not live in a hard-coded %xmm register across them.
         */
        v4sf acc = { 0.0f, 0.0f, 0.0f, 0.0f };
        const int blocks = n >> 3;      /* number of 8-float chunks */

        for( i = 0; i < blocks; ++i )
        {
            __asm__ __volatile__(
                "movups (%1), %%xmm1\n\t"
                "movups 16(%1), %%xmm2\n\t"
                "movups (%2), %%xmm3\n\t"
                "movups 16(%2), %%xmm4\n\t"
                "add $32,%1\n\t"            /* advance both pointers by 8 floats */
                "add $32,%2\n\t"
                "mulps %%xmm3, %%xmm1\n\t"
                "mulps %%xmm4, %%xmm2\n\t"
                "addps %%xmm2, %%xmm1\n\t"
                "addps %%xmm1, %0"
                : /* outputs */ "+x" ( acc ), "+r" ( a1 ), "+r" ( a2 )
                : /* inputs */
                : /* clobbered */ "xmm1", "xmm2", "xmm3", "xmm4",
                  "memory",  /* the asm reads *a1 and *a2 behind the compiler's back */
                  "cc" );    /* add modifies the flags */
        }

        /* Spill the four partial sums; ans is 16-byte aligned for movaps. */
        __asm__(
            "movaps %1, %0"
            : /* outputs */ "=m" ( ans )
            : /* inputs */ "x" ( acc ) );

        /* a1/a2 already point past the vectorised part; keep only the tail count. */
        n -= blocks << 3;
        ans[0] += ans[1] + ans[2] + ans[3];
    }
    else
    {
        ans[0] = 0.0f;
    }

    /* Scalar tail (or the whole input when n < 8). */
    for( i = 0; i < n; ++i )
    {
        ans[0] += a1[i] * a2[i];
    }

    return ans[0];
}