Listing 4
template <class T>
class AlignedArray
{
public:
AlignedArray( unsigned int numElements )
: _mNumElements( ( int )numElements )
{
_mData = AllocMem( numElements );
}
AlignedArray( const AlignedArray<T>& array )
: _mNumElements( array.NumElements() )
_mData = AllocMem( NumElements() );
*this = array;
}
virtual ~AlignedArray() { if( _mData ) delete _mData; }
int NumElements() const { return( _mNumElements ); }
AlignedArray<T>& operator=( const AlignedArray<T>& array )
{
int n = this->NumElements() <? array.NumElements();
for( int i = 0; i < n; ++i )
( *this )[i] = array[i];
return( *this );
}
T operator[]( int idx ) const
{
assert( ( idx >= 0 ) && ( idx < _mNumElements ) );
return( _mData[idx] );
}
T& operator[]( int idx )
{
assert( ( idx >= 0 ) && ( idx < _mNumElements ) );
return( _mData[idx] );
}
T ScalarProduct( const AlignedArray<T>& array )
{
int n = this->NumElements() <? array.NumElements();
T ans = 0.0;
for( int i = 0; i < n; ++i )
ans += ( *this )[i] * array[i];
return( ans );
}
// ...other operators/functions not shown
protected:
operator T*() const { return( _mData ); }
private:
T* AllocMem( int numElements )
{
char *ptr = new char[numElements * sizeof( T ) + 16];
int offset = ( int )ptr & 0xf;
if( offset )
ptr = ( char* )( ( unsigned int )ptr + 16 - offset );
return( ( T* )ptr );
}
int _mNumElements;
T* _mData;
};
// SSE 'packed single' implementation
template <> float
AlignedArray<float>::ScalarProduct( const AlignedArray<float>& array )
{
float *data1 = *this, *data2 = array;
float ans[4] __attribute__ ((aligned(16)));
int n = this->NumElements() <? array.NumElements();
register int i;
if( n >= 8 )
{
__asm__ __volatile__(
"xorps %%xmm0, %%xmm0"
: /* outputs */
: /* inputs */
: /* clobbered */ "xmm0" );
for( i = 0; i < ( n >> 3 ); ++i )
{
__asm__ __volatile__(
"movaps (%0), %%xmm1\n\t"
"movaps 16(%0), %%xmm2\n\t"
"mulps (%1), %%xmm1\n\t"
"mulps 16(%1), %%xmm2\n\t"
"add $32,%0\n\t"
"add $32,%1\n\t"
"addps %%xmm2, %%xmm1\n\t"
"addps %%xmm1, %%xmm0"
: /* outputs */ "+r" ( data1 ), "+r" ( data2 )
: /* inputs */
: /* clobbered */ "xmm0", "xmm1", "xmm2" );
}
__asm__ __volatile__(
"movaps %%xmm0, %0"
: /* outputs */ "=m" ( ans )
: /* inputs */
: /* clobbered */ "xmm0", "memory" );
n -= i << 3;
ans[0] += ans[1] + ans[2] + ans[3];
}
else
ans[0] = 0.0;
for( i = 0; i < n; ++i )
ans[0] += data1[i] * data2[i];
return( ans[0] );
}
// SSE2 'packed double' implementation
template <> double
AlignedArray<double>::ScalarProduct( const AlignedArray<double>& array )
{
double *data1 = *this, *data2 = array;
double ans[2] __attribute__ ((aligned(16)));
int n = this->NumElements() <? array.NumElements();
register int i;
if( n >= 4 )
{
__asm__ __volatile__(
"xorpd %%xmm0, %%xmm0"
: /* outputs */
: /* inputs */
: /* clobbered */ "xmm0" );
for( i = 0; i < ( n >> 2 ); ++i )
{
__asm__ __volatile__(
"movapd (%0), %%xmm1\n\t"
"movapd 16(%0), %%xmm2\n\t"
"mulpd (%1), %%xmm1\n\t"
"mulpd 16(%1), %%xmm2\n\t"
"add $32,%0\n\t"
"add $32,%1\n\t"
"addpd %%xmm2, %%xmm1\n\t"
"addpd %%xmm1, %%xmm0"
: /* outputs */ "+r" ( data1 ), "+r" ( data2 )
: /* inputs */
: /* clobbered */ "xmm0", "xmm1", "xmm2" );
}
__asm__ __volatile__(
"movapd %%xmm0, %0"
: /* outputs */ "=m" ( ans )
: /* inputs */
: /* clobbered */ "xmm0", "memory" );
n -= i << 2;
ans[0] += ans[1];
}
else
ans[0] = 0.0;
for( i = 0; i < n; ++i )
ans[0] += data1[i] * data2[i];
return( ans[0] );
}