#define N 1000
/*
 * Kernel with hoisting of invariant done by hand.
 *
 * Stores (b[j+N/4] + b[j-N/4]) * 0.5f into each of a[0] .. a[n-1].
 * The two loads from b do not depend on the loop index, and a and b are
 * restrict-qualified (the caller promises the stores through a cannot
 * change what b points at), so the value is computed once before the loop.
 *
 * a  destination; must have room for n floats when n > 0.
 * b  source; read only at indices j+N/4 and j-N/4, and only when n > 0.
 * n  number of elements to store; n <= 0 stores nothing and reads nothing.
 * j  centre index into b.
 */
void
with_hoisting( float * restrict a, float * restrict b,
               int n, int j ) {
    /*
     * Guard the hoisted loads with the loop's trip-count test.  The loop
     * being modelled never touches b when it runs zero times, so the
     * hand-hoisted form must not dereference b for an empty range either.
     */
    if( n <= 0 )
        return;

    float t0 = (b[j+N/4] + b[j-N/4]) * 0.5f;
    for( int i=0; i<n; ++i )
        a[i] = t0;
}
/*
 * Kernel with software pipelining done by hand.
 *
 * Stores (b[j+N/4] + b[j-N/4]) * 0.5f into each of a[0] .. a[n-1], with the
 * load, add, multiply and store of consecutive iterations overlapped.
 * The best pipelining schedule depends on the target machine; this is just
 * one possible arrangement.  Pipelining this particular loop is odd, because
 * the b[..] loads are invariant and could simply be hoisted, but at least
 * one compiler produced something similar.
 *
 * Each element passes through four stages:
 *   load (hi, lo) -> add (sum) -> multiply (half) -> store.
 */
void
with_software_pipelining( float * restrict a, float * restrict b,
                          int n, int j ) {
    float hi, lo, sum, half;
    int k;

    if( n < 3 ) {
        /* Not enough iterations to pipeline the loop: do it directly. */
        for( k=0; k<n; ++k )
            a[k] = (b[j+N/4] + b[j-N/4]) * 0.5f;
        return;
    }

    /* Prologue: fill the pipeline. */
    hi = b[j+(N/4)];            /* load,     iteration 0 */
    lo = b[j-(N/4)];            /* load,     iteration 0 */
    sum = hi + lo;              /* add,      iteration 0 */
    hi = b[j+(N/4)];            /* load,     iteration 1 */
    lo = b[j-(N/4)];            /* load,     iteration 1 */
    half = 0.5f * sum;          /* multiply, iteration 0 */
    sum = hi + lo;              /* add,      iteration 1 */
    hi = b[j+(N/4)];            /* load,     iteration 2 */
    lo = b[j-(N/4)];            /* load,     iteration 2 */

    /*
     * Steady state, indexed by the element being stored.  The five
     * statements are independent of each other within one pass and
     * could be evaluated in a single step.
     */
    for( k=0; k+3<n; ++k ) {
        a[k] = half;            /* store,    iteration k   */
        half = 0.5f * sum;      /* multiply, iteration k+1 */
        sum = hi + lo;          /* add,      iteration k+2 */
        hi = b[j+(N/4)];        /* load,     iteration k+3 */
        lo = b[j-(N/4)];        /* load,     iteration k+3 */
    }

    /* Epilogue: drain the three iterations still in flight. */
    a[n-3] = half;              /* store,    iteration n-3 */
    half = 0.5f * sum;          /* multiply, iteration n-2 */
    sum = hi + lo;              /* add,      iteration n-1 */
    a[n-2] = half;              /* store,    iteration n-2 */
    half = 0.5f * sum;          /* multiply, iteration n-1 */
    a[n-1] = half;              /* store,    iteration n-1 */
}