/* Figure 1: What possible compiler optimizations might look like if done by hand */

#define N 1000

/*
 * Kernel with hoisting of the loop invariant done by hand.
 *
 * Stores the value (b[j+N/4] + b[j-N/4]) * 0.5f into a[0] .. a[n-1].
 * Because that value does not depend on the loop index, it is computed
 * once ahead of the loop rather than once per element.
 *
 *   a  destination; elements 0 .. n-1 are written
 *   b  source; only b[j+N/4] and b[j-N/4] are read
 *   n  number of elements to store; nothing happens when n <= 0
 *   j  centre index into b
 */
void 
with_hoisting( float * restrict a, float * restrict b, 
    int n, int j ) {

    float t0;
    int i;

    /*
     * A loop with no iterations never reads b, so the hoisted load must
     * not run either: with n <= 0 the caller may legitimately pass a b
     * (or a j) for which b[j+N/4] / b[j-N/4] is not a valid access.
     */
    if( n <= 0 )
        return;

    t0 = (b[j+N/4] + b[j-N/4]) * 0.5f;
    for( i=0; i<n; ++i ) 
        a[i] = t0;
}
     
/*
 * Hand-written software-pipelined form of the averaging kernel.
 *
 * Every element a[0] .. a[n-1] receives (b[j+N/4] + b[j-N/4]) * 0.5f.
 * The work for one element is split into three stages -- load, add,
 * scale -- and the stages of neighbouring iterations are overlapped:
 * each trip of the steady-state loop stores the result of one iteration
 * while scaling the next, adding the one after that, and loading for
 * the one after that.
 *
 * This is only one possible schedule; the best one depends on the
 * target machine.  Pipelining is a peculiar choice for this kernel,
 * because the b[..] operands are loop invariant and could simply be
 * hoisted, but at least one compiler produced something similar.
 */
void 
with_software_pipelining( float * restrict a, float * restrict b, 
    int n, int j ) {

    float hi, lo, sum, scaled;
    int out;

    if( n < 3 ) {
        /* Too few iterations to fill the pipeline: run the plain loop. */
        for( out=0; out<n; ++out ) 
            a[out] = (b[j+N/4] + b[j-N/4]) * 0.5f;
        return;
    }

    /* Prologue: fill the pipeline. */
    hi = b[j+(N/4)];            /* load,  iteration 0 */
    lo = b[j-(N/4)];

    sum = hi + lo;              /* add,   iteration 0 */
    hi = b[j+(N/4)];            /* load,  iteration 1 */
    lo = b[j-(N/4)];

    scaled = 0.5f * sum;        /* scale, iteration 0 */
    sum = hi + lo;              /* add,   iteration 1 */
    hi = b[j+(N/4)];            /* load,  iteration 2 */
    lo = b[j-(N/4)];

    /*
     * Steady state.  The statements in the body belong to four
     * different iterations, so they could all be issued in one step.
     */
    for( out=0; out<n-3; ++out ) {
        a[out] = scaled;        /* store, iteration out   */
        scaled = 0.5f * sum;    /* scale, iteration out+1 */
        sum = hi + lo;          /* add,   iteration out+2 */
        hi = b[j+(N/4)];        /* load,  iteration out+3 */
        lo = b[j-(N/4)];
    }

    /* Epilogue: drain the pipeline.  Here out == n-3. */
    a[out] = scaled;            /* store, iteration n-3 */
    scaled = 0.5f * sum;        /* scale, iteration n-2 */
    sum = hi + lo;              /* add,   iteration n-1 */

    a[out+1] = scaled;          /* store, iteration n-2 */
    scaled = 0.5f * sum;        /* scale, iteration n-1 */

    a[out+2] = scaled;          /* store, iteration n-1 */
}