Listing 1 The Boyer and Moore algorithm

/************************************************
 *    Fast String Search
 * Using a tuned Boyer and Moore algorithm.
 * This source code is written by Erick Otto. Algorithms from
 * Daniel Sunday and Andrew Hume are used in this code.
 * 1993 by Erick Otto.
 */

#include    <stdio.h>
#include    <fcntl.h>
#include    "freq.h"

/* Size of text buffer, BUFSIZ */
/* is optimum from stdio.h     */
#define TBUF (8*BUFSIZ)

#define MAXPAT 256

/* We use the following variables for the search */
/* pat    : The pattern                          */
/* dist   : A table which contains the offsets   */
/*          for every char in the pattern from   */
/*          the end of the pattern               */
/* lfc    : The least frequent char in the patt  */
/* lfcoff : The offset of lfc from end of patt   */
/* shift  : The leftward re-occurance of the     */
/*          last charater in positions from the  */
/*          end of the patt                      */

int patlen;
unsigned char pat[MAXPAT];
unsigned char dist[MAXPAT];
int lfc, lfcoff;
int shift;

main(argc, argv)
int argc;
char **argv;
{
   char buf[TBUF+MAXPAT];
   char match[MAXPAT];
   register int fd, nread;
   register char *beg, *lastnl, *end;
   char filename[80];
   
   /* must be at least one arguments */
   if (argc < 2 ) {
      fprintf(stderr,"\n\tMatch string expected\n");
      exit();
   }
   if (argc < 3 ) {
      fprintf(stderr,"\n\tFilename expected\n");
      exit();
   }
   
   strcpy(match, argv[1]);
   strcpy(filename, argv[2]);
   build(match, strlen(match));
   
   if ((fd = open(filename,O_RDONLY)) == -1) {
      fprintf(stderr,
         "Can't open file %s\n", filename);
      perror(filename);
      exit(1);
   }
   
   *buf = '\n'; /* sentinel for printing purposes */
   beg = buf+1;
   
   while((nread=read(fd,beg,(&buf[TBUF]-beg)))>0){
      end = beg+nread;
      lastnl = end;
      while(*--lastnl != '\n') ;
      
      /* lastnl points to the newline */
      matchpat(buf+1, lastnl-buf);
      
      memcpy(buf+1, lastnl, end-lastnl-1);
      beg = buf + 1 + (end-lastnl);
   }
   close(fd);
}

build(match, m)
unsigned char *match;
register m;
{
   register unsigned char *patend, *patptr;
   register unsigned char *d;
   register int i;
   register unsigned char *pos;
   unsigned char *lastchar;
   int lfcidx;
   
   if(m > MAXPAT) {
      printf("Length of the pattern too long!\n");
      exit(1);
   }
   
   /* initialize the pattern variables */
   patlen = m;
   memcpy(pat, match, patlen);
   
   /* build the dist table */
   d = dist;
   
   /* initialize all elements to the pattern length */
   for(i = 0; i < MAXPAT; i++)
      d[i] = patlen;
   
   /* set all characters in the pattern to their     */
   /* relative distances from the end of the pattern */
   
   patptr = pat;
   patend = patptr+patlen-1;
   
   while(patptr < patend) {
      d[*patptr] = patend-patptr;
      patptr++;
   }
   d[*patptr] = 0;
   
   /* find the least frequent character */
   /* that occurs in the pattern        */
   
   lfcidx = 0;
   for(i = 1; i < patlen; i++){
      if(freq[pat[i]] < freq[pat[lfcidx]])
         lfcidx = i;
   }
   
   /* set the lf char and offset for later use */
   lfc = pat[lfcidx];
   lfcoff = lfcidx - (patlen-1);
   
   /* Look backward and see if we can find an other   */
   /* occurance of the same character as the last one */
   lastchar = patptr;
   for(pos = lastchar-1; pos >= pat; pos--)
      if (*pos == *lastchar) break;
   
   /* record the occurance for later use */
   shift = lastchar - pos;
}

matchpat(text, n)
unsigned char *text;
int n;
{
   register unsigned char *e, *s;
   register unsigned char *dp;
   register int k;
   register int lfco, lfcc;
   register unsigned char *p, *q;
   register unsigned char *ep;
   register char *sp;
   register char *nl;
   register int t1;
   char save[MAXPAT];
   
   dp = dist;
   t1 = patlen-1;
   sp = save;
   s = text+t1;
   e = text+n;
   
   memcpy(sp, e, patlen);
   memset(e, pat[t1], patlen);
   lfco = lfcoff;
   lfcc = lfc;
   ep = pat + t1;
   
   while(s < e){
   
      /* fast forward through the text untill we find */
      /* the last char of the pattern (unrolled loop) */
      
      k = dp[*s];
      while(k){
         k = dp[*(s += k)];
         k = dp[*(s += k)];
         k = dp[*(s += k)];
      }
      
      /* If we hit the sentinel at the end of   */
      /* the buffer we are done with the buffer */
      if(s >= e)
         break;
      
      /* Neat little trick, actually makes things */
      /* faster. Do a pre check on the lfc,       */
      /* before starting the full comparison      */
      if(s[lfco] != lfcc)
         goto mismatch;
      
      /* normal forward string matching */
      for(p = pat, q = s-t1; p < ep; ){
         if(*q++ != *p++)
            goto mismatch;
      }
      
      /** A match has been found **/
      /*  at position q - patlen  */
      
      nl=q;
      
      /* look back for the closest newline */
      
      while(*--nl != '\n')
         ;
      while(*++nl != '\n') putchar(*nl);
      putchar('\n');
      
      /* we are now at q which is somewhere in the
       * text we reported a match by printing the
       * line, so any possible other matches will
       * NOT have to be searched for in this line
       * so forward to the end of line
       */
       q=nl;
      
   mismatch:
      /* use the shift that we calculated */
      s += shift;
   }
   memcpy(e, sp, patlen);
   return(0);
}

/* End of File */