Listing 1: compress.c — Byte pair encoding compression

/* compress.c -- Byte Pair Encoding compression */
/* Copyright 1996 Philip Gage */

#include <stdio.h>
#include <stdlib.h>

#define MAXSIZE 65535L  /* Input file buffer size */
#define HASHSIZE 8192   /* Hash table size, power of 2 */
#define THRESHOLD   3   /* Increase for speed, min 3 */

void compress (FILE *input, FILE *output)
{
  unsigned char *buffer, *left, *right, *count;
  unsigned char a, b, bestcount=0, pairtable[128][2];
  int i, j, index, bestindex, code=128;
  size_t size;

  /* Dynamically allocate buffers and check for errors */
  buffer = (unsigned char *)malloc(MAXSIZE);
  left = (unsigned char *)malloc(HASHSIZE);
  right = (unsigned char *)malloc(HASHSIZE);
  count = (unsigned char *)malloc(HASHSIZE);
  if (buffer==NULL || left==NULL ||
      right==NULL || count==NULL) {
    printf("Error allocating memory\n");
    exit(1);
  }

  /* Read input file into buffer and check for errors */
  size = fread(buffer,1,MAXSIZE,input);
  if (size == MAXSIZE) {
    printf("File too big\n");
    exit(1);
  }
  for (i=0; i<size; i++)
    if (buffer[i] > 127) {
      printf("This program works only on text files\n");
      exit(1);
    }

  do {  /* Replace frequent pairs with bytes 128..255 */

    /* Enter counts of all byte pairs into hash table */
    memset(count,0,HASHSIZE);
    for (i=0; i<size-1; i++) {
      a = buffer[i];
      b = buffer[i+1];
      index = (a ^ (b << 6)) & (HASHSIZE-1);
      while ((left[index] != a || right[index] != b) &&
             count[index] != 0)
        index = (index + 1) & (HASHSIZE-1);
      left[index] = a;
      right[index] = b;
      if (count[index] < 255)
        ++count[index];
    }

    /* Search hash table for most frequent pair */
    bestcount = THRESHOLD - 1;
    for (i=0; i<HASHSIZE; i++) {
      if (count[i] > bestcount) {
        bestcount = count[i];
        bestindex = i;
      }
    }

    /* Compress if enough occurrences of pair */
    if (bestcount >= THRESHOLD) {

      /* Add pair to table using code as index */
      a = pairtable[code-128][0] = left[bestindex];
      b = pairtable[code-128][1] = right[bestindex];

      /* Replace all pair occurrences with unused byte */
      for (i=0, j=0; i<size; i++, j++)
        if (a == buffer[i] && b == buffer[i+1]) {
          buffer[j] = code;
          ++i;
        }
        else
          buffer[j] = buffer[i];
      size = j;
    }
    else
      break;
  } while (++code < 255);

  /* Write pair count, pair table and compressed data */
  putc(code,output);
  fwrite(pairtable,2,code-128,output);
  fwrite(buffer,1,size,output);
  free(buffer); free(left); free(right); free(count);
}

int main (int argc, char **argv)
{
  FILE *in, *out;
  if (argc != 3)
    printf("Usage: compress inputfile outputfile\n");
  else if ((in=fopen(argv[1],"rb"))==NULL)
    printf("Error opening input %s\n",argv[1]);
  else if ((out=fopen(argv[2],"wb"))==NULL)
    printf("Error opening output %s\n",argv[2]);
  else {
    compress(in,out);
    fclose(out);
    fclose(in);
  }
  return 0;
}/* End of File */