Listing 2 zone.c

***************************************************************
 * file: ZONE.C
 * purpose: text region detection via cellular automata
 *************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include "zone.h"

/* local data */
static ZONE zone_list[MAX_ZONES];  /* this could be dynamically allocated */
static int zone_count;

/* local prototypes */
static unsigned char *sample_page(int *dx,int *dy,int *samplex,int *sampley);
static void cut_vertical_lines(unsigned char *image,int dx,int dy);
static void block_zones(unsigned char *image,int dx,int/, dy,int coarseness);
static void sequence_zones(unsigned char *image,int dx,int dy,int order);
static int extract_zone(unsigned char *image,int dx,int dy,int x,int y,ZONE
*zone_ptr);
static void overlap_zones(ZONE *zone_array,int *array_size);

/***********************************************
 * function: int zone(int coarseness)
 *  The process steps are:
 *      1) Sample the page
 *      2) Cut vertical lines from the page
 *      3) Block out zones via cellular automata
 *      4) Extract the zones
 *      5) Sequence zones
 * parameters: coarseness value 0-5, order (COLUMN or ROW)
 * returns: 1 if OK or 0 if error, see errno
 *********************************************/
int zone(int coarseness,int order)
   {
   unsigned char *image;
   int dx,dy;
   int samplex,sampley;
   int i;

   if (coarseness < 0  coarseness > 5)
      {               /* test coarseness parameter */
      errno = EINVAL;
      return 0;
      }
                    /* get a scaled copy of the page */
   image = sample_page(&dx,&dy,&samplex,&sampley);
   if (image == NULL)                      /* memory? */
      return 0;
#if CUT_VERTICAL_LINES
   cut_vertical_lines(image,dx,dy);        /* remove boxes around text */
#endif
   block_zones(image,dx,dy,coarseness);    /* block out zones */
   sequence_zones(image,dx,dy,order);
   for (i = 0 ; i < zone_count ; i++)
      {                                   /* translate to full page */
      zone_list[i].x1 *= samplex;
      zone_list[i].y1 *= sampley;
      zone_list[i].x2 *= samplex;
      zone_list[i].y2 *= sampley;
      }
   free(image);                            /* clean up */
   return 1;
   }

/**************** LOCAL FUNCTIONS ****************/

/************************************************
 * function: static unsigned char *sample_page(int *dx,int *dy,int *samplex,int
 * sampley)
 * Sample the page. Normally, the entire page is stored in memory. Since the
 * memory requirements are typically a megabyte, the page, in DOS machines,
 * is somewhere in extended memory. So that this demo can work on machines
 * lacking extended memory, I sampled the page when I scaled it for display.
 * See display_sample() below. However, I have #ifed around the code that is
 * normally used to sample the page from the memory image. You need to provide
 * two functions, as well as the extended memory handler. They are void
 * memory_info(DISPLAY_BOX *); which gets the x/y extents of the image, and
 * (unsigned char * memory_ptr(int y); which returns a pointer to a scan line
 * in the memory image. Sample_page() creates a standardized, reduced image
 * suitable for cellular automation. The sample has each byte, not bit,
 * representing a pixel area. The byte is 0xff if black or 0x00 if white. The
 * sampling procedure is important for region finding. If possible it should be
 * a function of the density of the original image. If the image isn't square,
 * for example a 200x100 fax file, then the x and y sampling should be adjusted
 * accordingly. Since I don'thave dpi information here, I am scaling it to a
 * typical, 200dpi, value after it was adjusted for screen display.
 * Note: Bit 7 is leftmost and 0 is rightmost; 1 bits are black, 0 are white.
 * parameters: pointers to storage for the byte width and height of the sampled
 *              image and the sample bit distance in x and y
 * returns: pointer to sampled page or NULL if no memory
 **************************************************/
static unsigned char *sample_page(int *dx,int *dy,int *samplex,int *sampley)
   {
   static unsigned char bit_mask[] = {0x80,0x40,0x20,0x10,0x08,0x04,0x02,0x01};
   unsigned char *image,*line_ptr,*line_ptr2,*buff_ptr;
   DISPLAY_BOX file;
   unsigned int x,y,width,height;

   memory_info(&file);     /* need to provide this, gets file dimensions */
                       /* from memory image of file */
   *samplex = SAMPLE_200_X;        /* sample sizes */
   *sampley = SAMPLE_200_Y;
   *dx = file.width  / *samplex;   /* extent of sample */
   *dy = file.height  / *sampley;
   while (((long)*dx * (long)*dy) > MAX_MALLOC)
      {           /* adjust sampling to fit memory restrictions */
      (*samplex)++;
      (*sampley)++;
      *dx = file.width / *samplex;
      *dy = file.height / *sampley;
      }
   if ((image = malloc(*dx * *dy)) == NULL)    /* allocate sample buffer */
      return NULL;
   memset(image,WHITE,*dx * *dy);          /* set to white */
   width =*dx * *samplex;
   height = *dy * *sampley;
   if (*samplex >= 8)
      {                                   /* byte sampling */
      for (y = 0, buff_ptr = image ; y < height ; y += *sampley)
         {                               /* for each y sample */
                   /* need to provide memory_ptr which gets a pointer
                    * to a scan line from the memory image of the file */
         line_ptr = memory_ptr(y);
         line_ptr2 = memory_ptr(y + *sampley/2); /* double sample in y */
         for (x = 0 ; x < width ; x += *samplex, buff_ptr++)
            if (*(line_ptr+(x>>3))  *(line_ptr2+(x>>3)))
               *buff_ptr = BLACK;  /* if byte has black, set sample */
         }
      }
   else
      {                                   /* bit sampling */
      for (y = 0, buff_ptr = image ; y < height ; y += *sampley)
         {                               /* for each y sample */
                   /* need to provide memory_ptr which gets a pointer
                    * to a scan line from the memory image of the file */
         line_ptr = memory_ptr(y);
         line_ptr2 = memory_ptr(y + *sampley/2); /* double sample in y */
         for (x = 0 ; x < width ; x += *samplex, buff_ptr++)
            if ((*(line_ptr+(x>>3))  *(line_ptr2+(x>>3))) & bit_mask[x&7])
               *buff_ptr = BLACK;      /* if bit is black, set sample */
         }
      }
   return image;
   }

#if CUT_VERTICAL_LINES
/***********************************************
 * function: static void cut_vertical_lines(unsigned char *image,int dx,int dy)
 * Remove vertical lines from sample. The purpose of this function is to
 * unbox text. Removing the vertical box lines accomplishes this. Trying
 * to remove the horizontal lines is dangerous because you might also remove
 * the text.
 * parameters: pointer to sampled image buffer and x/y extents of buffer
 * returns: nothing
 *************************************************/
static void cut_vertical_lines(unsigned char *image,int dx,int dy)
   {
   int x,y,count,y1;
   unsigned char *ptr,*qtr;

   for (x = 0 ; x < dx ; x++)      /* scan image left to right */
      {
      ptr = image+x;
      count = 0;
      for (y = 0 ; y < dy ; y++, ptr += dx)
         {                   /* scan up and down counting line pixels */
         if (*ptr)
            count++;
         else
            count = 0;

         if (count >= VERTICAL_LINE_SIZE)
            {               /* we have a veritcal line */
            for (y1=y, qtr=ptr ; *ptr!=WHITE && y>=0 ; y--, ptr-=dx)
               *ptr = WHITE;    /* white out moving up */
            for (y=y1+1, ptr=qtr+dx ; *ptr!=WHITE && y<dy ; ptr+=dx)
               *ptr = WHITE;    /* white out moving down */
            count = 0;      /* reset counter */
            }
         }
      }
   }
#endif

/***********************************************
 * function: static void block_zones(unsigned char *image,int dx,int dy,int
 * coarseness)
 *  Use cellular automata to create blocks from image. Each block represents
 *  a region of text. There are two steps to the process. First, use cellular
 *  automata to sketch the region. Second, use the coarseness (0-5) to coalesce
 *  the regions. A higher coarseness value will make larger zones.
 * parameters: pointer to image buffer, x/y extents of buffer and coarseness
 *             of zoning (0-5)
 * returns: nothing
 ************************************************/
static void block_zones(unsigned char *image,int dx,int dy,int coarseness)
   {
   int i,j,x,y,score;
   unsigned char *line,*pixel;

   for (y=1, line=image+dx ; y < dy-1 ; y++, line+=dx)
      {   /* walk through buffer scanning for neighbors */
      for (x=1, pixel=line+1 ; x < dx-1 ; x++, pixel++)
         {
         score = 0;
         if (*(pixel-1) & ALIVE)   /* left */
            score++;
         if (*(pixel+1) & ALIVE)    /* right */
            score++;
         if (*(pixel-dx) & ALIVE)  /* up */
            score++;
         if (*(pixel+dx) & ALIVE)   /* down */
            score++;
         if (*pixel == WHITE && score >= LIFE_SCORE)
            *pixel = PREGNANT;
         else if (*pixel == BLACK && score <= DEATH_SCORE)
            *pixel = SICK;
         }
      }
   for (pixel = image ; pixel < image + dy * dx ; pixel++)
      {    /* birth and bury */
      if (*pixel == PREGNANT)
         *pixel = BLACK;
      else if (*pixel == SICK)
         *pixel = WHITE;
      }
   if (coarseness <= 0)        /* no need to coalesce */
      return;
   /* coalesce regions, based on coarseness, i.e. coarseness == 2
    * will close all horizontal/vertical gaps of 2 pixels */
   for (y=0, line=image+1 ; y < dy ; y++, line+=dx)
      {  /* coalesce horizontally */
      for (x=1, pixel=line ; x < dx-coarseness ; x++, pixel++)
         {
         if (*pixel == WHITE && *(pixel-1) == BLACK)
            {

            for (i = 1 ; i <= coarseness ; i++)
               {
               if (*(pixel+i))
                  {
                  for (i = 0 ; i < coarseness ; i++, pixel++)
                     *pixel = BLACK;
                  pixel-;
                  x += coarseness-1;
                  break;
                  }
               }
            }
         }
      }
   for (x=0, line=image+dx ; x < dx ; x++, line++)
      {  /* coalesce vertically */
      for (y=1, pixel=line ; y < dy-coarseness ; y++, pixel+=dx)
         {
         if (*pixel == WHITE && *(pixel-dx) == BLACK)
            {
            for (i=1, j=dx ; i <= coarseness ; i++, j+=dx)
               {
                  if (*(pixel+j))
                  {
                  for (i = 0 ; i < coarseness ; i++, pixel+=dx)
                     *pixel = BLACK;
                  pixel -= dx;
                  y += (coarseness-1);
                  break;
                  }
               }

            }
         }
      }
   }

/***************************************************
 * function: static void sequence_zones(unsigned char *image,int dx,int dy,int order)
 * Extract zones from buffer, place in zone_list, update zone_count and
 * sequence zones so they are in either COLUMN_MAJOR or ROW_MAJOR reading order.
 * parameters: pointer to image buffer and x/y extents of image in buffer
 *              order, COLUMN_MAJOR or ROW_MAJOR
 * returns: nothing
 ***********************************************************/
static void sequence_zones(unsigned char *image,int dx,int dy,int order)
   {
   int x,y,i,j,index,fudge_x,fudge_y;
   unsigned char *ptr;
   ZONE temp;

   for (y=0, zone_count=0 ; y < dy && zone_count < MAX_ZONES ; y+=MIN_Y_ZONE)
      {  /* extract zones from block images in y order */
      ptr = image + y * dx;
      for (x = 0 ; x < dx ; x += MIN_X_ZONE)
         if (*(ptr+x))                      /* found point */
            {
            if (zone_count >= MAX_ZONES)
               break;
            while (x > 0 && *(ptr+x-1))    /* back up to left side */
               x--;
            if (extract_zone (image,dx,dy,x,y,zone_list+zone_count))
               zone_count++;              /* get zone */
            }
      }
   if (zone_count == 0)
      {            /* if no zones, make the entire page one big zone */
      zone_list[0].x1 = zone_list[0].y1-0;
      zone_list[0].x2 = dx-1;
      zone-list[0].y2 = dy-1;
      zone_count = 1;
      return;
      }
   overlap_zones(zone_list,&zone_count);         /* remove overlap */
   if (order == COLUMN_MAJOR)
      {
      for (i = 0; i < zone_count-1 ; i++)
          {                                        /* sort on x1 */
          for (j = i+1 ; j < zone_count ; j++)
             {
             if (zone_list[j].x1 < zone_list[i].x1)
                {
                temp = zone_list[i];
                zone_list[i] = zone_list[j];
                zone_list[j] = temp;
                }
             }
          }
      for (i = 0; i < zone_count-1 ; i++)
          {         /* order zones left to right, up to down */
          x = zone_list[i].x2;
          y = zone_list[i].y1;
          fudge_x = zone_list[i].x1+dx/20;    /* 5% slippage on alignment */
          for (j = i+1, index = -1 ; j < zone_count ; j++)
             { /* find any zones above and within x extent of zone_list[i] */
             if (zone_list[j].x1 > x)
                break;
             if (zone_list[j].y1 < y && zone_list[j].x1 <= fudge_x)
                {
                x = zone_list[j].x2;
                y = zone_list[j].y1;
                index = j;
                }
             }
          if (index != -1)
             {
             temp = zone_list[i];
             zone_list[i] = zone_list[index];
             zone_list[index] = temp;
             }
          }
      }
   else        /* ROW_MAJOR */
      {   /* already sorted in y1 order */
      for (i = 0 ; i < zone count-1 ; i++)
          {           /* order zones up to down, left to right */
          y = zone_list[i].y2;
          x = zone_list[i].x1;
          fudge_y = zone_list[i].y1+dy/20;    /* 5% slippage on alignment */
          for (j = i+1, index = -1 ; j < zone_count ; j++)
             { /* find any zones left of and within y extent of zone_list[i] */
             if (zone_list[j].y1 > y)
                break;
             if (zone_list[j].x1 < x && zone_list[j].y1 <= fudge_y)
                {
                y = zone_list[j].y2;
                x = zone_list[j].x1;
                index = j;
                }
             }
          if (index != -1)
             {
             temp = zone_list[i];
             zone_list[i] = zone_list[index];
             zone_list[index] = temp;
             }
          }
      }
   }

/************************************************
 * function: static int extract_zone(unsigned char *image,int dx,int dy,int
 * x,int y,ZONE *zone_ptr)
 *  Extracts a rectangular region from the buffer starting at a point. It does
 *  this by walking the perimeter, recording the min and max x/y dimensions.
 *  If the region is big enough to be significant, it is saved as a zone and
 *  then whited out so it won't be reconsidered.
 * parameters: pointer to image buffer and dx/dy extents of that buffer.
 *             x & y point to start extracting the zone.
 *             pointer to storage for zone
 * returns: 1 if zone OK or 0 if not
 ************************************************/
static int extract_zone(unsigned char *image,int dx,int dy,int x,int y,ZONE *zone_ptr)
   {
   int ix,iy,minx,miny,maxx,maxy; /* perimeter variables & min/max values */
   HEADING dir;                   /* current direction */
   unsigned char *previous,*next,*here;    /* buffer pointers */

   minx = maxx = ix = x;          /* preset min/max x/y and perimeter vars */
   miny = maxy = iy = y;
   dir = GOING_UP;                /* starting direction */
   do     /* walk perimeter, recording min/max of rectangular region */
      {
      if (ix < minx)              /* update min/max */
          minx = ix;
      if (ix > maxx)
          maxx = ix;
      if (iy < miny)
          miny = iy;
      if (iy > maxy)
          maxy = iy;
      here = image + iy * dx + ix;   /* where are we? */
      next = here + dx;
      previous = here - dx;
      switch (dir)                   /* based on current direction, */
          {                         /* look around for next direction */
          case GOING_UP:
          if (ix > 0 && *(here-1))
             {
             ix-;
             dir = GOING_LEFT;
             break;
             }
          if (iy > 0 && *previous)
             {
             iy;
             break;
             }
          if (ix < dx-1 && *(here+1))
             {
             ix++;
             dir = GOING_RIGHT;
             break;
             }
          if (iy < dy-1 && *next)
             {
             iy++;
             dir = GOING_DOWN;
             break;
             }
          break;
          case GOING_RIGHT:
          if (iy > 0 && *previous)
             {
             iy--;
             dir = GOING_UP;
             break;
             }
          if (ix < dx-1 && *(here+1))
             {
             ix++;
             break;
             }
          if (iy < dy-1 && *next)
             {
             iy++;
             dir = GOING_DOWN;
             break;
             }
          if (ix > 0 && *(here-1))
             {
             ix--;
             dir = GOING_LEFT;
             break;
             }
          break;
          case GOING_DOWN:
          if (ix < dx-1 && *(here+1))
             {
             ix++;
             dir = GOING_RIGHT;
             break;
             }
          if (iy < dy-1 && *next)
             {
             iy++;
             break;
             }
          if (ix > 0 && *(here-1))
             {
             ix--;
             dir = GOING_LEFT;
             break;
             }
          if (iy > 0 && *previous)
             {
             iy--;
             dir = GOING_UP;
             break;
             }
          break;
          case GOING_LEFT:
          if (iy < dy-1 && *next)
             {
             iy++;
             dir =- GOING_DOWN;
             break;
             }
          if (ix > 0 && *(here-1))
             {
             ix--;
             break;
             }
          if (iy > 0 && *previous)
             }
             iy--;
             dir = GOING_UP;
             break;
             }
          if {ix < dx-1 && *(here+1))
             {
             ix++;
             dir = GOING_RIGHT;
             break;
             }
          break;
          }
      } while (ix !=  iy != y);        /* until we return to the start */
   for (iy=miny, here=image+miny*dx+minx ; iy <= maxy ; iy++, here+=dx)
      memset(here,WHITE,maxx-minx+1);     /* whiteout the region */
   if ((maxx-minx+1 < MIN_X_ZONE)  (maxy-miny+1 < MIN_Y_ZONE))
      return 0;                            /* big enough? */
   if {minx > 0)            /* expand dimensions by one pixel */
      minx--;
   if (maxx < dx-1)
      maxx++;
   if (miny > 0)
      miny--;
   if (maxy < dy-1)
      maxy++;
   zone_ptr->x1 = minx;      /* save zone */
   zone_ptr->y1 = miny;
   zone_ptr->x2 = maxx;
   zone_ptr->y2 = maxy;
   return 1;
   }

/************************************************
 * function: static void overlap_zones{ZONE *zone_array,int *array_size)
 * find zones that overlap and combine them
 * parameters: pointer to array of zones and pointer to count of zones
 * returns: nothing
 ************************************************/
static void overlap_zones(ZONE *zone_array,int *array_size)
   {
   in* i,j,ax1,ay1,ax2,ay2,bx1,by1,bx2,by2;

   for (i = 0 ; i < *array_size-1 ; i++)
      {       /* compare each zone against the rest */
      ax1 = (zone_array+i)->x1;
      ay1 = (zone_array+i)->y1;
      ax2 = (zone_array+i)->x2;
      ay2 = (zone_array+i)->y2;
      for (j = i+1 ; j < *array_size ; j++)
          {
          bx1 = (zone_array+j)->x1;
          by1 = (zone_array+j)->y1;
          bx2 = (zone_array+j)->x2;
          by2 = (zone_array+j)->y2;
          if ((bx1>=ax2)  (ax1>=bx2)  (by1>=ay2)  (ay1>=by2))
             continue;                   /* no overlap */
          bx1 = min(bx1,ax1);             /* combine zones */
          by1 = min(by1,ay1);
          bx2 = max(bx2,ax2);
          by2 = max(by2,ay2);
          (zone_array+i)->x1 = bx1;
          (zone_array+i)->y1 = by1;
          (zone_array+i)->x2 = bx2;
          (zone_array+i)->y2 = by2;
          (*array_size)--;              /* new zone count */
          memmove((char *)(zone_array+j),(char *)(zone_array+j+1),
             sizeof(ZONE)*(*array_size-j));  /* shift array to remove zone */
          i = -1;                        /* start all over */
          break;
          }
      }
   }
/* End of File */