/* Date of this version: 9/11/2000 LOCR - Optical Character Recognition Version: 0.1.0 Copyright (C) 2000 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define PROGNAME "LOCR" #define VER "0.1.0" #define COPYRIGHT "by Miguel A. Lerma Copyright (C) 2000" /* Examples: - Recognize characters in "document.pnm" using data from "foo.dat" (output goes to "document.out"): ./locr -d foo.dat document.pnm document.out - Generate data file "foo.dat" from text file "foo.txt" and image file "foo.pnm": ./locr -gt foo.txt foo.pnm foo.dat - Same, but also merge data from "foo1.dat": ./locr -g -d foo1.dat -t foo.txt foo.pnm foo.dat - Merge data from "foo1.dat" and "foo2.dat" into "foo3.dat": ./locr -g -d foo1.dat -d foo2.dat > foo3.dat */ #include #include #include #define MAX(_a,_b) ((_a)<(_b)?(_b):(_a)) #define MIN(_a,_b) ((_a)<(_b)?(_a):(_b)) #define ABS(_a) ((_a)<0?-(_a):(_a)) #define TWO_BYTES_TYPE short int #define IMAG_TYPE TWO_BYTES_TYPE #define MAX_SCORE 65536 #define MAX_ACCEPTABLE_SCORE 128 #define MAX_IMAGSIZE 16777216 #define MAX_NBLOCKS 32766 #define MAX_NLINES 1000 #define MAX_NCOLS 16 /* fields in table of blocks */ #define TBLK_NFLDS 8 /* number of fields */ #define I1 0 /* i1 = i-coordinate of upper-left corner */ #define J1 1 /* j1 = j-coordinate of upper-left corner */ #define I2 2 /* i2 = i-coordinate of lower-right corner + 1 */ #define J2 3 /* j2 = j-coordinate of lower-right corner + 1 */ #define COL 4 /* column */ #define LN 5 /* line */ #define NH 6 /* number of holes */ #define HI 7 /* hi = i-coordinate of hot spot */ /* fields in table of lines */ #define TLN_NFLDS 6 /* number of fields */ /* (fields 0-4 already defined above) #define I1 0 i1 = i-coordinate of upper-left corner #define J1 1 j1 = j-coordinate of upper-left corner #define I2 2 i2 = i-coordinate of lower-right corner + 1 #define J2 3 j2 = j-coordinate of lower-right corner + 1 #define COL 4 column */ #define FB 5 /* first block in sorted table */ /* number of pixels two consecutive lines are allowed to overlap */ #define LINE_OVERLAPPING 2 /* fields in table of columns */ #define TCOL_NFLDS 6 /* number of fields */ /* (fields 0-3,5 already defined above) #define I1 0 i1 = i-coordinate of upper-left corner #define J1 1 j1 = j-coordinate of upper-left corner #define I2 2 i2 = i-coordinate of lower-right corner + 1 */ #define FL 4 /* first line #define FB 5 first block in sorted table */ #define MAX_CHAR_WIDTH_FACTOR 10 #define MAX_CHAR_HEIGHT_FACTOR 10 #define MAX_CHAR_HEIGHT_WIDTH_RELATION 20 #define MAX_CHAR_WIDTH_HEIGHT_RELATION 30 #define STD_WIDTH 16 /* standard size of a character */ #define STD_HEIGHT 16 /* fields in table of attributes */ #define NATTRIBS 8 /* number of attributes */ /* (fields 6, 8 already defined above) */ #define NSB 0 /* number of blocks (minus one) */ #define HW 1 /* STD_HEIGHT*char_hight/char_width */ #define BVPOS 2 /* bottom position respect to baseline */ #define RVPOS 3 /* top position respect to roofline */ #define TVPOS 4 /* top position respect to top of line */ #define NUS 5 /* not used #define NH 6 number of holes */ #define HOT 7 /* hot spot position respect to baseline */ /* maximum number of characters in text file */ #define MAX_NCHARS 2000 /* global variables */ char *program_name; /* name of this program */ IMAG_TYPE *imag; int width, height, imagsize; /* size of the image */ /* tables of data */ unsigned short int nchars=0; /* total number of chars in data table */ unsigned short int dfnchars=0; /* number of chars from datafile */ unsigned char data_ascii[MAX_NCHARS]; TWO_BYTES_TYPE data_std_size_chars[MAX_NCHARS][STD_HEIGHT]; TWO_BYTES_TYPE data_attr[MAX_NCHARS][NATTRIBS]; /* table of blocks */ int tblk[MAX_NBLOCKS+1][TBLK_NFLDS]; int lastblock=MAX_NBLOCKS-1; /* last block (in the table of blocks) */ int stblk[MAX_NBLOCKS]; /* sorted table of blocks (actually a table of pointers to the real table) */ int num_of_blocks; /* number of blocks */ int ablkw, ablkh; /* average dimensions of a block */ /* table of lines */ int tln[MAX_NLINES+1][TLN_NFLDS]; int num_of_lines=0; /* number of lines */ int alnw, alnh; /* average dimensions of a line */ /* table of columns */ int tcol[MAX_NCOLS+1][TLN_NFLDS]; int num_of_cols=0; /* number of columns */ int top_i, bottom_i; /* edges of text */ /* table of attributes */ TWO_BYTES_TYPE tattr[MAX_NBLOCKS+1][NATTRIBS]; /* options */ int single_column=0; int low_ascii=0; int align_lines=0; int generate_data=0; int use_datafile=0; int use_textfile=0; int recog_level=9; int showdata=0; /* functions */ int usage_and_exit(); int getsampletext(); int getdata(); int getimag(); void process_image(); void cleanborder(); void remove_isolated_pixels(); void alignlines(); int getblocks(); int getcolumns(); int getlines(); void average_all_blocks(); void average_blocks(); int compactify_tblk(); int remove_atypical_blocks(); int clear_atypical_blocks(); void move_block(); void remove_block(); void clear_block(); void join_block(); void permute_block(); int convert_to_ascii(); int mk_std_size(); int mk_std_size_bin(); void put_std_size_char(); void dump_std_size_char(); void dump_data_ascii(); void dump_data_std_size_chars(); void dump_data_attr(); int fput_twobytes(); int fget_twobytes(); char dummy(); char recognize_char(); int compare_pos(); int hamming_dist(); /* functions mainly for debugging purposes */ char show_char(); char show_std_size_char(); char show_std_size_char_bin(); void show_std_size_bk_bin(); void show_tblk(); void show_stblk(); void show_tln(); void show_tcol(); void show_data_ascii(); void show_data_std_size_chars(); void show_data_attr(); void show_data(); void show_data2(); void dumppic(); void cleanpic(); void display_blocks(); /* obsolete code */ void filter(); int getlines_old(); int hamming_dist2(); /********************* main function *********************/ int main(int argc, char *argv[]) { int nfiles=0; FILE *fi=stdin, *fo=stdout, *fd, *ft; char *fni, *fno, *fnd, *fnt; int k, n, nn; /* miscellaneous variables */ char c, msg[80]; /* scan arguments */ program_name=argv[0]; fnd = calloc (sizeof(char),strlen(program_name) + 5); strcpy(fnd,program_name); strcat(fnd,".dat"); /* default data file name */ for (n=1; n dfnchars) { process_image(fi); nn = convert_to_ascii(fo,dummy); if (showdata) { show_data2(); exit(0); } if (nn != nchars) { fprintf(stderr, "wrong number of characters in scanned text: %d/%d\n", nn,nchars); exit(1); } } /* structure of the data file */ fput_twobytes(nchars,fo); /* number of characters */ dump_data_ascii(fo); /* ascii codes of those characters */ dump_data_std_size_chars(fo); /* chars at their standard size */ dump_data_attr(fo); /* attributes */ } else { if (showdata) { show_data2(); exit(0); } process_image(fi); convert_to_ascii(fo,recognize_char); } fclose(fo); return(0); } /********************* other functions *********************/ int usage_and_exit(msg) char *msg; { if (msg[0]!='\0') fprintf(stderr,"Error: %s\n",msg); printf("Usage:\n%s",program_name); printf(" [-aghlms] [ -d ... ] [ -t ]"); printf(" [ []]\n"); exit(0); return(0); } int getsampletext(f) FILE *f; { int ic; while (((ic=fgetc(f)) != EOF) && (nchars < MAX_NCHARS)) { while ((ic == ' ') || (ic == '\n') || (ic == '\r') || (ic == '\t')) ic=fgetc(f); if (ic != EOF) { data_ascii[nchars++] = ic; } } if (nchars>=MAX_NCHARS) { fprintf(stderr,"sample text file is too large\n"); } return(0); } int getdata(f) FILE *f; { int n0, n, r, ic; n0=dfnchars; if ((ic=fget_twobytes(f))==EOF) { fprintf(stderr,"datafile is too short\n"); exit(1); } else { dfnchars+=ic; if (dfnchars > MAX_NCHARS) { fprintf(stderr,"datafile is too large\n"); exit(1); } } for (n=n0; n MAX_IMAGSIZE) { height = MAX_IMAGSIZE/width; } imagsize = width*height; imag = calloc (sizeof(IMAG_TYPE),imagsize); if (strcmp(magic,"P1") == 0) { ungetc(' ',f); for (k=0; k=0; l--) { imag[k+l] = (0x1 & ic); ic = (ic >> 1); } } } else { fprintf(stderr,"File format not recognized\n"); fclose(f); exit(1); } return(0); } void process_image(f) FILE * f; { getimage(f); fclose(f); /* clean borders of image */ cleanborder(0,width,height,imag); /* remove dust and snow */ remove_isolated_pixels(width,height,imag); /* align lines */ if (align_lines) alignlines(); /* find blocks (connected regions of pixels and boxes enclosing them) */ getblocks(); average_all_blocks(); if (! generate_data) { clear_atypical_blocks(); /* this is to remove figures */ average_all_blocks(); /* reaverage blocks */ } /* find columns */ getcolumns(); /* find lines */ getlines(); } void cleanborder(n,w,h,p) IMAG_TYPE n; int w, h; /* size of rectangle */ IMAG_TYPE * p; /* pointer to rectangle */ { int i, j; for (i=0, j=w*(h-1); i=width) count++; } if (count > max_count) { max_count=count; opt_n=n; } } /* rotate picture */ if (opt_n>0) { for (j=0; j=-opt_n; i--) { imag[width*i+j] = imag[width*(i+(opt_n*j)/width)+j]; } } } } int getblocks() { int i, j, k, l, n, m, occ, t; /* clean table of blocks */ for (i=0; i<=MAX_NBLOCKS; i++) { for(j=0; j 0) { m = imag[k+j]; if (m == 1) { /* pixel occupied but not assigned yet */ /* create a new entry in the table of blocks */ while ((tblk[n][I1] != 0) && (n= MAX_NBLOCKS) { n=2; while (tblk[n][I1] != 0) n++; } if (n >= MAX_NBLOCKS) { break; } m=n; imag[k+j]=m; tblk[m][I1]=i; tblk[m][J1]=j; tblk[m][I2]=i+1; tblk[m][J2]=j+1; } l=k+j-1; /* left pixel */ if (imag[l] > 1) { if (imag[l] != m) { join_block(m,imag[l]); m=imag[l]; } } l-=width; /* upper left pixel */ if (imag[l] > 1) { if (imag[l] != m) { join_block(m,imag[l]); m=imag[l]; } } l++; /* upper pixel */ if (imag[l] > 1) { occ=1; /* occupied (needed for detecting hole) */ if (imag[l] != m) { join_block(m,imag[l]); m=imag[l]; } } else { occ=0; /* not occupied */ } l++; /* upper right pixel */ if (imag[l] > 1) { if (imag[l] != m) { join_block(m,imag[l]); m=imag[l]; } else if (occ==0) { tblk[m][NH]++; /* hole detected */ tblk[m][HI]=i; /* i-coordinate of hot spot */ } } } } } /* compactify table of blocks */ lastblock=MAX_NBLOCKS-1; t = compactify_tblk(); return(t); } int getcolumns() { int col, j, k, n, m, s, ss, th, lth, hth, w, mrg; int *p, *q; /* find top and bottom edges of text */ k=0; while ((imag[k] == 0) && (k < imagsize)) k++; top_i=k/width; k=imagsize-1; while ((imag[k] == 0) && (k > 0)) k--; bottom_i=k/width; if (single_column==1) { tcol[0][I1]=1; tcol[0][J1]=1; tcol[0][I2]=height-1; tcol[0][J2]=width-1; num_of_cols=1; } else { /* find columns */ p = calloc(sizeof(int),width); q = calloc(sizeof(int),width); /* add pixels vertically (p contains the vertical totals) */ ss=0; for (j=0; j0) { s++; ss++; } } p[j]=s; } /* blur results (q contains the blurred totals) */ mrg=3*ablkw; /* how much to blur */ w=width-mrg; for (j=0; j th) && (j < width)) j++; tcol[num_of_cols][I2]=height-1; tcol[num_of_cols][J2]=j; } free(p); free(q); } tcol[num_of_cols][I1]=1; tcol[num_of_cols][J1]=width; tcol[num_of_cols][I2]=height-1; tcol[num_of_cols][J1]=width; /* assign columns to blocks and sort by columns */ m=0; for (col=0; col= tcol[col][J1]) { if (tblk[n][J1] < tcol[col+1][J1]) { tblk[n][COL]=col; stblk[m++]=n; } } } } tcol[col][FB]=m; num_of_blocks=m; for (n=2; n<=lastblock; n++) { /* assign columns to blocks */ k=num_of_cols-1; while ((tblk[n][J1] < tcol[k][J1]) && (k >= 0)) k--; tblk[n][COL]=k; } return(num_of_cols); } int getlines() { int col, k, k1, k2, l, n, n1, n2, t; int min_line_height; int line=0; min_line_height=ablkh; /* clean table of lines */ for (l=0; l tblk[stblk[k2]][I1]) { t=stblk[k1]; stblk[k1]=stblk[k2]; stblk[k2]=t; } } } /* find lines in current column (detect reasonably large jumps in i-coordinates of blocks) */ tln[line][I1]=tblk[stblk[n1]][I1]; tln[line][J1]=tcol[col][J1]; tln[line][I2]=tblk[stblk[n1]][I1]+min_line_height; tln[line][J2]=tcol[col][J1]; tln[line][COL]=col; tln[line][FB]=n1; for(n=n1; n tln[line][I2]) { line++; if (line+1 >= MAX_NLINES) { fprintf(stderr,"Too many lines\n"); exit(1); } tln[line][I1]=tblk[stblk[n]][I1]; tln[line][J1]=tcol[col][J1]; tln[line][I2]=tblk[stblk[n]][I1]+min_line_height; tln[line][J2]=tcol[col][J1]; tln[line][COL]=col; tln[line][FB]=n; } if (tln[line][I2] < tblk[stblk[n]][I2]) { tln[line][I2]=tblk[stblk[n]][I2]; } if (tln[line][J2] < tblk[stblk[n]][J2]) { tln[line][J2]=tblk[stblk[n]][J2]; } tblk[stblk[n]][LN]=line; } line++; } tln[line][FB]=num_of_blocks; tcol[col][FL]=line; num_of_lines=line; /* sort blocks in each line by j-coordinate */ for (l=0; l tblk[stblk[k2]][J1]) { t=stblk[k1]; stblk[k1]=stblk[k2]; stblk[k2]=t; } } } } return(num_of_lines); } int compactify_tblk() { int n, m; n=2; m=lastblock; while (n ablkw*MAX_CHAR_WIDTH_FACTOR) || (tblk[n][J2]-tblk[n][J1] > ablkh*MAX_CHAR_HEIGHT_FACTOR)) { remove_block(n); } } compactify_tblk(); return(0); } int clear_atypical_blocks() /* remove atypical blocks and all blocks inside */ { int n, w, h; for (n=2; n<=lastblock; n++) { if (tblk[n][I1]==0) continue; h = tblk[n][I2]-tblk[n][I1]; w = tblk[n][J2]-tblk[n][J1]; if ((h > ablkh*MAX_CHAR_HEIGHT_FACTOR) || (w > ablkw*MAX_CHAR_WIDTH_FACTOR) || (h > w*MAX_CHAR_HEIGHT_WIDTH_RELATION) || (w > h*MAX_CHAR_WIDTH_HEIGHT_RELATION)) { clear_block(n); } } compactify_tblk(); return(0); } void remove_block(n) /* remove block n from the table of blocks */ int n; { int i, j, k, i1, j1, i2, j2; i1=tblk[n][I1]; j1=tblk[n][J1]; i2=tblk[n][I2]; j2=tblk[n][J2]; for (i=i1; i 0) { clear_block(imag[k+j]); } } } for (k=0; k 0) { tblk[n][NH]+=tblk[m][NH]; tblk[n][HI]=MAX(tblk[n][HI],tblk[m][HI]); } for (k=0; k0) { tattr[m][HOT] = STD_HEIGHT*(tblk[m][I2]-tblk[m][HI]) /(tblk[m][I2] - tblk[m][I1]); } else { tattr[m][HOT] = 0; } if (generate_data) { /* copy attributes */ for (k=0; k> 8),f); /* first high */ fputc(n,f); /* then low */ return(0); } int fget_twobytes(f) FILE * f; { int high, low; if ((high=fgetc(f))==EOF) return(EOF); if ((low=fgetc(f))==EOF) return(EOF); return((high << 8) | low); } char dummy() /* dummy recognition function */ { return(0); } char recognize_char(n,level) int n, level; { int m, nn; int t, t1, t2; int ham=0; int score, best_nn, best_score; TWO_BYTES_TYPE bbk[STD_HEIGHT]; if (level < 0) return('?'); m=stblk[n]; mk_std_size_bin(m,bbk); best_nn = 0; best_score = MAX_SCORE; for (nn=0; nn126) || (data_ascii[nn]<33)) continue; } if (level >= 1) { if (data_attr[nn][NSB] != tattr[m][NSB]) continue; } if (level >= 2) { if (data_attr[nn][NH] != tattr[m][NH]) continue; else if (level >= 3) { t = data_attr[nn][HOT] - tattr[m][HOT]; if (ABS(t) > 2) continue; } } if (level >= 4) { if (compare_pos(data_attr[nn][BVPOS],tattr[m][BVPOS])) continue; } if (level >= 9) { if (compare_pos(data_attr[nn][RVPOS],tattr[m][RVPOS])) continue; } if (level >= 6) { if (compare_pos(data_attr[nn][TVPOS],tattr[m][TVPOS])) continue; } if (level >= 8) { t1 = tattr[m][HW]; t1 = MAX(t1,4); t1 = MIN(t1,96); t2 = data_attr[nn][HW]; t2 = MAX(t2,4); t2 = MIN(t2,96); if (4*t1 < 3*t2) continue; if (3*t1 > 4*t2) continue; } ham = hamming_dist(bbk,data_std_size_chars[nn]); score = ham; if (score < best_score) { best_nn=nn; best_score=score; } } /* this piece of code shows in real time how the characters are recognized (only for debugging purposes) */ /* printf("%c %d %d\n",data_ascii[best_nn],best_score,level); for (k=0; k 1) && (pos2 > 1)) return(0); if ((pos1 < -1) && (pos2 < -1)) return(0); return(1); } int hamming_dist(bbk1,bbk2) /* compute Hamming distance */ TWO_BYTES_TYPE bbk1[STD_HEIGHT]; /* between two characters */ TWO_BYTES_TYPE bbk2[STD_HEIGHT]; { int a, b, r, s; s=0; for (a=0; a> 1); } } return(s); /* return number of pixels where they differ */ } /************** functions mainly for debugging purposes *************/ char show_char(n) int n; { int i, i1, i2, j, j1, j2, k, m; m=stblk[n]; if (tblk[m][I1] == 0) return(0); /* if block is empty return no character */ i1=tblk[m][I1]; j1=tblk[m][J1]; i2=tblk[m][I2]; j2=tblk[m][J2]; printf("\n"); for (i=i1; i=0; j--) { if ((r >> j) & 0x1) { printf("O "); } else { printf(". "); } } printf("\n"); } printf("\n"); } void show_tblk() { int n, k; for (n=0; n<=lastblock; n++) { printf("%-5d ",n); for(k=0; k=0; b--) { if ((r>>b)&0x1) { printf("O "); } else { printf(". "); } } printf("\n"); } printf("\n"); } printf("\n"); } void show_data_attr() { int k1, k2; for (k1=0; k1=0; b--) { if ((r>>b)&0x1) { printf("O "); } else { printf(". "); } } printf("\n"); } printf("\n"); printf("%-5d ",n); for (k=0; kmin_line_height)) { tln[line][I2]=i; in_line=0; line++; } } /* assign lines to blocks */ n1=tcol[col][FB]; /* first block in this column */ n2=tcol[col+1][FB]; /* first block in next column */ for (n=n1; n tln[l][I2]) && (l tblk[stblk[k2]][J1]) { t=stblk[k1]; stblk[k1]=stblk[k2]; stblk[k2]=t; } } } } return(num_of_lines); } int hamming_dist2(bbk1,bbk2) /* best Hamming distance between slightly displaced characters */ TWO_BYTES_TYPE bbk1[STD_HEIGHT]; TWO_BYTES_TYPE bbk2[STD_HEIGHT]; { int i, a, b, r, s, bs; bs = 65536; for (i=-1; i<=1; i++) { s=0; for (a=MAX(0,-i); a> 1); } } bs=MIN(s,bs); } return(bs); }