/* TroffFilter.c -- Copyright 1994, 1996 Liam R. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* $Id: TroffFilter.c,v 1.8 2001/05/31 03:49:42 liam Exp $ */ /* Filter for nroff, troff, groff, sqtroff files. * See FilterMain and wordrules.h for more info. * * This might be better done by running nroff, at the expense of * a huge hit on performance. We would then have to filter out * hyphenation and page-breaks, though, which is even harder. * * The main things not done so far are: * filter comments * delete macro definitions * * The biggest TODO is to move words around. * I have more or less fixed this, see (4) below, but I'm * not really happy with the result yet. * * Let }{ mark a block boundary... * Suppose the input looks like * "An entry for \fIusername is..." * with a block boundary after the \f, thus: * "An entry for \f}{Iusername is..." * * Now, we currently turn this into * "An entry for \ username is..." * * When we index it, "username" is word 0 of the 2nd block. * But when we fetch a match, we'll see the unfiltered data, * and try to count words. The fIusername will look like a word, * so we will end up thinking it's the last word in the 1st block. * Hence we will highlight "is" when we fetch the match. * * Ways round this might include: * (1) running the filter when we retrieve matches * The difficulty here is that the filter would be passed a * tiny snippet of the input file and might not cope. * * (2) running the filter on the entire file to retrieve matches. * For large files, this will be a major performance problem. * Also, the filtered output isn't intended for displaying, so * we would need to track both of them. A single 45 MByte file * might thus need 90MBytes of memory and/or disk space. Oops. * * (3) moving the word backwards * generate "An entry for \username is..." instead, with the * spaces moved after the word, but the \ retained so that the * SAW_PUNCT_BEFORE flag was right. * * (4) a SUB_ONE flag * put a marker there that means, "index this word one char ahead * of where it really is". That's probably a performance hit in * lqaddfile, unfortunately. The flags would add, so we'd generate * "An entry for \FFusername is...". Well, maube this is better * than option 3, I'm not sure. * This is what I have done, LQT_CHAR_TO_SKIP. * * (5) run nroff on the input instead of this filter. * This would be a performance *nightmare*, unless I made a version * of nroff that was really fast, but that's a lot of work. * We'd then have the problem of dealing with page headers, footers, * hyphenation, and tables. Ugh. * */ #ifdef SYSV extern int _filbuf(), _flsbuf(); /* for lint! */ #endif #include "globals.h" #include "error.h" #include #include #include /* for liblqutil */ #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #ifdef HAVE_UNISTD_H # include #endif #include "wordrules.h" #include "emalloc.h" #include "lqutil.h" #include "liblqtext.h" #include "filter.h" /** C Library functions that need to be declared: **/ /** Functions in this file that need to be declared **/ #define PUTMODE_IGNORE 1 #define PUTMODE_PRINT 0 PRIVATE int LQFpReadOneCharacter( #ifdef HAVE_PROTO t_LQTEXT_Database *db, FILE *inputFile, char *fileName, FILE *OutputFile #endif ); /** **/ PRIVATE int InWord = 0; LIBRARY int LQF_Troff_Copy(db, InputFile, Name, OutputFile) t_LQTEXT_Database *db; FILE *InputFile; char *Name; FILE *OutputFile; { int ch; InWord = 0; while ((ch = LQFpReadOneCharacter(db, InputFile, Name, OutputFile)) != EOF) { if (ch != 0) { putc(ch, OutputFile); } } return 0; /* TODO: error return */ } #define OPEN_PAREN '(' PRIVATE void OutputChar(ch, Mode, OutputFile) int ch; int Mode; FILE *OutputFile; { if (!ch) { return; } if (Mode == PUTMODE_PRINT) { putc(ch, OutputFile); } else { if (isalnum(ch)) { putc(LQT_CHAR_TO_IGNORE, OutputFile); } else { putc(ch, OutputFile); } } } PRIVATE int doDelim(db, inputFile, Delim, Mode, fileName, OutputFile) t_LQTEXT_Database *db; FILE *inputFile; int Delim; int Mode; char *fileName; FILE *OutputFile; { int ch; putc(Delim, OutputFile); if (Delim == '[') { Delim = ']'; } while ((ch = LQFpReadOneCharacter(db, inputFile, fileName, OutputFile)) != EOF) { if (ch == Delim) { putc(Delim, OutputFile); return 0; } else if (ch == '\n') { return ch; } else if (ch != 0) { OutputChar(ch, Mode, OutputFile); } } return EOF; } PRIVATE int doThingWithDelim(db, ch, inputFile, fileName, OutputFile) t_LQTEXT_Database *db; int ch; FILE *inputFile; char *fileName; FILE *OutputFile; { putc(ch, OutputFile); if ((ch= getc(inputFile)) == EOF) { return EOF; } if (ch == OPEN_PAREN) { return doDelim(db, inputFile, ch, PUTMODE_IGNORE, fileName, OutputFile); } else { return doDelim(db, inputFile, ch, PUTMODE_PRINT, fileName, OutputFile); } } PRIVATE int doThingWithName(db, inputFile, fileName, OutputFile) t_LQTEXT_Database *db; FILE *inputFile; char *fileName; FILE *OutputFile; { int ch, ch2, ch3; if ((ch = getc(inputFile)) == EOF) { return EOF; } if (ch == OPEN_PAREN) { putc(OPEN_PAREN, OutputFile); /* \*(xxStuff * Four cases, where w is a wordchar, x isn't * "i" represents char-to-ignore * 1 \*(wwStuff --> \*(xxStuff * 2 \*(xwStuff --> \*(x Stuff * 3 \*(wxStuff --> \*(ixStuff * 4 \*(xxStuff --> \*(xxStuff * ^ch (w or x, so to speak) * ^ch2 (x or w...) * ^ch3 (S here) * * If the sequence is not followed by a word char, * we simply output it in ignore mode. * */ ch = getc(inputFile); /* first char after open paren */ if (ch == EOF) { return EOF; } ch2 = getc(inputFile); /* second char after open paren */ if (ch2 == EOF) { return EOF; } ch3 = getc(inputFile); /* third char after open paren */ if (ch3 == EOF) { return EOF; } /* if it's not followed by a word, easy: */ if (!LQT_StartsWord(db, ch3) && !isdigit(ch3) ) { OutputChar(ch, PUTMODE_IGNORE, OutputFile); OutputChar(ch2, PUTMODE_IGNORE, OutputFile); (void) ungetc(ch3, inputFile); return 0; } /* now handle the 4 cases */ if (LQT_StartsWord(db, ch2) || isdigit(ch2)) { /* case 1 or 2 */ if (LQT_StartsWord(db, ch) || isdigit(ch)) { /* case 1, ww */ putc(LQT_CHAR_TO_SKIP, OutputFile); putc(LQT_CHAR_TO_SKIP, OutputFile); } else { /* case 2, xw */ OutputChar(ch, PUTMODE_IGNORE, OutputFile); putc(LQT_CHAR_TO_SKIP, OutputFile); } } else { /* cases 3 and 4 have the same action */ /* case 3, wx */ /* case 4, xx */ OutputChar(ch, PUTMODE_IGNORE, OutputFile); OutputChar(ch2, PUTMODE_IGNORE, OutputFile); } (void) ungetc(ch3, inputFile); return 0; } else if (ch == '[') { return doDelim(db, inputFile, ch, PUTMODE_IGNORE, fileName, OutputFile); } else { /*CANTHAPPEN*/ putc(' ', OutputFile); } return 0; } PRIVATE int LQFpReadOneCharacter(db, inputFile, fileName, OutputFile) t_LQTEXT_Database *db; FILE *inputFile; char *fileName; FILE *OutputFile; { int ch, ch1, ch2, ch3; while ((ch = getc(inputFile)) != EOF) { if (ch == '\\') { switch ((ch = getc(inputFile))) { case EOF: return EOF; case '\\': putc('\\', OutputFile); putc('\\', OutputFile); break; case '\n': /* \ at the end of the line joins the lines together */ putc('\\', OutputFile); putc(ch, OutputFile); break; /* TODO: join the lines together and adjust the characters * so that we don't move the start of any words. * That's a little tricky. */ case '[': putc('\\', OutputFile); return doDelim(db, inputFile, ch, PUTMODE_IGNORE, fileName,OutputFile); case OPEN_PAREN: putc('\\', OutputFile); (void) ungetc(OPEN_PAREN, inputFile); doThingWithName(db, inputFile, fileName, OutputFile); break; case '_': putc('\\', OutputFile); OutputChar(ch, PUTMODE_IGNORE, OutputFile); return 0; case '^': case '`': case '{': case '|': case '}': case ' ': case '+': case '~': case '#': /* sqtroff and UCB ditroff only */ putc('\\', OutputFile); putc(ch, OutputFile); break; /* unknown escape characters: */ case 'C': case 'E': case 'F': case 'G': case 'i': case 'j': case 'J': case 'I': /* Immediate evaluation */ case 'K': case 'm': case 'M': case 'N': case 'O': case 'P': case 'q': case 'R': case 'U': case 'V': case 'W': case 'y': case 'Y': default: putc('\\', OutputFile); /* TODO swallow the \ */ putc(ch, OutputFile); break; /* self-contained escapes of the form "\c" */ case 'a': case 'A': case 'c': case 'd': case 'e': case 'p': case 'r': case 't': case 'u': case 'z': case '0': putc('\\', OutputFile); { ch2 = getc(inputFile); if (ch2 == EOF) { /* drop it */ return EOF; } if (isalnum(ch2)) { putc(LQT_CHAR_TO_SKIP, OutputFile); putc(ch2, OutputFile); return 0; } if (isspace(ch2)) { (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); putc(ch2, OutputFile); return 0; } putc(LQT_CHAR_TO_IGNORE, OutputFile); ungetc(ch2, inputFile); } return 0; /* escapes with an argument, \c'value' or \c[value] */ case 'B': case 'b': case 'D': case 'H': case 'h': case 'l': case 'L': case 'o': case 'S': /* slant */ case 'T': /* what this?? */ case 'v': case 'w': case 'x': case 'X': putc('\\', OutputFile); return doThingWithDelim( db, ch, inputFile, fileName, OutputFile ); /* escapes with a name, \cx or \c[xxxx] or \c(xx */ case '*': case 'Q': /* \Q is for sqtroff only, reads a qonfig variable */ case 'f': case 'g': case 'k': putc('\\', OutputFile); if ((ch1 = getc(inputFile)) == EOF) return EOF; if (ch1 == OPEN_PAREN || ch1 == '[') { (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); /* f */ (void) ungetc(ch1, inputFile); return doThingWithName(db, inputFile, fileName, OutputFile); } else { /* e.g. \fR */ if ((ch2 = getc(inputFile)) == EOF) return EOF; /* turn "\fR\\$1" into "\xx" * where x is char_to_ignore, * so that word counting is preserved, * but * \fRboy * turns into * "\ boy" * since the "boy" will be counted as a word */ if (LQT_StartsWord(db, ch2) || isdigit(ch2) ){ /* \fRboy */ if (LQT_StartsWord(db, ch1) || LQT_OnlyWithinWord(db, ch1) ) { putc(LQT_CHAR_TO_SKIP, OutputFile); /* f */ putc(LQT_CHAR_TO_SKIP, OutputFile); /* font name */ } else { (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); (void) OutputChar(ch1, PUTMODE_IGNORE, OutputFile); } } else { /* \fR... */ (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); (void) OutputChar(ch1, PUTMODE_IGNORE, OutputFile); } (void) ungetc(ch2, inputFile); return 0; } /* special cases: */ case 'n': /* number register, \nx */ putc('\\', OutputFile); if ((ch2 = getc(inputFile)) == EOF) return EOF; if (ch2 == '+' || ch2 == '-') { (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); ch = 0; putc(ch2, OutputFile); if ((ch2 = getc(inputFile)) == EOF) return EOF; } /* cases: * 1 \n(xx -> handle name * 2 \nwX --> \iiX * 3 \nxX --> \ixX * 4 \nwW --> \ W * 5 \nxW --> \ixW * */ if (ch2 == OPEN_PAREN || ch2 == '[') { /* case 1: put out the 'n' and handle the name */ (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); (void) ungetc(ch2, inputFile); return doThingWithName(db, inputFile, fileName, OutputFile); } /* read the character after the sequence */ if ((ch3 = getc(inputFile)) == EOF) return EOF; if (!LQT_StartsWord(db, ch3) && !isdigit(ch3)) { /* case 2 or 3 */ (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); (void) OutputChar(ch2, PUTMODE_IGNORE, OutputFile); (void) ungetc(ch3, inputFile); return 0; } /* case 4 or 5 */ if (LQT_StartsWord(db, ch2) || isdigit(ch2)) { /* case 4 */ putc(LQT_CHAR_TO_SKIP, OutputFile); (void) OutputChar(ch2, PUTMODE_IGNORE, OutputFile); } else { (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); (void) OutputChar(ch2, PUTMODE_IGNORE, OutputFile); } (void) ungetc(ch3, inputFile); return 0; case 's': /* \s[+-][expr], \s[+-](NN, \sNN, \s[+-]N */ /* \s -- set size * N is 0 or [123][0-9] or [4-9] * NN is >= 40 * */ putc('\\', OutputFile); ch1 = getc(inputFile); if (ch1 == EOF) { return EOF; } if (ch1 == '+' || ch1 == '-') { /* \s+9 or \s+(32 */ putc(LQT_CHAR_TO_IGNORE, OutputFile); /* the "s" */ putc(ch1, OutputFile); /* the + or - */ /* so we have handled the \ s and + now */ ch = ch1; /* save for error message below */ if ((ch1 = getc(inputFile)) == EOF) { return EOF; } if (ch1 == OPEN_PAREN || ch1 == '[') { (void) ungetc(ch1, inputFile); return doThingWithName(db, inputFile, fileName, OutputFile ); } if (!isdigit(ch1)) { if (ch1 == '\\') { /* e.g. \s+\nx */ (void) ungetc(ch1, inputFile); return 0; } Error(E_WARN, "found \\s%c%c unexpectedly!", ch, ch1 /* why we needed to save ch */ ); /* but continue */ } /* cases: * 1: \s+6w -> \i+ w * 2: \s+6x -> \i+ix */ ch2 = getc(inputFile); if (ch2 == EOF) { return EOF; } if (LQT_StartsWord(db, ch2) || isdigit(ch2)) { /* 1: \s+6w -> \i+ w */ putc(LQT_CHAR_TO_SKIP, OutputFile); } else { /* 2: \s+6x -> \i+ix */ (void) OutputChar(ch1, PUTMODE_IGNORE, OutputFile); } (void) ungetc(ch2, inputFile); return 0; } if (ch1 == OPEN_PAREN || ch1 == '[') { /* put out the 's' and handle the name */ (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); (void) ungetc(ch1, inputFile); return doThingWithName(db, inputFile, fileName, OutputFile); } /* OK, so it's \s36 or \s9 or \s0 * It could also be \s\*x, but we cannot handle that, * and it's extremely rare */ /* cases: * 1: \s0W --> \ W (where 0 can be 0 5 6 7 or 9) * 2: \s0X --> \i0X * 3: \s12W --> \ W * 4: \s12X --> \i12X */ ch2 = getc(inputFile); if (ch2 == EOF) { return EOF; } if (ch1 == '0' || (ch1 >= '5' && ch1 <= '9')) { /* case 1 or 2 */ if (LQT_StartsWord(db, ch2) || isdigit(ch2) ) { /* case 1 */ if (ch) { /* the s */ /* note: \s+3x -> \i+ x; the + was handled above * and the s already ignored correctly */ putc(LQT_CHAR_TO_SKIP, OutputFile); } putc(LQT_CHAR_TO_SKIP, OutputFile); /* the digit */ } else { /* 2: \s0X --> \i0X */ (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); (void) OutputChar(ch1, PUTMODE_IGNORE, OutputFile); } (void) ungetc(ch2, inputFile); return 0; } if (!isdigit(ch2)) { Error(E_WARN, "found %c after \\s%c unexpectedly!", ch2, ch1 ); /* might be \s\*x */ (void) ungetc(ch2, inputFile); return 0; } ch3 = getc(inputFile); if (ch3 == EOF) { return EOF; } if (LQT_StartsWord(db, ch3) || isdigit(ch3)) { /* 3: \s12W --> \ W */ if (ch) { /* the s */ /* note: \s+3x -> \i+ x; the + was handled above * and the s already ignored correctly */ putc(LQT_CHAR_TO_SKIP, OutputFile); } putc(LQT_CHAR_TO_SKIP, OutputFile); /* digit */ putc(LQT_CHAR_TO_SKIP, OutputFile); /* digit */ } else { /* 3: \s12X --> \i12X */ (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); /* s */ (void) OutputChar(ch1, PUTMODE_IGNORE, OutputFile); (void) OutputChar(ch2, PUTMODE_IGNORE, OutputFile); } (void) ungetc(ch3, inputFile); return 0; } } else { if (ch) { return ch; } else { /* 0 return, so read more */ return LQFpReadOneCharacter(db, inputFile, fileName, OutputFile); } } } /*NOTREACHED*/ if (ch == EOF) { return EOF; } else { return ch; } }