/* TroffFilter.c -- Copyright 1994, 1996 Liam R. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 */

/* $Id: TroffFilter.c,v 1.8 2001/05/31 03:49:42 liam Exp $
 */

/* Filter for nroff, troff, groff, sqtroff files.
 * See FilterMain and wordrules.h for more info.
 *
 * This might be better done by running nroff, at the expense of
 * a huge hit on performance.  We would then have to filter out
 * hyphenation and page-breaks, though, which is even harder.
 *
 * The main things not done so far are:
 * filter comments
 * delete macro definitions
 *
 * The biggest TODO is to move words around.
 * I have more or less fixed this, see (4) below, but I'm
 * not really happy with the result yet.
 *
 * Let }{ mark a block boundary...
 * Suppose the input looks like
 * "An entry for \fIusername is..."
 * with a block boundary after the \f, thus:
 * "An entry for \f}{Iusername is..."
 *
 * Now, we currently turn this into
 * "An entry for \  username is..."
 * 
 * When we index it, "username" is word 0 of the 2nd block.
 * But when we fetch a match, we'll see the unfiltered data,
 * and try to count words.  The fIusername will look like a word,
 * so we will end up thinking it's the last word in the 1st block.
 * Hence we will highlight "is" when we fetch the match.
 *
 * Ways round this might include:
 * (1) running the filter when we retrieve matches
 *     The difficulty here is that the filter would be passed a
 *     tiny snippet of the input file and might not cope.
 *
 * (2) running the filter on the entire file to retrieve matches.
 *     For large files, this will be a major performance problem.
 *     Also, the filtered output isn't intended for displaying, so
 *     we would need to track both of them.  A single 45 MByte file
 *     might thus need 90MBytes of memory and/or disk space.  Oops.
 *
 * (3) moving the word backwards
 *     generate "An entry for \username   is..." instead, with the
 *     spaces moved after the word, but the \ retained so that the
 *     SAW_PUNCT_BEFORE flag was right.
 *
 * (4) a SUB_ONE flag
 *     put a marker there that means, "index this word one char ahead
 *     of where it really is".  That's probably a performance hit in
 *     lqaddfile, unfortunately.  The flags would add, so we'd generate
 *     "An entry for \FFusername is...".  Well, maybe this is better
 *     than option 3, I'm not sure.
 *     This is what I have done, LQT_CHAR_TO_SKIP.
 *
 * (5) run nroff on the input instead of this filter.
 *     This would be a performance *nightmare*, unless I made a version
 *     of nroff that was really fast, but that's a lot of work.
 *     We'd then have the problem of dealing with page headers, footers,
 *     hyphenation, and tables.  Ugh.
 * 
 */

#ifdef SYSV
 extern int _filbuf(), _flsbuf(); /* for lint! */
#endif

#include "globals.h"
#include "error.h"

#include <stdio.h>
#include <ctype.h>
#include <sys/types.h> /* for liblqutil */

#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif

#ifdef HAVE_STDLIB_H
# include <stdlib.h>
#else
# include <malloc.h>
#endif

#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif

#include "wordrules.h"
#include "emalloc.h"
#include "lqutil.h"
#include "liblqtext.h"
#include "filter.h"

/** C Library functions that need to be declared: **/
/** Functions in this file that need to be declared **/

/* Output modes taken by OutputChar() and doDelim():
 * PUTMODE_PRINT  -- write every character through unchanged;
 * PUTMODE_IGNORE -- write LQT_CHAR_TO_IGNORE in place of each
 *                   alphanumeric, other characters unchanged.
 */
#define PUTMODE_IGNORE	1
#define PUTMODE_PRINT	0

/* Forward declaration: LQF_Troff_Copy() and doDelim() call this function
 * before its definition further down the file.
 * The parameter list is only supplied when HAVE_PROTO is defined;
 * otherwise this is an old-style declaration without argument checking.
 */
PRIVATE int LQFpReadOneCharacter(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    FILE *inputFile,
    char *fileName,
    FILE *OutputFile
#endif
);

/** **/

/* Reset to 0 at the start of every LQF_Troff_Copy() call.
 * NOTE(review): no other code visible in this file reads or writes it;
 * it looks like a leftover -- confirm before removing.
 */
PRIVATE int InWord = 0;

/* LQF_Troff_Copy -- filter a troff source file for indexing.
 *
 * Repeatedly asks LQFpReadOneCharacter() for the next character and
 * writes it to OutputFile.  A zero result means the reader has already
 * written whatever was needed itself, so nothing is written here.
 * The loop ends when the reader reports EOF.
 *
 * db, Name -- passed through to the reader unchanged
 * InputFile -- stream holding the troff source
 * OutputFile -- stream receiving the filtered text
 *
 * Always returns 0.
 */
LIBRARY int
LQF_Troff_Copy(db, InputFile, Name, OutputFile)
    t_LQTEXT_Database *db;
    FILE *InputFile;
    char *Name;
    FILE *OutputFile;
{
    int filtered;

    InWord = 0;

    for (;;) {
	filtered = LQFpReadOneCharacter(db, InputFile, Name, OutputFile);

	if (filtered == EOF) {
	    break;
	}
	if (filtered == 0) {
	    /* the reader produced its own output for this step */
	    continue;
	}
	putc(filtered, OutputFile);
    }

    return 0; /* TODO: error return */
}

/* The '(' that introduces a two-character troff name, as in \*(xx.
 * NOTE(review): presumably named so that an unbalanced literal
 * parenthesis does not appear in the code below -- confirm.
 */
#define OPEN_PAREN '('

/* OutputChar -- write one character to OutputFile according to Mode.
 *
 * A zero ch is silently dropped.
 * In PUTMODE_PRINT the character is written as it stands.
 * In any other mode an alphanumeric character is replaced by
 * LQT_CHAR_TO_IGNORE; everything else is still written unchanged.
 * Exactly one byte is written for every non-zero ch.
 */
PRIVATE void
OutputChar(ch, Mode, OutputFile)
    int ch;
    int Mode;
    FILE *OutputFile;
{
    int toWrite;

    if (ch == 0) {
	return;
    }

    toWrite = ch;
    if (Mode != PUTMODE_PRINT && isalnum(ch)) {
	toWrite = LQT_CHAR_TO_IGNORE;
    }

    putc(toWrite, OutputFile);
}

/* doDelim -- copy a delimited escape argument, e.g. 'value' or [value].
 *
 * The opening delimiter (already read by the caller) is written first.
 * Characters are then fetched with LQFpReadOneCharacter(), so escapes
 * nested inside the argument are processed too, and written with
 * OutputChar() in the given Mode until the closing delimiter turns up.
 * The closing delimiter is the same character as the opening one,
 * except that '[' is closed by ']'.
 *
 * Returns 0 after writing the closing delimiter, '\n' (not written) if
 * the line ends before the argument is closed, or EOF at end of input.
 */
PRIVATE int
doDelim(db, inputFile, Delim, Mode, fileName, OutputFile)
    t_LQTEXT_Database *db;
    FILE *inputFile;
    int Delim;
    int Mode;
    char *fileName;
    FILE *OutputFile;
{
    int closer = (Delim == '[') ? ']' : Delim;
    int next;

    putc(Delim, OutputFile); /* the opening delimiter */

    for (;;) {
	next = LQFpReadOneCharacter(db, inputFile, fileName, OutputFile);

	if (next == EOF) {
	    return EOF;
	}
	if (next == closer) {
	    putc(closer, OutputFile);
	    return 0;
	}
	if (next == '\n') {
	    /* unterminated argument: hand the newline back to the caller */
	    return next;
	}
	if (next != 0) {
	    OutputChar(next, Mode, OutputFile);
	}
    }
}

/* doThingWithDelim -- handle an escape that takes a delimited argument,
 * such as \h'value' or \h[value].
 *
 * ch is the escape letter; it is written unchanged.  The next input
 * character is taken as the delimiter and the argument is copied by
 * doDelim(): in PUTMODE_IGNORE when the delimiter is an open
 * parenthesis, in PUTMODE_PRINT for any other delimiter.
 *
 * Returns EOF if the input ends right after the escape letter,
 * otherwise whatever doDelim() returns.
 */
PRIVATE int
doThingWithDelim(db, ch, inputFile, fileName, OutputFile)
    t_LQTEXT_Database *db;
    int ch;
    FILE *inputFile;
    char *fileName;
    FILE *OutputFile;
{
    int delim;
    int mode;

    putc(ch, OutputFile);

    delim = getc(inputFile);
    if (delim == EOF) {
	return EOF;
    }

    mode = (delim == OPEN_PAREN) ? PUTMODE_IGNORE : PUTMODE_PRINT;

    return doDelim(db, inputFile, delim, mode, fileName, OutputFile);
}

/* doThingWithName -- handle the name part of an escape such as
 * \*(xx or \*[xxxx]; the caller has already dealt with the \ and
 * the escape letter and has left the ( or [ in the input.
 *
 * [name] is handed to doDelim() in PUTMODE_IGNORE.
 * Any opener other than ( or [ is replaced by a single space.
 *
 * For (xx the open parenthesis is written, then the two name
 * characters are each replaced by one output byte, chosen by looking
 * at the character that follows the name (which is pushed back).
 * With w a character that starts a word or is a digit, x any other
 * character, and W a following character that starts a word or is
 * a digit:
 *   (wwW  -> both name characters become LQT_CHAR_TO_SKIP
 *   (xwW  -> x in ignore mode, then LQT_CHAR_TO_SKIP
 *   otherwise both name characters are written in ignore mode
 *
 * Returns EOF if the input ends before the name and its follower have
 * been read; otherwise 0, or the result of doDelim() for [name].
 */
PRIVATE int
doThingWithName(db, inputFile, fileName, OutputFile)
    t_LQTEXT_Database *db;
    FILE *inputFile;
    char *fileName;
    FILE *OutputFile;
{
    int opener, first, second, follow;

    opener = getc(inputFile);
    if (opener == EOF) {
	return EOF;
    }

    if (opener == '[') {
	return doDelim(
	    db, inputFile, opener, PUTMODE_IGNORE, fileName, OutputFile
	);
    }

    if (opener != OPEN_PAREN) {
	/*CANTHAPPEN*/
	putc(' ', OutputFile);
	return 0;
    }

    putc(OPEN_PAREN, OutputFile);

    /* the two characters of the name, then the one after the name */
    if ((first = getc(inputFile)) == EOF) {
	return EOF;
    }
    if ((second = getc(inputFile)) == EOF) {
	return EOF;
    }
    if ((follow = getc(inputFile)) == EOF) {
	return EOF;
    }

    if ((LQT_StartsWord(db, follow) || isdigit(follow)) &&
	(LQT_StartsWord(db, second) || isdigit(second))
    ) {
	/* the name runs straight into a following word */
	if (LQT_StartsWord(db, first) || isdigit(first)) {
	    putc(LQT_CHAR_TO_SKIP, OutputFile);
	} else {
	    OutputChar(first, PUTMODE_IGNORE, OutputFile);
	}
	putc(LQT_CHAR_TO_SKIP, OutputFile);
    } else {
	/* no word follows, or the name does not end in a word character */
	OutputChar(first, PUTMODE_IGNORE, OutputFile);
	OutputChar(second, PUTMODE_IGNORE, OutputFile);
    }

    (void) ungetc(follow, inputFile);
    return 0;
}

/* LQFpReadOneCharacter -- read from inputFile up to the next character
 * the caller should write, dealing with troff escapes on the way.
 *
 * Ordinary characters are returned for the caller to write.
 * NUL bytes in the input are dropped.
 * When a backslash is seen, the escape is processed here and its
 * replacement is written straight to OutputFile.  Each input byte
 * consumed by an escape is replaced by exactly one output byte (the
 * original character, LQT_CHAR_TO_IGNORE or LQT_CHAR_TO_SKIP), so that
 * byte offsets in the filtered text agree with the unfiltered file.
 * The character following an escape is pushed back with ungetc()
 * after it has been inspected.
 *
 * db -- database handle, used for the LQT_StartsWord/LQT_OnlyWithinWord
 *       tests
 * fileName -- only passed on to the helper functions
 *
 * Returns:
 *   EOF    at end of input (an escape cut short by EOF is not completed);
 *   0      when an escape was handled and there is nothing for the
 *          caller to write -- the caller should simply call again;
 *   other  a character to be written by the caller (this includes a
 *          '\n' handed back by doDelim() for an unterminated argument).
 */
PRIVATE int
LQFpReadOneCharacter(db, inputFile, fileName, OutputFile)
    t_LQTEXT_Database *db;
    FILE *inputFile;
    char *fileName;
    FILE *OutputFile;
{
    int ch, ch1, ch2, ch3;

    while ((ch = getc(inputFile)) != EOF) {

	if (ch == 0) {
	    /* A zero return means "nothing to write", so a NUL byte
	     * cannot be returned; read on.  This loops rather than
	     * recursing so that a long run of NULs cannot exhaust the
	     * stack.
	     */
	    continue;
	}

	if (ch != '\\') {
	    return ch;
	}

	switch ((ch = getc(inputFile))) {
	    case EOF:
		return EOF;
	    case '\\':
		putc('\\', OutputFile);
		putc('\\', OutputFile);
		break;
	    case '\n':
		/* \ at the end of the line joins the lines together */
		putc('\\', OutputFile);
		putc(ch, OutputFile);
		break;

		/* TODO: join the lines together and adjust the characters
		 * so that we don't move the start of any words.
		 * That's a little tricky.
		 */
	    case '[':
		putc('\\', OutputFile);
		return doDelim(db, inputFile, ch, PUTMODE_IGNORE, fileName,OutputFile);
	    case OPEN_PAREN:
		/* \(xx: let doThingWithName see the parenthesis itself */
		putc('\\', OutputFile);
		(void) ungetc(OPEN_PAREN, inputFile);
		doThingWithName(db, inputFile, fileName, OutputFile);
		break;
	    case '_':
		putc('\\', OutputFile);
		OutputChar(ch, PUTMODE_IGNORE, OutputFile);
		return 0;
	    case '^': case '`': case '{':  case '|': case '}': case ' ':
	    case '+': case '~': case '#': /* sqtroff and UCB ditroff only */
		putc('\\', OutputFile); putc(ch, OutputFile);
		break;
	    /* unknown escape characters: */
	    case 'C': case 'E': case 'F': case 'G':
	    case 'i': case 'j': case 'J':
	    case 'I': /* Immediate evaluation */
	    case 'K': case 'm': case 'M': case 'N':
	    case 'O': case 'P': case 'q': case 'R':
	    case 'U': case 'V': case 'W': case 'y': case 'Y':
	    default:
		putc('\\', OutputFile); /* TODO swallow the \ */
		putc(ch, OutputFile);
		break;

	    /* self-contained escapes of the form "\c" */
	    case 'a': case 'A': case 'c': case 'd': case 'e': case 'p':
	    case 'r': case 't': case 'u': case 'z': case '0':
		putc('\\', OutputFile);
		{
		    ch2 = getc(inputFile);

		    if (ch2 == EOF) {
			/* drop it */
			return EOF;
		    }

		    if (isalnum(ch2)) {
			/* the escape letter runs into the next word */
			putc(LQT_CHAR_TO_SKIP, OutputFile);
			putc(ch2, OutputFile);
			return 0;
		    }

		    if (isspace(ch2)) {
			(void) OutputChar(ch, PUTMODE_IGNORE, OutputFile);
			putc(ch2, OutputFile);
			return 0;
		    }

		    putc(LQT_CHAR_TO_IGNORE, OutputFile);
		    ungetc(ch2, inputFile);
		}
		return 0;

	    /* escapes with an argument, \c'value' or \c[value] */
	    case 'B': case 'b': case 'D':
	    case 'H': case 'h':
	    case 'l': case 'L': case 'o':
	    case 'S': /* slant */
	    case 'T': /* NOTE(review): meaning of \T not known here */
	    case 'v': case 'w': case 'x': case 'X':
		putc('\\', OutputFile);

		return doThingWithDelim(
		    db, ch, inputFile, fileName, OutputFile
		);

	    /* escapes with a name, \cx or \c[xxxx] or \c(xx */
	    case '*':
	    case 'Q': /* \Q is for sqtroff only, reads a qonfig variable */
	    case 'f': case 'g': case 'k':
		putc('\\', OutputFile);

		if ((ch1 = getc(inputFile)) == EOF) return EOF;

		if (ch1 == OPEN_PAREN || ch1 == '[') {
		    (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); /* f */
		    (void) ungetc(ch1, inputFile);
		    return doThingWithName(db, inputFile, fileName, OutputFile);
		} else {
		    /* e.g. \fR */

		    if ((ch2 = getc(inputFile)) == EOF) return EOF;

		    /* turn "\fR\\$1" into "\xx"
		     * where x is char_to_ignore,
		     * so that word counting is preserved,
		     * but
		     * \fRboy
		     * turns into
		     * "\  boy"
		     * since the "boy" will be counted as a word
		     */
		    if (LQT_StartsWord(db, ch2) ||
			isdigit(ch2)
		    ){
			/* \fRboy */
			if (LQT_StartsWord(db, ch1) ||
			    LQT_OnlyWithinWord(db, ch1)
			) {
			    putc(LQT_CHAR_TO_SKIP, OutputFile); /* f */
			    putc(LQT_CHAR_TO_SKIP, OutputFile); /* font name */
			} else {
			    (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile);
			    (void) OutputChar(ch1, PUTMODE_IGNORE, OutputFile);
			}
		    } else {
			/* \fR... */
			(void) OutputChar(ch, PUTMODE_IGNORE, OutputFile);
			(void) OutputChar(ch1, PUTMODE_IGNORE, OutputFile);
		    }
		    (void) ungetc(ch2, inputFile);
		    return 0;
		}

	    /* special cases: */
	    case 'n': /* number register, \nx */

		putc('\\', OutputFile);

		if ((ch2 = getc(inputFile)) == EOF) return EOF;

		if (ch2 == '+' || ch2 == '-') {
		    /* \n+x or \n-x: write the n and the sign now, and
		     * zero ch so that the n is not written a second time
		     * below (OutputChar ignores a zero character).
		     */
		    (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile);
		    ch = 0;
		    putc(ch2, OutputFile);
		    if ((ch2 = getc(inputFile)) == EOF) return EOF;
		}

		/* cases:
		 * 1 \n(xx -> handle name
		 * 2 \nwX --> \iiX
		 * 3 \nxX --> \ixX
		 * 4 \nwW --> \  W
		 * 5 \nxW --> \ixW
		 *
		 */

		if (ch2 == OPEN_PAREN || ch2 == '[') {
		    /* case 1: put out the 'n' and handle the name */
		    (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile);
		    (void) ungetc(ch2, inputFile);
		    return doThingWithName(db, inputFile, fileName, OutputFile);
		}

		/* read the character after the sequence */
		if ((ch3 = getc(inputFile)) == EOF) return EOF;

		if (!LQT_StartsWord(db, ch3) && !isdigit(ch3)) {
		    /* case 2 or 3 */
		    (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile);
		    (void) OutputChar(ch2, PUTMODE_IGNORE, OutputFile);
		    (void) ungetc(ch3, inputFile);
		    return 0;
		}

		/* case 4 or 5 */

		if (LQT_StartsWord(db, ch2) || isdigit(ch2)) {
		    /* case 4 */
		    if (ch) {
			/* only when the n was not already written with
			 * a sign above; otherwise the output would be
			 * one byte longer than the input
			 */
			putc(LQT_CHAR_TO_SKIP, OutputFile);
		    }
		    /* NOTE(review): the case table above shows the
		     * register name as a skip character too, but it is
		     * written in ignore mode here -- confirm intent.
		     */
		    (void) OutputChar(ch2, PUTMODE_IGNORE, OutputFile);
		} else {
		    (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile);
		    (void) OutputChar(ch2, PUTMODE_IGNORE, OutputFile);
		}
		(void) ungetc(ch3, inputFile);
		return 0;

	    case 's': /* \s[+-][expr], \s[+-](NN, \sNN, \s[+-]N */
		/* \s -- set size
		 * N is 0 or [123][0-9] or [4-9]
		 * NN is >= 40
		 *
		 */
		putc('\\', OutputFile);

		ch1 = getc(inputFile);
		if (ch1 == EOF) {
		    return EOF;
		}

		if (ch1 == '+' || ch1 == '-') {
		    /* \s+9 or \s+(32 */
		    putc(LQT_CHAR_TO_IGNORE, OutputFile); /* the "s" */
		    putc(ch1, OutputFile); /* the + or - */

		    /* so we have handled the \ s and + now */

		    ch = ch1; /* save for error message below */

		    if ((ch1 = getc(inputFile)) == EOF) {
			return EOF;
		    }

		    if (ch1 == OPEN_PAREN || ch1 == '[') {
			(void) ungetc(ch1, inputFile);
			return doThingWithName(db,
			    inputFile, fileName, OutputFile
			);
		    }

		    if (!isdigit(ch1)) {
			if (ch1 == '\\') {
			    /* e.g. \s+\nx */
			    (void) ungetc(ch1, inputFile);
			    return 0;
			}
			Error(E_WARN,
			    "found \\s%c%c unexpectedly!",
			    ch, ch1 /* why we needed to save ch */
			);
			/* but continue */
		    }

		    /* cases:
		     * 1: \s+6w -> \i+ w
		     * 2: \s+6x -> \i+ix
		     */

		    ch2 = getc(inputFile);
		    if (ch2 == EOF) {
			return EOF;
		    }
		    if (LQT_StartsWord(db, ch2) || isdigit(ch2)) {
			/* 1: \s+6w -> \i+ w */
			putc(LQT_CHAR_TO_SKIP, OutputFile);
		    } else { /* 2: \s+6x -> \i+ix */
			(void) OutputChar(ch1, PUTMODE_IGNORE, OutputFile);
		    }
		    (void) ungetc(ch2, inputFile);
		    return 0;
		}

		/* From here on there was no sign, so ch is still 's'. */

		if (ch1 == OPEN_PAREN || ch1 == '[') {
		    /* put out the 's' and handle the name */
		    (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile);
		    (void) ungetc(ch1, inputFile);
		    return doThingWithName(db, inputFile, fileName, OutputFile);
		}

		/* OK, so it's \s36 or \s9 or \s0
		 * It could also be \s\*x, but we cannot handle that,
		 * and it's extremely rare
		 */

		/* cases:
		 * 1: \s0W --> \  W (where 0 can be 0 5 6 7 or 9)
		 * 2: \s0X --> \i0X
		 * 3: \s12W --> \   W
		 * 4: \s12X --> \i12X
		 */

		ch2 = getc(inputFile);
		if (ch2 == EOF) {
		    return EOF;
		}

		if (ch1 == '0' || (ch1 >= '5' && ch1 <= '9')) {
		    /* case 1 or 2 */
		    if (LQT_StartsWord(db, ch2) ||
			isdigit(ch2)
		    ) {
			/* case 1 */
			putc(LQT_CHAR_TO_SKIP, OutputFile); /* the s */
			putc(LQT_CHAR_TO_SKIP, OutputFile); /* the digit */
		    } else {
			/* 2: \s0X --> \i0X */
			(void) OutputChar(ch, PUTMODE_IGNORE, OutputFile);
			(void) OutputChar(ch1, PUTMODE_IGNORE, OutputFile);
		    }
		    (void) ungetc(ch2, inputFile);
		    return 0;
		}

		if (!isdigit(ch2)) {
		    Error(E_WARN,
			"found %c after \\s%c unexpectedly!",
			ch2, ch1
		    );
		    /* might be \s\*x
		     * The s and the character after it have been consumed,
		     * so each must still be replaced by one output byte,
		     * or every later offset in the file would be out by two.
		     */
		    (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile);
		    (void) OutputChar(ch1, PUTMODE_IGNORE, OutputFile);
		    (void) ungetc(ch2, inputFile);
		    return 0;
		}


		ch3 = getc(inputFile);
		if (ch3 == EOF) {
		    return EOF;
		}

		if (LQT_StartsWord(db, ch3) || isdigit(ch3)) {
		    /* 3: \s12W --> \   W */
		    putc(LQT_CHAR_TO_SKIP, OutputFile); /* the s */
		    putc(LQT_CHAR_TO_SKIP, OutputFile); /* digit */
		    putc(LQT_CHAR_TO_SKIP, OutputFile); /* digit */
		} else { /* 4: \s12X --> \i12X */
		    (void) OutputChar(ch, PUTMODE_IGNORE, OutputFile); /* s */
		    (void) OutputChar(ch1, PUTMODE_IGNORE, OutputFile);
		    (void) OutputChar(ch2, PUTMODE_IGNORE, OutputFile);
		}
		(void) ungetc(ch3, inputFile);
		return 0;
	}
	/* an escape that ended with "break" wrote its own output;
	 * go round again for the next input character
	 */
    }

    /* getc() returned EOF */
    return EOF;
}