/* SGMLFilter.c -- Copyright 1989, 1993, 1994 Liam R. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 *
 * $Id: SGMLFilter.c,v 1.16 2001/05/31 03:48:14 liam Exp $
 *
 * Filter for ISO 8879 (SGML) files
 *
 */

#include "error.h"

#include <stdio.h>
#include <sys/types.h>
#include "globals.h"

#include <ctype.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif

#ifdef HAVE_STDLIB_H
# include <stdlib.h>
#else
# include <malloc.h>
#endif

#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif

#include "wordrules.h"
#include "chartype.h"
#include "emalloc.h"
#include "lqutil.h"
#include "liblqtext.h"
#include "filter.h"

#ifndef START_ONLY_ONCE
/* START_ONLY_ONCE ... END_ONLY_ONCE bracket a run of statements so that
 * they are executed only the first time control reaches them: the opening
 * macro declares a block-local static flag (_x), tests it, and sets it;
 * the closing macro supplies the two matching close braces.
 * The pair is defined here only if START_ONLY_ONCE is not already defined.
 */
# define START_ONLY_ONCE { static char _x = 0; if (!_x) { _x = 1;
# define END_ONLY_ONCE } }
#endif

/** Functions in this file that need to be declared **/
/* Forward declaration of GetChar().  The parameter list is only supplied
 * when HAVE_PROTO is defined; otherwise this is an old-style declaration
 * with an empty parameter list.
 */
INLINE PRIVATE int GetChar(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    FILE *fd
#endif
);

/** **/

/* File-scope filter state, reset by Init(). */

static char InHeader = 0;
    /* InHeader is set if the file type is HTML and 
     * we have not seen BODY yet;
     * we don't index the head, by default.
     * LQF_SGML_Copy() sets it from db->IgnoreHTMLhead and clears it at
     * the first tag whose name starts with 'B' or 'b'.
     */
static int LastChar = 0;
    /* One-character pending slot shared by GetChar() and UnGetChar():
     * when non-zero it is returned by the next GetChar() call instead of
     * reading the stream.  Zero means "empty".
     */
static int InWord = 0;
    /* Non-zero while the character most recently returned by GetChar()
     * is considered part of a word (see LQT_STARTS_WORD and
     * LQT_WITHIN_OR_ENDS_WORD in GetChar()).
     */
static int LastInWord = 0;
    /* The InWord value that GetChar() installs when it hands out the
     * pending character held in LastChar.
     */

/* Init -- return the filter's file-scope state to its start-of-file
 * values: no pending character, not inside a word, not inside the header.
 * Called by LQF_SGML_Copy() before it reads any input.
 */
PRIVATE void
Init()
{
    LastInWord = InWord = 0;	/* word-tracking flags */
    LastChar = 0;		/* empty pending-character slot */
    InHeader = 0;
}

/* IgnoreChar -- write exactly one character to OutputFile in place of ch,
 * chosen so that ch does not get indexed.
 *
 *   outside a word:  white space and punctuation are copied unchanged,
 *                    digits become LQT_DIGIT_TO_IGNORE,
 *                    anything else becomes a space;
 *   inside a word:   digits become LQT_DIGIT_TO_IGNORE,
 *                    anything else becomes LQT_CHAR_TO_IGNORE.
 *
 * "Inside a word" is the file-scope InWord flag maintained by GetChar().
 * The result of putc() is not checked.
 */
INLINE static void
IgnoreChar(db, ch, OutputFile)
    t_LQTEXT_Database *db;
    int ch;
    FILE *OutputFile;
{
    int Replacement;

    if (InWord) {
	Replacement = LQT_ISDIGIT(db, ch) ?
			LQT_DIGIT_TO_IGNORE : LQT_CHAR_TO_IGNORE;
    } else if (isspace(ch) || LQT_ISPUNCT(db, ch)) {
	/* harmless between words, so keep it as it is */
	Replacement = ch;
    } else if (LQT_ISDIGIT(db, ch)) {
	Replacement = LQT_DIGIT_TO_IGNORE;
    } else {
	Replacement = ' ';
    }

    putc(Replacement, OutputFile);
}

/* GetChar -- return the next input character from fd, or EOF, keeping the
 * file-scope word-tracking state (InWord, LastInWord, LastChar) up to date.
 *
 * If a character is pending in LastChar (left by a quote lookahead below,
 * or by UnGetChar()), it is returned without touching the stream and
 * InWord is set from LastInWord.
 *
 * Otherwise one character is read.  InWord is switched on when the
 * character starts a word (LQT_STARTS_WORD) and switched off when a word
 * was in progress and the character cannot continue or end it
 * (LQT_WITHIN_OR_ENDS_WORD).
 *
 * A single quote is always returned to the caller, but the following
 * character is read ahead first and left pending in LastChar so that the
 * word state can be adjusted.  The original author's examples:
 * can't --- OK
 * ...hello' he said --- rejected
 * '' --- rejected
 * 30's --- OK
 * 30'66" -- rejected
 *
 * When getc() reports EOF, LastChar is left holding EOF (non-zero), so the
 * next call returns EOF once more through the pending-character path.
 *
 * NOTE(review): zero marks the pending slot as empty, so a NUL byte read
 * as the lookahead after a quote would be dropped -- confirm whether NUL
 * bytes can occur in the input.
 * NOTE(review): the lookahead may be EOF when it is passed to the
 * LQT_* macros -- confirm that they accept EOF.
 */
INLINE PRIVATE int
GetChar(db, fd)
    t_LQTEXT_Database *db;
    FILE *fd;
{
    int Current;

    /* Hand out a pending character first, if there is one. */
    if (LastChar != 0) {
	Current = LastChar;
	LastChar = 0;
	InWord = LastInWord;
	return Current;
    }

    LastChar = getc(fd);
    if (LastChar == EOF) {
	return EOF;
    }

    /* Remember the state before this character, then update it. */
    LastInWord = InWord;
    if (InWord) {
	if (!LQT_WITHIN_OR_ENDS_WORD(db, LastChar)) {
	    InWord = 0;
	}
    } else {
	InWord = LQT_STARTS_WORD(db, LastChar);
    }

    if (LastChar != '\'') {
	/* ordinary character: nothing is left pending */
	Current = LastChar;
	LastChar = 0;
	return Current;
    }

    /* A quote: peek at what follows and leave it pending in LastChar. */
    LastChar = getc(fd);
    if (InWord && !LQT_WITHIN_OR_ENDS_WORD(db, LastChar)) {
	/* trailing ' sign */
	InWord = 0;
	LastInWord = 0;
    } else {
	LastInWord = LQT_STARTS_WORD(db, LastChar);
    }
    return '\'';
}

/* UnGetChar -- push c back so that the next GetChar() call returns it.
 *
 * fd is the stream c was read from (the one being passed to GetChar()).
 * c is stored in the one-character pending slot LastChar, and the current
 * InWord value is saved in LastInWord so that GetChar() re-installs it
 * when it hands c out again.
 *
 * If the slot is already occupied, its previous contents are returned to
 * fd with ungetc() so that they are read again after c.  The result of
 * ungetc() is not checked.
 */
INLINE PRIVATE void
UnGetChar(fd, c)
    FILE *fd;
    int c;
{
    if (LastChar) {
	/* Give the displaced character back to the stream it came from;
	 * this used to push it onto stdin, which is wrong whenever the
	 * input is any other stream.
	 */
	(void) ungetc(LastChar, fd);
    }
    LastChar = c;
    LastInWord = InWord;
}

/* issgmldelim(ch) -- non-zero if ch ends an entity reference: white space,
 * ';' or '<'.  Each use of the argument is parenthesised so that an
 * expression argument groups correctly.  The argument is still evaluated
 * more than once, so it must not have side effects.
 */
#define issgmldelim(ch) (isspace(ch) || (ch) == ';' || (ch) == '<')

/* LQF_SGML_Copy -- copy InputFile to OutputFile, masking SGML markup so
 * that only document text is left for indexing.
 *
 * Input is read one character at a time with GetChar().  Characters that
 * belong to markup (tag contents, quoted attribute strings, entity
 * references) are written through IgnoreChar(), which substitutes a
 * placeholder, instead of being copied.  Text outside tags is copied
 * unchanged, unless InHeader is set, in which case it is also passed
 * through IgnoreChar().
 *
 * db          database handle; db->IgnoreHTMLhead selects whether text is
 *             ignored until the first tag starting with 'B' or 'b'.
 *             Also passed to GetChar() and IgnoreChar().
 * InputFile   stream to read.
 * Name        used only in warning messages.
 * OutputFile  stream to write; write errors are not detected.
 *
 * Returns 0 on every path, including after a warning has been issued.
 * Warnings are reported with Error(E_WARN, ...).
 */
LIBRARY int
LQF_SGML_Copy(db, InputFile, Name, OutputFile)
    t_LQTEXT_Database *db;
    FILE *InputFile;
    char *Name;
    FILE *OutputFile;
{
    /* WithinATag is a nesting depth: 1 after '<', raised again by a
     * nested '<' inside a <! ... > declaration, lowered by each '>'.
     */
    int WithinATag = 0;
    /* WithinAString/QuoteChar: inside a quoted string within a tag, and
     * the quote character that will close it.
     */
    int WithinAString = 0;
    /* WithinSpecial: the current tag started with "<!" */
    int WithinSpecial = 0;
    char QuoteChar = 0;
    /* Warned: set once a malformed-markup warning has been given, so
     * that it is not repeated for the same file.
     */
    char Warned = 0;
    int ch;

    Init();
    if (db->IgnoreHTMLhead) {
	InHeader = 1;
    } else {
	InHeader = 0;
    }

    /* BUG: we should check putc()'s return value to see if 
     * the output file system has filled up.
     *
     * But I want to rewrite this so it doesn't need to copy
     * the file anyway.
     */
    while ((ch = GetChar(db, InputFile)) != EOF) {
	/* Start of a tag: copy the '<' itself, then look at the
	 * character that follows it.  Every path through this block
	 * either returns or continues the loop.
	 */
	if (ch == '<' && !WithinATag) {
	    WithinATag = 1;
	    putc('<', OutputFile);
	    switch (ch = GetChar(db, InputFile)) {
	    case EOF:
		/* NOTE(review): this flushes stdout, not OutputFile --
		 * presumably to keep output ahead of the warning; confirm.
		 */
		fflush(stdout);
		Error(E_WARN, "%s: End of file within a tag", Name);
		return 0;
	    case '!': /* <! ..... > */
		WithinSpecial = 1;
		IgnoreChar(db, ch, OutputFile);
		continue;
	    case '\n':
	    case ' ':
	    case '\t':
	    case '\r':
		/* white space right after '<' is copied as it is */
		putc(ch, OutputFile);
		continue;
	    case '>':
		/* "<>": warn and stop; the '>' and the rest of the input
		 * are not copied.
		 * NOTE(review): the Warned test is redundant here since
		 * the function returns straight away; confirm that
		 * abandoning the file is intended.
		 */
		if (!Warned) {
		    Error(E_WARN, "%s: Badly formed SGML, found <>", Name);
		    Warned = 1;
		}
		return 0;
	    case 'B':
	    case 'b':
		/* Any tag whose name starts with B or b ends the header;
		 * presumably aimed at <BODY>, but the test is on the first
		 * letter only.
		 */
		if (InHeader) {
		    InHeader = 0;
		}
		/* fall through */
	    default:
		IgnoreChar(db, ch, OutputFile);
		continue;
	    }
	}
	if (WithinATag) {
	    /* special processing for attributes, ! and so forth */
	    if (WithinAString) {
		/* Everything up to and including the closing quote is
		 * ignored; the closing quote also ends any word.
		 */
		if (ch == QuoteChar) {
		    WithinAString = 0;
		    QuoteChar = 0;
		    InWord = 0;
		}
		IgnoreChar(db, ch, OutputFile);
		continue;
	    } else {
		/* within a tag but not a string */
		/* skip (and ignore) white space between tokens */
		while (isspace(ch)) {
		    IgnoreChar(db, ch, OutputFile);
		    if ((ch = GetChar(db, InputFile)) == EOF) {
			Error(E_WARN, "%s: end of file inside a tag!", Name);
			return 0;
		    }
		}
		switch (ch) {
		case '"': case '\'':
		    /* opening quote of an attribute value or literal */
		    InWord = 0;
		    WithinAString = 1;
		    QuoteChar = ch;
		    break;
		case '<': /* for DOCTYPE etc... */
		    /* nesting is only accepted inside <! ... > */
		    if (WithinSpecial) {
			WithinATag++;
		    } else {
			if (!Warned) {
			    Error(E_WARN, "%s: < within a tag!", Name);
			    Warned = 1;
			}
		    }
		    break;
		case '>':
		    /* closes one level; leaving the outermost level also
		     * ends a <! ... > declaration
		     */
		    WithinATag--;
		    if (WithinSpecial && WithinATag == 0) {
			WithinSpecial = 0;
		    }
		    break;
		case '-': /* TODO NOTDONE FIXME do SGML comments */
		default:
		    break;
		}
		/* whatever it was, the character itself is markup */
		IgnoreChar(db, ch, OutputFile);
		continue;
	    }
	} else { /* not within a tag */
	    /* TODO: handle entities
	     * this is tricky to get right whilst maintaining the byte count...
	     * variable sized *input* blocks required for this.
	     */
	    if (ch == '<') {
		/* NOTE(review): this looks unreachable -- a '<' seen
		 * while WithinATag is 0 is consumed by the first test in
		 * the loop, which always continues or returns.
		 */
		WithinATag++;
	    } else if (ch == '&') {
		/* Entity reference: ignore the '&' and everything up to
		 * the next delimiter (white space, ';' or '<').
		 */
		IgnoreChar(db, '&', OutputFile);
		while ((ch = GetChar(db, InputFile)) != EOF) {
		    if (issgmldelim(ch)) {
			break;
		    }
		    IgnoreChar(db, ch, OutputFile);
		}
		if (ch == EOF) {
		    Error(E_WARN,
			"%s: end of file inside an entity!",
			Name
		    );
		    return 0;
		}
		if (ch == ';') {
		    /* the terminating ';' is replaced by a space */
		    putc(' ', OutputFile);
		} else {
		    /* white space or '<' is not part of the reference:
		     * push it back and let the main loop deal with it
		     */
		    UnGetChar(InputFile, ch);
		    continue;
		}
	    } else {
		/* ordinary text: hidden while in the header, otherwise
		 * copied unchanged
		 */
		if (InHeader) {
		    IgnoreChar(db, ch, OutputFile);
		} else {
		    putc(ch, OutputFile);
		}
	    }
	}
    }
    /* End of input.  No warning is given here if a tag was still open. */
    return 0;
}