/* NewsFilter.c -- Copyright 1989, 1994, 1996 Liam R. E. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 *
 * $Id: NewsFilter.c,v 1.15 2001/05/31 03:48:14 liam Exp $
 *
 * Filter for usenet articles.
 * Throw away all of the header except
 * Subject
 * From
 * Organi[sz]ation
 * Newsgroups
 * Keywords
 * Summary
 *
 * Probably ought to keep Message-ID, but I can't store it anyway!
 *
 * See FilterMain and wordrules.h for more info.
 *
 */

#include "globals.h"
#include "error.h"

#include <stdio.h>
#include <ctype.h>
#include <sys/types.h>

#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif

#ifdef HAVE_STDLIB_H
# include <stdlib.h>
#else
# include <malloc.h>
#endif

#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif

#include "wordrules.h"
#include "emalloc.h"
#include "lqutil.h"
#include "liblqtext.h"
#include "filter.h"

/** C Library functions that need to be declared: **/
/* tolower() is declared by hand only when it is not already defined as a
 * preprocessor macro.  The parameter list is emitted only under HAVE_PROTO,
 * presumably so that compilers without prototype support still accept it.
 */
#ifndef tolower
 extern int tolower(
#ifdef HAVE_PROTO
    int ch
#endif
 );
#endif

/** lq-text library functions that need to be declared **/

/** Functions in this file that need to be declared **/

/* PutLine() writes one line of text to OutputFile.
 * Ignore must be one of the PUTMODE_* values defined below; any other
 * value is reported through Error() as a fatal bug.
 */
PRIVATE void PutLine(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    char *Line,
    int Ignore,
    FILE *OutputFile
#endif
);

/* Values for the Ignore argument of PutLine():
 * PUTMODE_IGNORE -- characters that form words are replaced by the
 *                   LQT_CHAR_TO_IGNORE / LQT_DIGIT_TO_IGNORE markers;
 * PUTMODE_PRINT  -- the line is copied unchanged, followed by a newline.
 */
#define PUTMODE_IGNORE	1
#define PUTMODE_PRINT	0

/* Header() reads lines from InputFile up to and including the first blank
 * line, writing each one to OutputFile through PutLine().  Name is used
 * only in the warning issued when the input ends before a blank line.
 */
PRIVATE void Header(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    FILE *InputFile,
    char *Name,
    FILE *OutputFile
#endif
);

/* Body() reads the remaining lines of InputFile and writes them to
 * OutputFile, tracking shell-archive and uuencoded sections as it goes.
 */
PRIVATE void Body(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    FILE *InputFile,
    char *Name,
    FILE *OutputFile
#endif
);

/** **/

/* Header field names whose lines are passed through unchanged.
 *
 * The entries must stay grouped in ascending order of their first
 * character: IsWanted() stops searching as soon as it reaches an entry
 * whose first character is greater than that of the line being tested.
 * Order within a group of entries sharing a first character is free.
 * The list is terminated by a null pointer.
 */
char *KeepThese[] = {
    /* F */
    "From",
    /* K */
    "Keywords",
    /* N */
    "Newsgroups",
    /* O -- both spellings are accepted */
    "Organisation",
    "Organization",
    /* S */
    "Summary",
    "Subject",
    NULL	/* end of list */
};

/* FirstWord -- test whether Line begins with the word Word.
 *
 * Returns 1 if the first strlen(Word) characters of Line are exactly Word
 * (the comparison is case-sensitive) and the character that follows is the
 * end of the string, white space or punctuation; returns 0 otherwise.
 * So "From:" and "From " both start with the word "From", but "Fromage"
 * does not.
 */
PRIVATE INLINE int
FirstWord(Line, Word)
    char *Line;
    char *Word;
{
    size_t n = strlen(Word);
    int ch;

    if (strncmp(Line, Word, n) != 0) {
	return 0;
    }

    /* The <ctype.h> functions are only defined for values representable
     * as unsigned char (or EOF), so convert before classifying; a plain
     * char with the top bit set may otherwise be passed as a negative int.
     */
    ch = (unsigned char) Line[n];

    return (ch == '\0' || isspace(ch) || ispunct(ch));
}

/* IsWanted -- decide whether a header line should be kept.
 *
 * Returns 1 if Line starts with one of the field names listed in
 * KeepThese (as determined by FirstWord(), so the match is case-sensitive
 * and the name must be followed by punctuation, white space or the end of
 * the line); returns 0 otherwise.
 *
 * KeepThese is ordered on the first character of each entry, so the search
 * stops as soon as an entry's first character is greater than the line's.
 */
PRIVATE int
IsWanted(Line)
    char *Line;
{
    char **pp;

    for (pp = KeepThese; *pp && **pp; pp++) {
	if (**pp > *Line) {
	    return 0; /* gone too far */
	}
	if (FirstWord(Line, *pp)) {
	    return 1;
	}
    }
    return 0;
}

/* LQF_NetNews_Copy -- filter one usenet article.
 *
 * Reads the article from InputFile and writes the filtered text to
 * OutputFile: first the header is processed by Header(), then everything
 * after it by Body().  Name is passed on to both and is used by Header()
 * in its warning message.
 *
 * Always returns 0 at present; no errors are detected or reported here.
 */
LIBRARY int
LQF_NetNews_Copy(db, InputFile, Name, OutputFile)
    t_LQTEXT_Database *db;
    FILE *InputFile;
    char *Name;
    FILE *OutputFile;
{
    /* Header() must run first: it consumes input up to the blank line
     * that separates the header from the body.
     */
    Header(db, InputFile, Name, OutputFile);
    Body(db, InputFile, Name, OutputFile);
    return 0; /* TODO: error checking */
}

/* Non-zero while PutLine() is part-way through a word; cleared at the
 * start of every header line and at the start of every PutLine() call.
 */
PRIVATE int InWord = 0;

/* Header -- copy the article header from InputFile to OutputFile.
 *
 * Each line is read with LQU_fReadLine() and handed to PutLine(): lines
 * accepted by IsWanted() are written as they are (PUTMODE_PRINT), all
 * others are written in PUTMODE_IGNORE.
 *
 * The header ends at the first line that is null, empty, or exactly two
 * characters long with a newline as the second one; a single newline is
 * written for it and the function returns.  If the input runs out before
 * such a line is seen, a warning mentioning Name is issued.
 */
PRIVATE void
Header(db, InputFile, Name, OutputFile)
    t_LQTEXT_Database *db;
    FILE *InputFile;
    char *Name;
    FILE *OutputFile;
{
    char *Line;

    for (;;) {
	int Mode;

	if (LQU_fReadLine(InputFile, &Line, 0) == -1) {
	    break; /* ran out of input while still in the header */
	}

	InWord = 0;

	if (Line == NULL || Line[0] == '\0' ||
		(Line[1] == '\n' && Line[2] == '\0')) {
	    /* blank line is end of header */
	    putc('\n', OutputFile);
	    return;
	}

	Mode = IsWanted(Line) ? PUTMODE_PRINT : PUTMODE_IGNORE;
	PutLine(db, Line, Mode, OutputFile);
    }

    Error(E_WARN, "%s: warning: News article with no body.", Name);
}

/* While Body() is inside a shell archive, this holds the character that
 * the archive prefixes to each line (usually an X); it is 0 otherwise.
 */
PRIVATE char SharCharacter = 0;

/* PutLine -- write one line of text to OutputFile.
 *
 * db          database handle, passed to the LQT_* word-rule tests
 * Line        the NUL-terminated text to write
 * Ignore      PUTMODE_PRINT or PUTMODE_IGNORE
 * OutputFile  where the text is written
 *
 * PUTMODE_PRINT writes Line unchanged, followed by a newline.
 *
 * PUTMODE_IGNORE writes a copy of Line of the same length in which the
 * characters of words are replaced by LQT_DIGIT_TO_IGNORE (where
 * LQT_ISDIGIT() holds) or LQT_CHAR_TO_IGNORE (otherwise).  Other
 * characters are copied as they are, and a run that begins with a digit
 * which cannot start a word is copied verbatim together with any digits,
 * dots and word characters that follow it.  A newline is added if the
 * line did not already end with one.
 *
 * Any other value of Ignore is reported with Error() as a fatal bug.
 * InWord is reset on entry.
 */
PRIVATE void
PutLine(db, Line, Ignore, OutputFile)
    t_LQTEXT_Database *db;
    char *Line;
    int Ignore;
    FILE *OutputFile;
{
    register char *p;

    InWord = 0;

    switch (Ignore) {
    case PUTMODE_PRINT:
	fputs(Line, OutputFile);
	putc('\n', OutputFile);
	return;
    case PUTMODE_IGNORE:
	break;
    default:
	Error(E_FATAL|E_BUG, "PutLine(\"%8.8s...\", %d not in {%d,%d})",
		Line, Ignore, PUTMODE_IGNORE, PUTMODE_PRINT
	);
    }

    /* From here on the line is always being ignored: PUTMODE_PRINT has
     * already returned, so word characters are always replaced.
     */
    for (p = Line; *p; p++) {
	if (*p == '\n') {
	    InWord = 0;
	    putc(*p, OutputFile);
	} else if (InWord ||
	    (LQT_OnlyWithinWord(db, *p) && LQT_EndsWord(db, p[1]))
	) {
	    if (LQT_EndsWord(db, *p)) {
		if (LQT_ISDIGIT(db, *p)) {
		    putc(LQT_DIGIT_TO_IGNORE, OutputFile);
		} else {
		    putc(LQT_CHAR_TO_IGNORE, OutputFile);
		}
	    } else {
		/* not a character that can end a word: copy it and
		 * leave the word
		 */
		putc(*p, OutputFile);
		InWord = 0;
	    }
	} else if (LQT_StartsWord(db, *p)) {
	    InWord = 1;
	    if (LQT_ISDIGIT(db, *p)) {
		putc(LQT_DIGIT_TO_IGNORE, OutputFile);
	    } else {
		putc(LQT_CHAR_TO_IGNORE, OutputFile);
	    }
	} else if (isdigit((unsigned char) *p)) {
	    /* A digit that cannot start a word: copy the whole run.
	     * The casts keep isdigit() defined for characters with the
	     * top bit set when plain char is signed.
	     */
	    putc(*p, OutputFile);
	    while (*++p && (
		isdigit((unsigned char) *p) ||
		*p == '.' ||
		LQT_EndsWord(db, *p) ||
		LQT_OnlyWithinWord(db, *p)
	    )) {
		putc(*p, OutputFile);
	    }
	    --p; /* gone too far */
	} else {
	    putc(*p, OutputFile);
	}
    }

    /* make sure the output line is terminated */
    if (p == Line || p[-1] != '\n') {
	putc('\n', OutputFile);
    }
}

/* Flags for LineState:
 */
#define LS_NORMAL	00
#define LS_UUENCODE	01
#define LS_SHAR		02 /* can be combined with UUENCODE */

/* Body -- copy the body of an article from InputFile to OutputFile.
 *
 * db          database handle, passed on to PutLine()
 * InputFile   positioned just after the header
 * Name        name of the article (not used here)
 * OutputFile  where the filtered text is written
 *
 * For each line:
 *  - leading quote markers (> and |) and the white space mixed with them
 *    are replaced by spaces, unless a shar prefix character is active;
 *  - the start of a shell archive member is recognised by a line whose
 *    first word is "sed" or "cat" and which contains "<<", looked for
 *    during the 30 or so lines after a line starting with ':', '#' or
 *    "---", or after the end of a previous member.  The word after "<<"
 *    is remembered as the terminator, and the prefix character is taken
 *    from a sed expression of the form s/^X//;
 *  - inside an archive member the prefix character is replaced by a
 *    space, and a line consisting of the terminator ends the member;
 *  - a line of the form "begin <octal-mode> <name>" starts uuencoded
 *    data, which is written in PUTMODE_IGNORE until a line starting with
 *    the word "end".
 * All other text is written with PUTMODE_PRINT.
 *
 * The terminator string and SharCharacter are released/reset before
 * returning, so an article that ends inside an unterminated archive does
 * not leak memory or affect the next article.
 *
 * Characters are converted to unsigned char before being given to the
 * <ctype.h> functions, which are not defined for negative values.
 */
PRIVATE void
Body(db, InputFile, Name, OutputFile)
    t_LQTEXT_Database *db;
    FILE *InputFile;
    char *Name;
    FILE *OutputFile;
{
    register char *p;
    char *Line;
    int LineState = LS_NORMAL;
    int CheckForShar = 0; /* seen a line starting w/ "-" "#!" or ":" recently */
    char *EOFStr = 0;

    while (LQU_fReadLine(InputFile, &Line, 0) != -1) {
	if (!Line || !*Line) {
	    putc('\n', OutputFile);
	    continue;
	}

	p = Line;

	/* When material is quoted with > or |, ignore it */
	if (!SharCharacter && (*p == '>' || *p == '|')) {
	    while (*p == '>' || *p == '|' || isspace((unsigned char) *p)) {
		putc(' ', OutputFile);
		Line = ++p;
	    }
	}

	if (CheckForShar) {
	    ++CheckForShar;
	    if (FirstWord(Line, "sed") || FirstWord(Line, "cat")) {
		register char *q;

		for (q = Line; *q; q++) {
		    if (*q == '<' && q[1] == '<') {
			int HasQuotes = 0;

			LineState |= LS_SHAR;
			CheckForShar = 0;
			q++; q++; /* skip the << */
			while (isspace((unsigned char) *q)) q++;

			/* cat > file << 'word', but there are several
			 * alternate forms.  We ignore <<- because it's not
			 * portable enough for a shar.
			 * A \ can be used instead of a quote, but in this
			 * case there can be no space in the word.
			 */

			if (*q == '\'' || *q == '"') {
			    HasQuotes = (*q);
			    q++;
			} else if (*q == '\\') {
			    q++;
			}
			p = q;
			while (*q && *q != '\n') {
			    if (!HasQuotes && isspace((unsigned char) *q)) break;
			    else if (*q == HasQuotes) break;
			    q++;
			}
			EOFStr = emalloc("NewsFilter:sharEOF",q - p + 1);
			(void) strncpy(EOFStr, p, q - p);
			EOFStr[q - p] = '\0';

			/* determine the shar character, usually an X;
			 * we look for something like "s/^X//"
			 */
			for (q = Line; *q; q++) {
			    if (*q == 's' &&
					(q[1] == '/' ||
					    ispunct((unsigned char) q[1])) &&
					q[2] == '^' &&
					q[3] && !isspace((unsigned char) q[3]) &&
					q[4] == q[1] &&
					q[5] == q[1]) {
				SharCharacter = q[3];
				q[3] = ' '; /* don't index it! */
				break;
			    }
			} /* for */
			break;
		    } /* if << */
		} /* for q = Line... */
	    } /* if FirstWord is sed or cat */

	    if (CheckForShar > 30) {
		/* No << on the line, so not the start of a shar */
		CheckForShar = 0;
	    }
	}
	/* end of check for shar */

	p = Line;

	if (LineState & LS_SHAR) {
	    if (EOFStr) {
		int n = strlen(EOFStr);
		if (strncmp(Line, EOFStr, n) == 0) {
		    if (!Line[n] || Line[n] == '\n') {
			/* end of this file within the archive;
			 * another may follow shortly
			 */
			LineState &= ~LS_SHAR;
			CheckForShar = 1;
			efree(EOFStr);
			EOFStr = (char *) NULL;
			SharCharacter = 0;
		    }
		}
	    }
	    if (SharCharacter && *p == SharCharacter) {
		*p = ' ';
		putc(' ', OutputFile);
		Line = ++p;
	    }
	} else {
	    if (Line[0] == ':' || Line[0] == '#' ||
			(Line[0] == '-' && Line[1] == '-' && Line[2] == '-')) {
		/* check for --- rather than "--" as .signature starts
		 * with "-- ", except people who add a signature by hand
		 * might forget the space.
		 * The three characters are tested in order so that the
		 * test never looks beyond the end of a short line.
		 */
		CheckForShar = 1;
	    }
	}

	if (LineState & LS_UUENCODE) { /* check for "end" and index that */
	    if (*p == 'e' && p[1] == 'n' && p[2] == 'd' &&
		    (!p[3] || isspace((unsigned char) p[3]))) {
		LineState &= ~LS_UUENCODE;
		/* fall through */
	    }
	}
	
	/* Now we've determined whether we're in a shar or not,
	 * and also whether we are in uuencoded drivel.
	 * If we are in a shar, the first character of the line has been
	 * removed if appropriate.
	 */

	if (LineState & LS_UUENCODE) {
	    PutLine(db, Line, PUTMODE_IGNORE, OutputFile);
	} else {
	    /* look for "begin mode filename" */
	    register char *q;

	    /* first, print the line */
	    PutLine(db, Line, PUTMODE_PRINT, OutputFile);

	    /* now, look for the start of uuencoded material */
	    if (FirstWord(Line, "begin")) {
		q = &Line[5]; /* skip over the "begin" */
		if (isspace((unsigned char) *q)) {
		    q++;
		    if (isdigit((unsigned char) *q)) {
			/* the mode is octal, so 8 and 9 end it */
			while (isdigit((unsigned char) *q) &&
				*q != '8' && *q != '9') {
			    q++;
			}
			if (*q == ' ' && *++q) {
			    /* found it! */
			    LineState |= LS_UUENCODE;
			}
		    }
		}
	    }
	} /* else !LS_UUENCODE */
    } /* while fReadLine */

    /* The article may have ended inside an unterminated shar: release the
     * terminator string and clear the file-wide prefix character so that
     * nothing carries over into the next article.
     */
    if (EOFStr) {
	efree(EOFStr);
	EOFStr = (char *) NULL;
    }
    SharCharacter = 0;

    return;
}