/* NewsFilter.c -- Copyright 1989, 1994, 1996 Liam R. E. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 *
 * $Id: NewsFilter.c,v 1.15 2001/05/31 03:48:14 liam Exp $
 *
 * Filter for usenet articles.
 * Throw away all of the header except
 *	Subject
 *	From
 *	Organi[sz]ation
 *	Newsgroups
 *	Keywords
 *	Summary
 *
 * Probably ought to keep Message-ID, but I can't store it anyway!
 *
 * See FilterMain and wordrules.h for more info.
 *
 */

#include "globals.h"
#include "error.h"

/* NOTE(review): the system header names below are the ones this file's
 * code needs (FILE/putc, isspace, strlen, ...); confirm them against the
 * repository copy of this file.
 */
#include <stdio.h>
#include <ctype.h>
#include <sys/types.h>

#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif

#ifdef HAVE_STDLIB_H
# include <stdlib.h>
#else
# include <malloc.h>
#endif

#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif

#include "wordrules.h"
#include "emalloc.h"
#include "lqutil.h"
#include "liblqtext.h"
#include "filter.h"

/** C Library functions that need to be declared: **/
#ifndef tolower
extern int tolower(
#ifdef HAVE_PROTO
    int ch
#endif
);
#endif

/** lq-text library functions that need to be declared **/

/** Functions in this file that need to be declared **/
PRIVATE void PutLine(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    char *Line,
    int Ignore,
    FILE *OutputFile
#endif
);

/* Values for the Ignore argument of PutLine(): */
#define PUTMODE_IGNORE 1
#define PUTMODE_PRINT  0

PRIVATE void Header(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    FILE *InputFile,
    char *Name,
    FILE *OutputFile
#endif
);

PRIVATE void Body(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    FILE *InputFile,
    char *Name,
    FILE *OutputFile
#endif
);

/** **/

/* Header fields that are copied to the output unchanged.
 * IsWanted() stops scanning as soon as an entry's first character is
 * greater than the line's first character, so:
 */
char *KeepThese[] = { /* these must be sorted on the first character */
    "From",
    "Keywords",
    "Newsgroups",
    "Organisation",
    "Organization",
    "Summary",
    "Subject",
    0
};

/* FirstWord -- return non-zero if Line begins with Word and Word is
 * followed in Line by the end of the string, a space or punctuation.
 * The comparison is case-sensitive (strncmp).
 */
PRIVATE INLINE int
FirstWord(Line, Word)
    char *Line;
    char *Word;
{
    int n = strlen(Word);

    if (strncmp(Line, Word, n) == 0) {
        /* The <ctype.h> functions are only defined for EOF and values
         * representable as unsigned char; plain char may be negative.
         */
        unsigned char Next = (unsigned char) Line[n];

        return (Next == '\0' || isspace(Next) || ispunct(Next));
    }
    return 0;
}

/* IsWanted -- return 1 if the header line starts with one of the field
 * names in KeepThese, and 0 otherwise.  Matching is case-sensitive.
 */
PRIVATE int
IsWanted(Line)
    char *Line;
{
    char **pp;

    for (pp = KeepThese; *pp && **pp; pp++) {
        if (**pp > *Line) {
            return 0; /* gone too far */
        } else if (FirstWord(Line, *pp)) {
            return 1;
        }
    }
    return 0;
}

/* LQF_NetNews_Copy -- filter one news article from InputFile to
 * OutputFile: first the header, then the body.
 * Name is used only in diagnostics.
 * Always returns 0.
 */
LIBRARY int
LQF_NetNews_Copy(db, InputFile, Name, OutputFile)
    t_LQTEXT_Database *db;
    FILE *InputFile;
    char *Name;
    FILE *OutputFile;
{
    Header(db, InputFile, Name, OutputFile);
    Body(db, InputFile, Name, OutputFile);
    return 0; /* TODO: error checking */
}

/* Non-zero while PutLine() is in the middle of a word. */
PRIVATE int InWord = 0;

/* Header -- copy the article header.
 * Lines for which IsWanted() is true are printed as they are; all other
 * header lines go through PutLine() in PUTMODE_IGNORE.
 * Returns after writing a newline for the blank line that ends the
 * header; if input ends first, a warning mentioning Name is issued.
 */
PRIVATE void
Header(db, InputFile, Name, OutputFile)
    t_LQTEXT_Database *db;
    FILE *InputFile;
    char *Name;
    FILE *OutputFile;
{
    char *Line;

    while (LQU_fReadLine(InputFile, &Line, 0) != -1) {
        InWord = 0;

        /* NOTE(review): the third test accepts a line made of any one
         * character followed by a newline (e.g. "\r\n"); whether
         * LQU_fReadLine() can return such a line is not visible from
         * this file -- confirm.
         */
        if (!Line || !*Line || (Line[1] == '\n' && !Line[2])) {
            putc('\n', OutputFile);
            return; /* blank line is end of header */
        }

        if (!IsWanted(Line)) {
            PutLine(db, Line, PUTMODE_IGNORE, OutputFile);
        } else {
            PutLine(db, Line, PUTMODE_PRINT, OutputFile);
        }
    }

    Error(E_WARN, "%s: warning: News article with no body.", Name);
    return;
}

/* The character prefixed to every line of the shar archive member that
 * Body() is currently inside, or 0 when there is none.
 */
PRIVATE char SharCharacter = 0;

/* PutLine -- write one line of text to OutputFile.
 *
 * With PUTMODE_PRINT the line is written unchanged, followed by a
 * newline.
 *
 * With PUTMODE_IGNORE the line is written with the characters of each
 * word replaced by LQT_CHAR_TO_IGNORE (LQT_DIGIT_TO_IGNORE for digits);
 * characters outside words, and runs that start with a digit, are
 * copied as they are.  A newline is appended unless the line already
 * ended with one.
 *
 * Any other mode is reported with Error(E_FATAL|E_BUG, ...).
 */
PRIVATE void
PutLine(db, Line, Ignore, OutputFile)
    t_LQTEXT_Database *db;
    char *Line;
    int Ignore;
    FILE *OutputFile;
{
    register char *p;

    InWord = 0;

    switch (Ignore) {
    case PUTMODE_PRINT:
        Ignore = 0;
        fputs(Line, OutputFile);
        putc('\n', OutputFile);
        return;
    case PUTMODE_IGNORE:
        Ignore = 1;
        break;
    default:
        Error(E_FATAL|E_BUG,
            "PutLine(\"%8.8s...\", %d not in {%d,%d})",
            Line,
            Ignore,
            PUTMODE_IGNORE,
            PUTMODE_PRINT
        );
    }

    for (p = Line; *p; p++) {
        if (*p == '\n') {
            InWord = 0;
            putc(*p, OutputFile);
        } else if (InWord ||
            (LQT_OnlyWithinWord(db, *p) && LQT_EndsWord(db, p[1]))
        ) {
            if (LQT_EndsWord(db, *p)) {
                if (Ignore) {
                    if (LQT_ISDIGIT(db, *p)) {
                        putc(LQT_DIGIT_TO_IGNORE, OutputFile);
                    } else {
                        putc(LQT_CHAR_TO_IGNORE, OutputFile);
                    }
                } else {
                    putc(*p, OutputFile);
                }
            } else {
                putc(*p, OutputFile);
                InWord = 0;
            }
        } else {
            if (LQT_StartsWord(db, *p)) {
                InWord = 1;
                if (Ignore) {
                    if (LQT_ISDIGIT(db, *p)) {
                        putc(LQT_DIGIT_TO_IGNORE, OutputFile);
                    } else {
                        putc(LQT_CHAR_TO_IGNORE, OutputFile);
                    }
                } else {
                    putc(*p, OutputFile);
                }
            } else if (isdigit((unsigned char) *p)) {
                /* A run that starts with a digit is copied verbatim. */
                putc(*p, OutputFile);
                while (*++p && (
                    isdigit((unsigned char) *p) ||
                    *p == '.' ||
                    LQT_EndsWord(db, *p) ||
                    LQT_OnlyWithinWord(db, *p)
                )) {
                    putc(*p, OutputFile);
                }
                --p; /* gone too far */
            } else {
                putc(*p, OutputFile);
            }
        }
    }

    if (p == Line || p[-1] != '\n') {
        putc('\n', OutputFile);
    }
}

/* Flags for LineState: */
#define LS_NORMAL   00
#define LS_UUENCODE 01
#define LS_SHAR     02 /* can be combined with UUENCODE */

/* Body -- copy the article body, one line at a time.
 *
 * Leading quoting characters ('>' and '|', and white space among them)
 * are replaced by spaces, unless a shar prefix character is in force.
 *
 * Shell archives: after a line starting with ':' or '#' or "---", the
 * following lines are searched for a "sed" or "cat" command with a
 * "<<" here-document.  The here-document terminator is remembered, and
 * so is the prefix character of an "s/^X//"-style sed expression; that
 * prefix is then blanked at the start of each line until a line equal
 * to the terminator is seen.
 *
 * uuencoded data: lines after "begin <octal digits> <name>" are written
 * with PUTMODE_IGNORE until a line starting with "end".
 *
 * Name is not used.
 */
PRIVATE void
Body(db, InputFile, Name, OutputFile)
    t_LQTEXT_Database *db;
    FILE *InputFile;
    char *Name;
    FILE *OutputFile;
{
    register char *p;
    char *Line;
    int LineState = LS_NORMAL;
    int CheckForShar = 0;
        /* seen a line starting w/ "-" "#!" or ":" recently */
    char *EOFStr = 0;

    while (LQU_fReadLine(InputFile, &Line, 0) != -1) {
        if (!Line || !*Line) {
            putc('\n', OutputFile);
            continue;
        }

        p = Line;

        /* When material is quoted with > or |, ignore it */
        if (!SharCharacter && (*p == '>' || *p == '|')) {
            while (*p == '>' || *p == '|' || isspace((unsigned char) *p)) {
                putc(' ', OutputFile);
                Line = ++p;
            }
        }

        if (CheckForShar) {
            ++CheckForShar;

            if (FirstWord(Line, "sed") || FirstWord(Line, "cat")) {
                register char *q;

                for (q = Line; *q; q++) {
                    if (*q == '<' && q[1] == '<') {
                        int HasQuotes = 0;

                        LineState |= LS_SHAR;
                        CheckForShar = 0;

                        q++; q++; /* skip the << */

                        while (isspace((unsigned char) *q)) {
                            q++;
                        }

                        /* cat > file << 'word', but there are several
                         * alternate forms.  We ignore <<- because it's not
                         * portable enough for a shar.
                         * A \ can be used instead of a quote, but in this
                         * case there can be no space in the word.
                         */
                        if (*q == '\'' || *q == '"') {
                            HasQuotes = (*q);
                            q++;
                        } else if (*q == '\\') {
                            q++;
                        }

                        /* p marks the start of the terminator word and
                         * q is advanced to just past its end.
                         */
                        p = q;
                        while (*q && *q != '\n') {
                            if (!HasQuotes && isspace((unsigned char) *q)) {
                                break;
                            } else if (*q == HasQuotes) {
                                break;
                            }
                            q++;
                        }

                        EOFStr = emalloc("NewsFilter:sharEOF", q - p + 1);
                        (void) strncpy(EOFStr, p, q - p);
                        EOFStr[q - p] = '\0';

                        /* determine the shar character, usually an X;
                         * we look for something like "s/^X//"
                         */
                        for (q = Line; *q; q++) {
                            if (*q == 's' &&
                                (q[1] == '/' ||
                                    ispunct((unsigned char) q[1])) &&
                                q[2] == '^' &&
                                q[3] && !isspace((unsigned char) q[3]) &&
                                q[4] == q[1] &&
                                q[5] == q[1]
                            ) {
                                SharCharacter = q[3];
                                q[3] = ' '; /* don't index it! */
                                break;
                            }
                        } /* for */

                        /* q was reused above, so stop scanning the line */
                        break;
                    } /* if << */
                } /* for q = Line... */
            } /* if FirstWord is sed or cat */

            if (CheckForShar > 30) {
                /* No << on the line, so not the start of a shar */
                CheckForShar = 0;
            }
        } /* end of check for shar */

        p = Line;

        if (LineState & LS_SHAR) {
            if (EOFStr) {
                int n = strlen(EOFStr);

                if (strncmp(Line, EOFStr, n) == 0) {
                    if (!Line[n] || Line[n] == '\n') {
                        /* end of this file within the archive;
                         * another one may follow
                         */
                        LineState &= ~LS_SHAR;
                        CheckForShar = 1;
                        efree(EOFStr);
                        EOFStr = (char *) NULL;
                        SharCharacter = 0;
                    }
                }
            }

            if (SharCharacter && *p == SharCharacter) {
                *p = ' ';
                putc(' ', OutputFile);
                Line = ++p;
            }
        } else {
            /* The tests short-circuit, so no byte beyond the
             * terminating NUL is examined.
             */
            if (Line[0] == ':' || Line[0] == '#' ||
                (Line[0] == '-' && Line[1] == '-' && Line[2] == '-')
            ) {
                /* check for --- rather than "--" as .signature starts
                 * with "-- ", except people who add a signature by hand
                 * might forget the space.
                 */
                CheckForShar = 1;
            }
        }

        if (LineState & LS_UUENCODE) {
            /* check for "end" and index that */
            if (*p == 'e' && p[1] == 'n' && p[2] == 'd' &&
                (!p[3] || isspace((unsigned char) p[3]))
            ) {
                LineState &= ~LS_UUENCODE;
                /* fall through */
            }
        }

        /* Now we've determined whether we're in a shar or not,
         * and also whether we are in uuencoded drivel.
         * Furthermore, if we are in a shar, we have determined that
         * the current line is not the last of the current file within the
         * shar archive, and have removed the first character if appropriate.
         */
        if (LineState & LS_UUENCODE) {
            PutLine(db, Line, PUTMODE_IGNORE, OutputFile);
        } else {
            /* look for "begin mode filename" */
            register char *q;

            /* first, print the line */
            PutLine(db, Line, PUTMODE_PRINT, OutputFile);

            /* now, look for the start of uuencoded material */
            if (FirstWord(Line, "begin")) {
                q = &Line[5]; /* skip over the "begin" */
                if (isspace((unsigned char) *q)) {
                    q++;
                    if (isdigit((unsigned char) *q)) {
                        /* the mode is octal, so 8 and 9 end it */
                        while (isdigit((unsigned char) *q) &&
                            *q != '8' && *q != '9'
                        ) {
                            q++;
                        }
                        if (*q == ' ' && *++q) {
                            /* found it! */
                            LineState |= LS_UUENCODE;
                        }
                    }
                }
            }
        } /* else !LS_UUENCODE */
    } /* while fReadLine */

    /* The article may end inside an unterminated shar: release the
     * terminator string, and clear the prefix character so that it
     * cannot affect the next article handled by this process.
     */
    if (EOFStr) {
        efree(EOFStr);
        EOFStr = (char *) NULL;
    }
    SharCharacter = 0;

    return;
}