/* SGMLFilter.c -- Copyright 1989, 1993, 1994 Liam R. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 *
 * $Id: SGMLFilter.c,v 1.16 2001/05/31 03:48:14 liam Exp $
 *
 * Filter for ISO 8879 (SGML) files
 *
 */

/* NOTE(review): several #include directives below have no header name in
 * this copy of the file; the names appear to have been lost and must be
 * restored before the file can be built.
 */
#include "error.h"
#include
#include
#include "globals.h"
#include
#ifdef HAVE_STRING_H
# include
#else
# include
#endif
#ifdef HAVE_STDLIB_H
# include
#else
# include
#endif
#ifdef HAVE_UNISTD_H
# include
#endif

#include "wordrules.h"
#include "chartype.h"
#include "emalloc.h"
#include "lqutil.h"
#include "liblqtext.h"
#include "filter.h"

/* START_ONLY_ONCE ... END_ONLY_ONCE bracket a run of statements so that
 * they execute only the first time control reaches them: the opening macro
 * starts a scope holding a static flag, tests it and sets it; the closing
 * macro supplies the two matching close braces.
 * Defined only if no earlier header has already provided it.
 */
#ifndef START_ONLY_ONCE
# define START_ONLY_ONCE { static char _x = 0; if (!_x) { _x = 1;
# define END_ONLY_ONCE } }
#endif

/** Functions in this file that need to be declared **/

/* Forward declaration of GetChar(); the parameter list is only spelled
 * out when the compiler supports prototypes (HAVE_PROTO).
 */
INLINE PRIVATE int GetChar(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    FILE *fd
#endif
);

/** **/

static char InHeader = 0;
/* InHeader is set if the file type is HTML and
 * we have not seen BODY yet;
 * we don't index the head, by default.
*/ static int LastChar = 0; static int InWord = 0; static int LastInWord = 0; PRIVATE void Init() { InHeader = 0; LastChar = 0; InWord = 0; LastInWord = 0; } INLINE static void IgnoreChar(db, ch, OutputFile) t_LQTEXT_Database *db; int ch; FILE *OutputFile; { if (!InWord) { if (isspace(ch) || LQT_ISPUNCT(db, ch)) { putc(ch, OutputFile); } else if (LQT_ISDIGIT(db, ch)) { putc(LQT_DIGIT_TO_IGNORE, OutputFile); } else { putc(' ', OutputFile); } } else if (LQT_ISDIGIT(db, ch)) { putc(LQT_DIGIT_TO_IGNORE, OutputFile); } else { putc(LQT_CHAR_TO_IGNORE, OutputFile); } } INLINE PRIVATE int GetChar(db, fd) t_LQTEXT_Database *db; FILE *fd; { if (LastChar) { int ch = LastChar; InWord = LastInWord; LastChar = 0; return ch; } if ((LastChar = getc(fd)) == EOF) return EOF; LastInWord = InWord; if (InWord == 0) { InWord = LQT_STARTS_WORD(db, LastChar); } else if (!LQT_WITHIN_OR_ENDS_WORD(db, LastChar)) { InWord = 0; } /* Only return a single quote if it is within a word: * can't --- OK * ...hello' he said --- rejected * '' --- rejected * 30's --- OK * 30'66" -- rejected */ if (LastChar == '\'') { LastChar = getc(fd); if (InWord && !LQT_WITHIN_OR_ENDS_WORD(db, LastChar)) { /* trailing ' sign */ LastInWord = InWord = 0; } else { LastInWord = LQT_STARTS_WORD(db, LastChar); } return '\''; } else { int ch = LastChar; LastChar = 0; return ch; } } INLINE PRIVATE void UnGetChar(fd, c) FILE *fd; int c; { if (LastChar) { (void) ungetc(LastChar, stdin); } LastChar = c; LastInWord = InWord; } #define issgmldelim(ch) (isspace(ch) || ch == ';' || ch == '<') LIBRARY int LQF_SGML_Copy(db, InputFile, Name, OutputFile) t_LQTEXT_Database *db; FILE *InputFile; char *Name; FILE *OutputFile; { int WithinATag = 0; int WithinAString = 0; int WithinSpecial = 0; char QuoteChar = 0; char Warned = 0; int ch; Init(); if (db->IgnoreHTMLhead) { InHeader = 1; } else { InHeader = 0; } /* BUG: we should check putc()'s return value to see if * the output file system has filled up. 
* * But I want to rewrite this so it doesn't need to copy * the file anyway. */ while ((ch = GetChar(db, InputFile)) != EOF) { if (ch == '<' && !WithinATag) { WithinATag = 1; putc('<', OutputFile); switch (ch = GetChar(db, InputFile)) { case EOF: fflush(stdout); Error(E_WARN, "%s: End of file within a tag", Name); return 0; case '!': /* */ WithinSpecial = 1; IgnoreChar(db, ch, OutputFile); continue; case '\n': case ' ': case '\t': case '\r': putc(ch, OutputFile); continue; case '>': if (!Warned) { Error(E_WARN, "%s: Badly formed SGML, found <>", Name); Warned = 1; } return 0; case 'B': case 'b': if (InHeader) { InHeader = 0; } /* fall through */ default: IgnoreChar(db, ch, OutputFile); continue; } } if (WithinATag) { /* special processing for attributes, ! and so forth */ if (WithinAString) { if (ch == QuoteChar) { WithinAString = 0; QuoteChar = 0; InWord = 0; } IgnoreChar(db, ch, OutputFile); continue; } else { /* within a tag but not a string */ while (isspace(ch)) { IgnoreChar(db, ch, OutputFile); if ((ch = GetChar(db, InputFile)) == EOF) { Error(E_WARN, "%s: end of file inside a tag!", Name); return 0; } } switch (ch) { case '"': case '\'': InWord = 0; WithinAString = 1; QuoteChar = ch; break; case '<': /* for DOCTYPE etc... */ if (WithinSpecial) { WithinATag++; } else { if (!Warned) { Error(E_WARN, "%s: < within a tag!", Name); Warned = 1; } } break; case '>': WithinATag--; if (WithinSpecial && WithinATag == 0) { WithinSpecial = 0; } break; case '-': /* TODO NOTDONE FIXME do SGML comments */ default: break; } IgnoreChar(db, ch, OutputFile); continue; } } else { /* not within a tag */ /* TODO: handle entities * this is tricky to get right whilst mantaining the byte count... * variable sized *input* blocks required for this. 
*/ if (ch == '<') { WithinATag++; } else if (ch == '&') { IgnoreChar(db, '&', OutputFile); while ((ch = GetChar(db, InputFile)) != EOF) { if (issgmldelim(ch)) { break; } IgnoreChar(db, ch, OutputFile); } if (ch == EOF) { Error(E_WARN, "%s: end of file inside an entity!", Name ); return 0; } if (ch == ';') { putc(' ', OutputFile); } else { UnGetChar(InputFile, ch); continue; } } else { if (InHeader) { IgnoreChar(db, ch, OutputFile); } else { putc(ch, OutputFile); } } } } return 0; }