/* filtertype.c -- Copyright 1989, 1992, 1994, 1995, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* FilterType -- determine how to deal with a given file. * Part of Liam Quin's LQ-Text text retrieval package. * * $Id: filters.c,v 1.32 1996/08/14 16:55:00 lee Exp lee $ * */ #include "error.h" #include "globals.h" #include #include "emalloc.h" #include #include #include #include #ifdef HAVE_FCNTL_H #include #endif #ifdef HAVE_UNISTD_H # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #ifdef HAVE_STRING_H # include #else # include #endif #include "fileinfo.h" #include "wordrules.h" /* for min word length -- don't index files shorter */ #include "lqutil.h" #include "liblqtext.h" #define FILTERDEF /* see filter.h */ #include "filter.h" #include "lqtrace.h" #define Prefix(pref,str) ((*(pref)==(*str))&&!strncmp(pref,str,strlen(pref))) /* The current filter types are: * FTYPE_NEWS 1 * FTYPE_MAIL 2 * FTYPE_MOSTLYASCII 4 * FTYPE_C_SOURCE 5 -- unimplemented for now * FTYPE_SGML 6 */ /* LQT_InitFilterTable might one day be called from Defaults.c.... * At which point, it will read an ascii file that describes the * various filters, I suppose. 
*/ struct s_FilterTable *LQTpFilterTable; LIBRARY void LQTpInitReadOnlyPartOfFilterTable(db) t_LQTEXT_Database *db; { LQTpFilterTable = (t_FilterTable *) emalloc( "Filter table", sizeof(t_FilterTable) * (LQT_MaxFilterType(db) + 2) ); LQTpFilterTable[0].Type = 0; LQTpFilterTable[0].Name = "plain"; LQTpFilterTable[0].copyFile = 0; LQTpFilterTable[0].findMatchEnds = 0; LQTpFilterTable[0].findFile = 0; LQTpFilterTable[0].closeFile = fclose; LQTpFilterTable[FTYPE_SGML].Type = FTYPE_SGML; LQTpFilterTable[FTYPE_SGML].Name = "SGML"; LQTpFilterTable[FTYPE_SGML].copyFile = 0; LQTpFilterTable[FTYPE_SGML].findMatchEnds = 0; LQTpFilterTable[FTYPE_SGML].findFile = 0; LQTpFilterTable[FTYPE_SGML].closeFile = fclose; LQTpFilterTable[FTYPE_NEWS].Type = FTYPE_NEWS; LQTpFilterTable[FTYPE_NEWS].Name = "netnews"; LQTpFilterTable[FTYPE_NEWS].copyFile = 0; LQTpFilterTable[FTYPE_NEWS].findMatchEnds = 0; LQTpFilterTable[FTYPE_NEWS].findFile = 0; LQTpFilterTable[FTYPE_NEWS].closeFile = fclose; LQTpFilterTable[FTYPE_MAIL].Type = FTYPE_MAIL; LQTpFilterTable[FTYPE_MAIL].Name = "RFC822 email"; LQTpFilterTable[FTYPE_MAIL].copyFile = 0; LQTpFilterTable[FTYPE_MAIL].findMatchEnds = 0; LQTpFilterTable[FTYPE_MAIL].findFile = 0; LQTpFilterTable[FTYPE_MAIL].closeFile = fclose; LQTpFilterTable[FTYPE_DEFAULT].Type = FTYPE_DEFAULT; LQTpFilterTable[FTYPE_DEFAULT].Name = "default"; LQTpFilterTable[FTYPE_DEFAULT].copyFile = 0; LQTpFilterTable[FTYPE_DEFAULT].findMatchEnds = 0; LQTpFilterTable[FTYPE_DEFAULT].findFile = 0; LQTpFilterTable[FTYPE_DEFAULT].closeFile = fclose; LQTpFilterTable[FTYPE_TROFF].Type = FTYPE_TROFF; LQTpFilterTable[FTYPE_TROFF].Name = "troff"; LQTpFilterTable[FTYPE_TROFF].copyFile = 0; LQTpFilterTable[FTYPE_TROFF].findMatchEnds = 0; LQTpFilterTable[FTYPE_TROFF].findFile = 0; LQTpFilterTable[FTYPE_TROFF].closeFile = fclose; /* If you add more, you MUST update LQT_MaxFilterType in h/filter.h!!! * ALWAYS ADD AT THE END or you will break existing databases! 
* See the LQText Filter Writer's Guide in the API documentation * for more information about writing filters. The filter API * will change in the next release, mostly for efficiency, but * also to allow the filters to add words directly instead of * making an ASCII surrogate file that is then indexed. */ /* a sentinel at the very end */ LQTpFilterTable[FTYPE_TROFF + 1].Type = 0; LQTpFilterTable[FTYPE_TROFF + 1].Name = 0; LQTpFilterTable[FTYPE_TROFF + 1].copyFile = 0; LQTpFilterTable[FTYPE_TROFF + 1].findMatchEnds = 0; LQTpFilterTable[FTYPE_TROFF + 1].findFile = 0; LQTpFilterTable[FTYPE_TROFF + 1].closeFile = 0; } /* * LQT_MakeInput * Database/Update, Database/Documents * *

Opens the document referred to by the given FileInfo for reading, * using external input filters if necessary.

 *

The returned stdio stream may refer to a pipe or to a file; * use LQT_DestroyFileInfo to close it and reclaim the memory.

 *
 * Returns: a stdio stream open for reading, or NULL on error.
 *
 * Errors: issues an error if a required external filter could not be started.
 *
 * See also: LQT_MakeFileInfo, LQT_DestroyFileInfo, LQT_GetFilterType
 *
*/ API FILE * LQT_MakeInput(db, FileInfo) t_LQTEXT_Database *db; t_FileInfo *FileInfo; { FILE *fp; if (FileInfo->FilterType > LQT_MaxFilterType(db)) { Error(E_WARN, "filter type %d for %s too high (max %d)", FileInfo->FilterType, FileInfo->Name, LQT_MaxFilterType(db)); return (FILE *) 0; } if (LQTpFilterTable[FileInfo->FilterType].Type != FileInfo->FilterType) { Error(E_FATAL|E_INTERNAL, "Filter table entry %d has type %d, expected %d", FileInfo->FilterType, LQTpFilterTable[FileInfo->FilterType].Type, FileInfo->FilterType ); } /* open the file, checking for gzip/compress: */ { int fd; if ((fd = LQT_UnpackAndOpen(db, FileInfo->Name)) < 0) { return (FILE *) NULL; } fp = fdopen(fd, "r"); } if (!LQTpFilterTable[FileInfo->FilterType].copyFile) { /* There was no special filter, so just open the file */ return fp; } else { /* In this case, we have to call the filter. * We make a temporary file, and * run the filter into that. * Then we open the temporary file. * Then we unlink the file. * Then we return the open file descrioptor. * * PORT: it's possible to set the close function in the * filter table to one that knows how to close a tmp file. * In this case, you'd need to save the tmp file name in the * FileInfo structure (add to h/fileinfo.h, LQT_MakeFileInfo and * LQT_DestroyFileInfo). Be careful not to remove the actual * data file whose name is also stored in the FileInfo!!!!!! */ char *theTmpFileName = tmpnam((char *) NULL); /* tmpnam returns a pointer to a private buffer that * gets overwritten each time. * It allows $TMPDIR to override the default directory in * which to create the file. 
*/ FILE *tmpf; tmpf = fopen(theTmpFileName, "w+"); if (!tmpf) { Error(E_WARN|E_SYS, "%s: can't create tmp file for filter, sorry", FileInfo->Name ); return (FILE *) NULL; } (* LQTpFilterTable[FileInfo->FilterType].copyFile)( db, fp, FileInfo->Name, tmpf ); /* TODO: check for error return */ (void) fclose(fp); /* Try to rewind the file, and, if that fails it, try to reopen it, * which is slower. * We use fseek() rather than rewind() because fseek returns -1 on * error, whereas the return value of rewind() is undocumented (on * SunOS 4.1 at least, and possibly other systems). */ (void) fflush(tmpf); if (fseek(tmpf, 0L, 0) == -1) { (void) fclose(tmpf); tmpf = fopen(theTmpFileName, "r"); /* don't use fEopen because it calls IsDir which does a * stat(), and that's too slow! */ if (!tmpf) { Error(E_WARN|E_SYS, "%s: can't read tmp file for filter, sorry", FileInfo->Name ); (void) unlink(theTmpFileName); return (FILE *) NULL; } } #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_FILTER_DATA)) { int ch; fprintf(stderr, "\n**** start filtered data %s\n", FileInfo->Name); while ((ch = getc(tmpf)) != EOF) { fprintf(stderr, "%c", ch); } fprintf(stderr, "**** end filtered data %s\n\n", FileInfo->Name); if (fseek(tmpf, 0L, 0) == -1) { (void) fclose(tmpf); tmpf = LQU_fEopen(E_WARN, theTmpFileName, "filter output", "r"); if (!tmpf) { Error(E_WARN, "%s: can't read tmp file for filter, sorry", FileInfo->Name ); (void) unlink(theTmpFileName); return (FILE *) NULL; } } } #endif /* unlink the file */ (void) unlink(theTmpFileName); /* return the fd */ return tmpf; } /*NOTREACHED*/ } /* * LQT_GetFilterName * Database/Documents * * Returns a short name describing the file type associated with * the given file. * The value is static, and should not be freed by the caller. 
* * LQT_GetFilterType * */ LIBRARY char * LQT_GetFilterName(db, FileInfo) t_LQTEXT_Database *db; t_FileInfo *FileInfo; { if (!FileInfo) { Error(E_WARN|E_BUG, "LQT_GetFilterType called with NULL FileInfo" ); return "(null)"; } if (FileInfo->FilterType < 0 || FileInfo->FilterType > LQT_MaxFilterType(db) ) { Error(E_WARN|E_BUG, "LQT_GetFilterType: invalid file type %d", FileInfo->FilterType ); return "(invalid)"; } if (!LQTpFilterTable) { LQTpInitReadOnlyPartOfFilterTable(db); } return LQTpFilterTable[FileInfo->FilterType].Name; } /* * LQT_GetFilterType * Database/Documents * * Determines the appropriate filter to use to read the file represented * by the given FileInfo; this is an internal routine and will be * replaced in the next release. * * LQT_UnpackAndOpen * */ LIBRARY int LQT_GetFilterType(db, FileInfo, StatBuf) t_LQTEXT_Database *db; t_FileInfo *FileInfo; struct stat *StatBuf; { struct stat SpareStatBuf; int Type = LQT_MaxFilterType(db) + 1; char Buffer[1024]; /* for reading a chunk of the file */ int AmountRead = 0; /* initialised for lint */ int fd = 0; /* LQT_GetFilterType() is called to determine which input filter (if any) * should be used to read a given file. * This routine should know about compressed files. * I'm ashamed of this routine. If you see this, I'll give you * the socks I'm wearing, if any. * * It currently knows about mail, news and SGML files. * This file should be dynamic. * * If the file should not be indexed at all (e.g. it's a core dump), * we return -1. 
*/ if (!FileInfo || !FileInfo->Name || !*(FileInfo->Name)) { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "LQT_GetFilter passed NULL or Empty Fileinfo" ); return (-1); } #ifdef FTYPE_SGML /* ISO 8879: Standard Generalised Markup Language */ { /* look for file names ending in .sgm(l) or .html(l) */ register char *p; char *dot = 0; for (p = FileInfo->Name; *p; p++) { if (*p == '.') { dot = p; } } if (dot) { --p; /* step back over the NUL */ dot++; /* stop over the dot */ if (*p == 'l' && p - dot == 3) { if (STREQ(dot, "html") || STREQ(dot, "sgml")) { Type = FTYPE_SGML; return (FileInfo->FilterType = Type); } } else if (*p == 'm' && p - dot == 2) { if (STREQ(dot, "htm") || STREQ(dot, "sgm")) { Type = FTYPE_SGML; return (FileInfo->FilterType = Type); } } } } #endif /* first crack at SGML */ if (!StatBuf) { if (stat(FileInfo->Name, &SpareStatBuf) < 0) { if ((fd = LQT_UnpackAndOpen(db, FileInfo->Name)) < 0) { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "unpackAndOpen(%s) failed", FileInfo->Name ); return (-1); } } StatBuf = &SpareStatBuf; } if (StatBuf->st_size < db->MinWordLength) { return -1; } if (!fd) { fd = LQT_UnpackAndOpen(db, FileInfo->Name); if (fd < 0) { fd = LQU_Eopen(E_WARN, FileInfo->Name, "input file", O_RDONLY, 0); if (fd < 0) { return -1; } } } AmountRead = read(fd, Buffer, sizeof(Buffer)); (void) close(fd); if (AmountRead < db->MinWordLength) { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%s too small -- not indexed", FileInfo->Name ); return -1; } #ifdef FTYPE_TROFF if (Buffer[0] == '.' || Buffer[0] == '\'') { Type = FTYPE_TROFF; return (FileInfo->FilterType = Type); } #endif #ifdef FTYPE_SGML /* ISO 8879: Standard Generalised Markup Language */ /* Assume that FilterType = Type); } #endif /* Try RFC-822 mail, or Usenet news. 
* mail files start with From; * news starts with From, Path or Relay-Version */ if (isupper(Buffer[0])) { Buffer[AmountRead] = '\0'; AmountRead--; if (Prefix("Xref: ", Buffer)) { return (FileInfo->FilterType = FTYPE_NEWS); } else if (Prefix("Newsgroups: ", Buffer)) { return (FileInfo->FilterType = FTYPE_NEWS); } else if (Prefix("Relay-Version: ", Buffer)) { return (FileInfo->FilterType = FTYPE_NEWS); } else if (Prefix("From", Buffer)) { if (LQU_StringContainedIn("\nPath: ", Buffer)) { /* bug: should only check header, not body! */ return FTYPE_NEWS; } else { return FTYPE_MAIL; } } else if (Prefix("Path: ", Buffer)) { if (LQU_StringContainedIn("\nNewsgroups: ", Buffer)) { return FTYPE_NEWS; } else { return FTYPE_MAIL; } } else if (Prefix("Return-Path: ", Buffer)) { return FTYPE_MAIL; /* MH-style mail */ } } #ifdef FTYPE_C_SOURCE /* look for C, trying not to get muddled up with shell scripts */ ch = FileInfo->Name[Length - 1]; if ((ch == 'c' || ch == 'h') && (Length > 2) && FileInfo->Name[Length - 2] == '.') { /* We could require one of * . a comment * . a #[ ^i]*(include|define|ifn?def|if)[ ^i]+ * . main[ ^i\n]*( * . a declaration -- int, char, long, unsigned, static * in the first block of the file. * Can't be bothered today. */ if (LQU_StringContainedIn("#line", Buffer)) { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%s contains #line and has been through cpp, index the original instead!", FileInfo->Name ); return FTYPE_C_SOURCE; } /* we are very predisposed to thinking of this as C... */ if (Prefix("#include", Buffer) || LQU_StringContainedIn("/*", Buffer) || LQU_StringContainedIn("#define", Buffer) || LQU_StringContainedIn("argc", Buffer) || LQU_StringContainedIn("()", Buffer) || LQU_StringContainedIn("#include", Buffer)) { return FTYPE_C_SOURCE; } } #endif /* FTYPE_C_SOURCE */ /* if still not done, choose between Don't Index and Ascii Filter * (which simply strips non-ascii characters). 
*/ if (Type >= LQT_MaxFilterType(db)) { register char *p; int OtherCount = 0; Type = FTYPE_DEFAULT; for (p = Buffer; p - Buffer < AmountRead; p++) { if (!*p) { /* If it has nulls in it, it isn't a normal file, * and we have no idea what to do with it! * (if we did know, it would have had a magic number, * so we wouldn't have got here) */ LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%s seems to be a binary file, contains NULs -- not indexed", FileInfo->Name ); Type = (-1); break; } if (!isascii(*p)) { OtherCount++; } } if (Type > 0) { if (OtherCount < (p - Buffer) / 5) { #ifdef FTYPE_MOSTLYASCII Type = (OtherCount) ? FTYPE_MOSTLYASCII : 0; #else Type = 0; #endif } else { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%s seems to be a binary file -- not indexed", FileInfo->Name ); Type = (-1); /* too much garbage */ } } } if (Type > LQT_MaxFilterType(db)) { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%s unknown file type %d -- not indexed", FileInfo->Name, Type ); Type = -1; /* don't index */ } return Type; }