/* common.c -- Copyright 1989, 1993, 1994, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* * $Id: common.c,v 1.17 1996/08/14 16:53:05 lee Exp $ * * Handle Common Word file * */ #include "error.h" #include #include #include #include "globals.h" /* defines and declarations for database filenames */ #include "fileinfo.h" #include "wordinfo.h" #include "wordrules.h" #include "emalloc.h" #include "lqutil.h" #include "liblqtext.h" #include "lqtrace.h" #ifdef HAVE_STRING_H # include #else # include #endif /** t_WordList is one node of a singly linked list of stop words; LQT_InsertCommonWord keeps each list in STRCMP order. FirstCharBitMap has bit (c & 07) of byte (c >> 5) set by LQT_InsertCommonWord for the first character c of each stop word; LQT_WordIsInStopList tests that bit before walking a list. **/ typedef struct s_WordList { char *Word; struct s_WordList *Next; } t_WordList; static char FirstCharBitMap[16] = { 0, }; /* * LQT_WordIsInStopList * Language/Stop List * * Returns 1 if the given word is in the stop list, 0 otherwise. * This function is called by the LQT_ReadWord routines on each * input word to determine whether to return it. * * *
  • 1 if the word is in the stop list *
  • 0 otherwise * * * FirstCharBitMap is shared across all databases. * You cannot have more than one database open at a time anyway at * the moment, so this is not yet an issue... * */ API int LQT_WordIsInStopList(db, WordInfo) t_LQTEXT_Database *db; t_WordInfo *WordInfo; { register char *Word = WordInfo->Word; register t_WordList **WP; if (WordInfo->Length > 0 && WordInfo->Word[0] == LQT_CHAR_TO_IGNORE) { return 1; } WP = (WordInfo->Word[0] < 'm') ? (t_WordList **) &db->CommonWordsLow : (t_WordList **) &db->CommonWordsHigh; if (!WP || (FirstCharBitMap[WordInfo->Word[0] >> 5] & (01 << (WordInfo->Word[0] & 07))) == 0) { return 0; } for (; *WP; WP = &(*WP)->Next) { int i = STRCMP((*WP)->Word, Word); if (i == 0) return 1; /* yes, it's common */ else if (i > 0) return 0; } return 0; } /* * LQT_ReadStopList * Language/Stop List * * Reads the named file, and adds any words found in it * to the in-memory stop list, to be ignored by LQT_ReadWord. * * *
  • the number of words added on success; *
  • -1 if the file couldn't be opened. * * * Warns if the file can't be opened. * * LQT_WordIsInStopList * * There is no way to clear the stop list; you can only add to it. * The current implementation is inefficient if there are more than * ten or so words. * * A future release may support a `go list' of phrases every word of * which is to be indexed. * */ API int LQT_ReadStopList(db, CommonFile) t_LQTEXT_Database *db; CONST char *CommonFile; { FILE *fd; char Buffer[1024]; t_WordInfo W; char *Root; char *Word; fd = LQU_fEopen(E_WARN, CommonFile, "common word list", "r"); if (fd == (FILE *) NULL) { return -1; } /* We guess a suitable starting size for a buffer to hold a Word; * if this is too small, we dump core right now! * * TODO: use ReadLine and ReadWord... */ Word = emalloc("stopword", (unsigned int) (db->MaxWordLength * 2 + 4)); while (fgets(Buffer, sizeof(Buffer), fd) != (char *) 0) { register char *p; char *Start; for (p = Buffer; *p; p++) { if (*p == '#') break; if (LQT_StartsWord(db, *p)) break; } if (*p == '#' || !*p) { continue; } Start = p; for (; *p; p++) { if (!LQT_EndsWord(db, *p)) { if (LQT_OnlyWithinWord(db, *p) && LQT_EndsWord(db, p[1])) { continue; } break; } } if (p - Start + 1 < db->MinWordLength) { continue; } *p = '\0'; /* delete trailing \n or whatever */ (void) strcpy(Word, Start); W.Word = Word; W.Length = p - Start; /* length excludes the \0 */ W.Flags = 0; W.WordPlace.Flags = 0; Root = LQT_WORDROOT(db, &W); LQT_InsertCommonWord(db, Root); } (void) efree(Word); (void) fclose(fd); #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_READWORD)) { t_WordList *WP; for (WP = (t_WordList *) db->CommonWordsLow; WP; WP = WP->Next) { fprintf(stderr, "%s: Ignore \"%s\"\n", CommonFile, WP->Word); } for (WP = (t_WordList *) db->CommonWordsHigh; WP; WP = WP->Next) { fprintf(stderr, "%s: Ignore \"%s\"\n", CommonFile, WP->Word); } } #endif /*ASCIITRACE*/ return 0; } /* * LQT_InsertCommonWord * Language/Stop List * * The given word will be ignored by 
LQT_ReadWord. * Note that if you ignore different words on retrieval than on indexing, * lq-text will not be able to locate the exact text of matches, and * phrase matching may have unexpected results. * You should therefore not modify the stoplist once you have * created an index. * * The common list is shared by all lq-text databases. * There is no way to remove a word from the stoplist. * * LQT_ReadStopList, LQT_WordIsInStopList * */ /*ARGSUSED2*/ API void LQT_InsertCommonWord(db, Root) t_LQTEXT_Database *db; char *Root; { register t_WordList **WP; t_WordList *W; if (Root[0] == LQT_CHAR_TO_IGNORE) { return; } WP = (Root[0] < 'm') ? (t_WordList **) &db->CommonWordsLow : (t_WordList **) &db->CommonWordsHigh; for (; *WP; WP = &(*WP)->Next) { int i = STRCMP((*WP)->Word, Root); if (i == 0) return; else if (i > 0) break; } /* insert it before this one! */ W = (*WP); (*WP) = (t_WordList *) emalloc("Common Word Entry", sizeof(t_WordList)); (*WP)->Word = emalloc("Common Word", strlen(Root) + 1); (void) strcpy((*WP)->Word, Root); (*WP)->Next = W; /* set the corresponding bit in the bitmap */ FirstCharBitMap[Root[0] >> 5] |= (01 << (Root[0] & 07)); return; }