/* readword.c -- Copyright 1993, 1994, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* readword -- read the next word of input * Liam Quin, August 1993 and later... * (but Taken from lqaddfile.c, written in 1989....) * * $Id: readword.c,v 1.26 2001/05/31 03:50:13 liam Exp $ */ #include "error.h" #include "globals.h" /* defines and declarations for database filenames */ #include #include #include #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #include "fileinfo.h" #include "wordinfo.h" #include "wordrules.h" #include "emalloc.h" #include "addfile.h" #include "liblqtext.h" #include "lqtrace.h" #include "filter.h" /** System calls and library routines used in this file: **/ /** System calls: **/ /** Library Functions: **/ /** Functions within this file that need declaring: **/ PRIVATE t_WordInfo *LQTp_ReadWord( # ifdef HAVE_PROTO t_LQTEXT_Database *db, FILE *Stream, unsigned int Flags # endif ); PRIVATE void NewFile( # ifdef HAVE_PROTO t_LQTEXT_Database *db, t_FID FID #endif ); /**/ static int LastChar = 0; static char *ThisWord = 0; static char *ThatWord = 0; static unsigned long BytesRead = 0; static int StringMode = 0; /* Actual character input -- dependent on StringMode; * If we are reading a string, we advance the pointer; if we are * reading a file, we use getchar. In either case, we keep careful * track of the number of bytes read. * * Note that there's no ungetc here -- if it is needed, it will have * to decrement BytesRead, of course. * * Finally, note that when we get to the end of a string, we have to * make GetChar() return EOF. */ #define GetChar(Stream) \ ((StringMode) ? \ (* (char **) (Stream) >= EndPointer) ? 
\ EOF : (++BytesRead, *((*(char **) (Stream))++)) \ : \ (++BytesRead, getc(Stream))) static char *WordStart = 0; static char *EndPointer = 0; /* * LQT_ReadWordFromStringPointer * Database/Retrieval, Database/Documents * *

Returns the next natural-language word from the given * NUL-terminated string.

*

The definition of a word for the purpose of this routine is * determined partly by the definitions for LQT_StartsWord, * LQT_OnlyWithinWord and LQT_EndsWord in the header file * wordrules.h, and partly on the configuration * file in the database directory, where indexnumbers, minwordlength * and maxwordlength may be set.

*

If Stringpp is a NULL pointer (callers conventionally pass null for all
 * of the arguments), the effect is to reset the routine ready to start a
 * new string; a null pointer is returned in that case.

*

The given Flags argument may either be zero or any combination * of LQT_READWORD_IGNORE_COMMON and LQT_READWORD_WILDCARDS, or'd * together.

*

Characters are read from the string, incrementing *Stringpp as each * byte is processed, until a recognised word is found. * If the LQT_READWORD_IGNORE_COMMON flag was set in Flags, * LQT_ReadWordFromStringPointer continues until either a word is * found that has not been registered as being too common to index, * or the end of the string is reached.

*

If Startp is not a NULL pointer, *Startp is set to point to * the first character in the word that has been found in the given * *Stringpp (not to the malloc'd copy in the result).

*

If Endp is a NULL pointer, the string is considered to be * terminated by the first zero byte reached; otherwise, Endp must * point to the first character not in the string; normally, Endp * would be set to point to the terminating NUL byte.

*

If the LQT_READWORD_WILDCARDS flag is set, the `Wild Card' * characters * and ? are allowed within words. Such characters * do not count as punctuation for the returned WordInfo flags.

 *
 * Returns: the next WordInfo on success, or zero if there are no more words
 * to read in the string.
 *
 *

All client programs and library routines which parse words * use this routine or the companion LQT_ReadWordFromFileInfo routine. * This is very important, because lq-text relies on word counts * within each block of text to be the same on retrieval as they * were on indexing, and if different routines parsed the data each * time there would be a chance of discrepancies.

* * The interface to this routine is somewhat ugly, and may be changed * in the next release with the addition of a Reset routine and a * block offset counter. *
*/ API t_WordInfo * LQT_ReadWordFromStringPointer(db, Stringpp, Startp, Endp, Flags) t_LQTEXT_Database *db; char **Stringpp; char **Startp; CONST char *Endp; unsigned int Flags; { t_WordInfo *Result; if (!Stringpp) { NewFile(db, (t_FID) 0); return (t_WordInfo *) 0; } if (!*Stringpp) { Error(E_FATAL|E_INTERNAL, "LQT_ReadWordFromStringPointer: Stringpp points to null" ); } if (Endp) { EndPointer = Endp; } else if (!EndPointer) { EndPointer = (*Stringpp); while (*EndPointer) { ++EndPointer; } } StringMode = 1; if (Startp) { WordStart = (*Stringpp); } Result = LQTp_ReadWord(db, (FILE *) Stringpp, Flags); StringMode = 0; if (Startp) { *Startp = WordStart; } return Result; } /* use two static storage areas so we can be called twice in a row. * This is necessary to implement the WPF_LASTINBLOCK flag. */ static t_WordInfo This, That; static unsigned long WordInBlock = ~0L; /* Flags are kept in two ways: * there can be a flag bit pending that says that the next word * succesfully read (if any) from the current file will have that bit set; * there can also be a bit set that will go onto the previous word read. * This last is the reason that we need two static WordInfo structs, so * that we can set a flag bit on the previous one before it is stored. */ static t_WordFlags PendingFlagsForNextWord = 0; static t_WordFlags PendingFlagsForThisWord = 0; static long LastPos = 0L; static int BlockInFile = 0L; static unsigned long LastBlock; /* * LQT_ReadWordFromFileInfo * Database/Retrieval, Database/Documents * * The same as LQT_ReadWordFromStringPointer, but uses a FILE * that * the caller has created in the given t_FileInfo structure. * * See LQC_MakeInput in the lqaddfile client for one way to create * a FileInfo; that routine will move into the API in a future release, * but probably with slight changes to its interface. 
* * LQT_ReadWordFromStringPointer * */ API t_WordInfo * LQT_ReadWordFromFileInfo(db, FileInfo, Flags) t_LQTEXT_Database *db; t_FileInfo *FileInfo; unsigned int Flags; { static t_FID LastFid = ~(t_FID)0; if (FileInfo->FID != LastFid) { LastFid = FileInfo->FID; NewFile(db, LastFid); } return LQTp_ReadWord( db, FileInfo->Stream, Flags ); } PRIVATE void NewFile(db, FID) t_LQTEXT_Database *db; t_FID FID; { if (!ThisWord) { #define WORD_SLOP 10 /* enough to contain the longest plural string */ ThisWord = emalloc("ReadWord:1", db->MaxWordLength + WORD_SLOP + 2); ThatWord = emalloc("ReadWord:2", db->MaxWordLength + WORD_SLOP + 2); /* NOTE: LQT_WORDROOT may extend a word by up to two characters. */ } This.WID = (t_WID) 0; This.WordPlaces = (t_WordPlace *) 0; This.DataBlock = (unsigned char *) 0; This.WordPlacesInHere = 0; This.WordPlace.FID = This.FID = FID; That = This; /* structure copy */ /* the two structures differ in having pointers to two different * string buffers, so we have to restore the pointers: */ This.Word = ThisWord; That.Word = ThatWord; WordInBlock = (unsigned long) -1; /* none, yet! */ LastPos = BlockInFile = LastBlock = 0L; BytesRead = 0L; PendingFlagsForNextWord = 0L; PendingFlagsForThisWord = 0L; /* end pointer for EOF detection when reading a string: */ EndPointer = 0; } PRIVATE t_WordInfo * LQTp_ReadWord(db, Stream, Flags) t_LQTEXT_Database *db; FILE *Stream; unsigned int Flags; { static int ThisOrThat = 0; register char *q; t_WordInfo *WordInfo; char *Buffer; char *EndOfBuffer; int ch; unsigned long Start; int WildCardWord = 0; int SkippedChars = 0; if (ThisOrThat) { WordInfo = &This; WordInfo->Word = ThisWord; } else { WordInfo = &That; WordInfo->Word = ThatWord; } /* We loop until we have reached EOF, or until we have read a complete * word that can be returned. * Depending on the value of our Flags argument, we may actually read * any number of words that we discard because they are in the stoplist. 
*/ for (;;) { q = Buffer = WordInfo->Word; /* The next returned word gets any flags that related to * stuff after the previous word, e.g. if the last character read * was punctuation. * We don't carry over the Uppercase flag, though, that's only * set if the word itself is upper case. Other such flags include * the ones set by LQT_WORDROOT, but they don't get put into * PendingFlag in the first place. */ PendingFlagsForThisWord |= PendingFlagsForNextWord; PendingFlagsForNextWord = 0L; PendingFlagsForThisWord &= ~(t_WordFlags) (WPF_UPPERCASE|WPF_POSSESSIVE|WPF_WASPLURAL); /* Skip non-word characters */ for (;;) { ch = GetChar(Stream); EOFtop: if (ch == EOF) { unsigned long FlagsForLastWord; /* Compute any additional flags needed for the previous * word, and then return EOF... */ /* You could argue that we should set the LASTINBLOCK flag, * since we have reached EOF and there are no more words. * The only use for that flag is to allow a phrase to * match over a block boundary. But a phrase can't really * be expected to match over an EOF boundary! * * None the less, we set the flag to be consistant: */ FlagsForLastWord = WPF_LASTINBLOCK; /* Now add in any other flags: */ if (PendingFlagsForThisWord & WPF_LASTHADPUNCT) { FlagsForLastWord |= WPF_NEXTHASPUNCT; } if (PendingFlagsForThisWord & WPF_LASTWASCOMMON) { FlagsForLastWord |= WPF_NEXTISCOMMON; } if (FlagsForLastWord) { if (WordInfo == &This) { That.Flags |= FlagsForLastWord; } else { This.Flags |= FlagsForLastWord; } } #ifdef ASCIITRACE LQT_Trace(LQTRACE_READWORD, "EOF"); #endif return (t_WordInfo *) 0; } if (ch == LQT_CHAR_TO_SKIP) { SkippedChars++; continue; } if (LQT_STARTS_WORD(db, ch)) { break; } else if (LQT_ISDIGIT(db, ch)) { /* In this case, if a digit is not a word start character, * we are not indexing words. * * Hence, a number is something to skip. * We have to skip it all in one go, though, as otherwise * if we treated 0 as punctuation, 0xFF would be indexed * as xFF (just like -xFF is now, correctly). 
* Also, 3.14159 is only one `word'... * (or is it?) * On the other hand, tc|wyse300|xx| includes "wyse300" * or "wyse" (depending on whether digits are allowed * inside words), but not "tc|wyse300|xx|". */ Start = BytesRead - 1; if (StringMode) { WordStart = (*(char ** )Stream) - 1; } /* Note: we don't allow ' inside a number. * A number is a digit followed by any number * of letters and digits. * * 15.5e+601 is treated as three words: * 15.5e * 601 * This isn't optimal, but it's hard to recognise floating * point numbers well, and we also want to treat * 15e6+301 * as two numbers (16#15E6 and 301). * A . separates the words, so that multiple dots are * treated correctly, e.g. as in "1992...today". */ do { if ((ch = GetChar(Stream)) == EOF) { goto EOFtop; } } while (LQT_ISDIGIT(db, ch) || LQT_ENDS_WORD(db, ch)); } /* If we are about to skip punctuation, the next word read, * which will be "This", will have the LAST_HAD_PUNCT flag: */ if (LQT_ISPUNCT(db, ch)) { if ((Flags & LQT_READWORD_WILDCARDS) && (ch == '*' || ch == '?') ) { WildCardWord = 1; } else { PendingFlagsForThisWord |= WPF_LASTHADPUNCT; } } } /* for-loop */ /* ASSERT: we have read at least one non-EOF character... */ Start = BytesRead - 1; Start -= SkippedChars; SkippedChars = 0; EndOfBuffer = &Buffer[db->MaxWordLength + WORD_SLOP - 1]; if (StringMode) { WordStart = (*(char ** )Stream) - 1; } /* Here we start reading the actual word. * We already have the first character in ch. */ for (;;) { PendingFlagsForThisWord |= LQT_ISUPPER(db, ch); /* Note: IS_UPPER returns 0 or WPF_UPPERCASE; * LQT_TOLOWER is efficient enough a macro that it's * not worth checking if the character is upper case * or not -- if it isn't, we get the actual character. 
*/ LastChar = LQT_TOLOWER(db, ch); /* Truncate if necessary: */ if (q < EndOfBuffer) { *q++ = LastChar; } if ((ch = GetChar(Stream)) == EOF) { break; } if (LQT_ENDS_WORD(db, ch)) { continue; /* OK, still inside the word */ } else if (LQT_ONLY_WITHIN_WORD(db, ch)) { if (LQT_ENDS_WORD(db, LastChar)) { continue; /* OK, e.g. ch is ' in can't, t will be next */ } else { /* e.g. just seen 2nd ' in boy'' at end of quotation. * Reject both this and the previous character: */ /* reject previous character: * Note: we have at least 2 chars in the buffer. * * We make sure the last character in the buffer is * the same as the last character read before eliding it: */ if (q <= EndOfBuffer && q[-1] == LastChar) { if (LQT_ISPUNCT(db, LastChar)) { PendingFlagsForThisWord |= WPF_NEXTHASPUNCT; } *--q = '\0'; } break; } } else if ((Flags & LQT_READWORD_WILDCARDS) && (ch == '?' || ch == '*') ) { WildCardWord = 1; continue; } else { /* Not a character allowed anywhere within a word, * and not a wildcard character either. 
*/ if (LQT_ONLY_WITHIN_WORD(db, LastChar)) { /* reject the previous character: */ if (q <= EndOfBuffer && q[-1] == LastChar) { if (LQT_ISPUNCT(db, LastChar)) { PendingFlagsForThisWord |= WPF_NEXTHASPUNCT; } *--q = '\0'; } } break; } } /* end for(;;) */ WordInfo->Length = q - Buffer; *q = '\0'; if (ch == EOF) { ch = 0; } else if (LQT_ISPUNCT(db, ch)) { if ((Flags & LQT_READWORD_WILDCARDS) && (ch == '*' || ch == '?') ) { WildCardWord = 1; } else { PendingFlagsForThisWord |= WPF_NEXTHASPUNCT; } } if (WordInfo->Length < db->MinWordLength) { register char *p; if (db->WordFlags & (WPF_LASTHADLETTERS & WPF_LASTHADPUNCT)) { for (p = Buffer; p < q; p++) { if (LQT_ISALPHA(db, *p)) { PendingFlagsForThisWord |= WPF_LASTHADLETTERS; } else if (LQT_ISPUNCT(db, *p)) { if ((Flags & LQT_READWORD_WILDCARDS) && (ch == '*' || ch == '?') ) { WildCardWord = 1; } else { PendingFlagsForThisWord |= WPF_LASTHADPUNCT; } } } } if (PendingFlagsForThisWord & WPF_NEXTHASPUNCT) { /* We read one extra character after the end of the last * word, and it turned out to be punctuation. So if * we had accepted that word, there would have been * punctuation after it. But we rejected it. Hence, that * punctuation will now be before the next word read... 
*/ PendingFlagsForThisWord &= ~WPF_NEXTHASPUNCT; PendingFlagsForThisWord |= WPF_LASTHADPUNCT; } continue; } /* We have to do this now to get common words right, * but we've left it as late as possible in order to hope to * avoid doing the relatively expensive division: */ BlockInFile = Start / LQT_FileBlockSize(db); if (BlockInFile != LastBlock) { LastBlock = BlockInFile; WordInBlock = (unsigned long) (-1); } WordInfo->WordPlace.Flags = 0; if (!WildCardWord) { if (WordInfo->Length < db->MaxWordLength + WORD_SLOP) { (void) LQT_WORDROOT(db, WordInfo); } } if (WordInfo->Length > db->MaxWordLength) { WordInfo->Length = db->MaxWordLength; WordInfo->Word[db->MaxWordLength] = '\0'; } #if POSSIBLE_FUTURE_CHANGE /* Currently, LQT_WORDROOT never does this: */ if (WordInfo == &This) { if (WordInfo->Word != ThisWord) { /* it was grown by LQT_WORDROOT() */ ThisWord = WordInfo->Word; } } else { if (WordInfo->Word != ThatWord) { /* it was grown by LQT_WORDROOT() */ ThatWord = WordInfo->Word; } } #endif if (WordInfo->Word[0] == LQT_CHAR_TO_IGNORE || WordInfo->Word[0] == LQT_DIGIT_TO_IGNORE ) { PendingFlagsForThisWord |= WPF_LASTWASCOMMON; WordInBlock++; continue; } else if (Flags & LQT_READWORD_IGNORE_COMMON) { if ((db->CommonWordsHigh || db->CommonWordsLow) && LQT_WordIsInStopList(db, WordInfo) ) { PendingFlagsForThisWord |= WPF_LASTWASCOMMON; WordInBlock++; continue; } } /* Now we have got a word to return... 
*/ { t_WordFlags FlagsForThisWord; t_WordFlags FlagsForLastWord = 0; FlagsForThisWord = PendingFlagsForThisWord; if (PendingFlagsForThisWord & WPF_LASTHADPUNCT) { FlagsForLastWord |= WPF_NEXTHASPUNCT; } if (PendingFlagsForThisWord & WPF_NEXTHASPUNCT) { PendingFlagsForNextWord |= WPF_LASTHADPUNCT; } if (PendingFlagsForThisWord & WPF_LASTWASCOMMON) { FlagsForLastWord |= WPF_NEXTISCOMMON; } PendingFlagsForThisWord &= db->WordFlags; WordInfo->WordPlace.Flags |= FlagsForThisWord; PendingFlagsForThisWord = 0L; if (WordInfo->WordPlace.Flags & WPF_NEXTHASPUNCT) { PendingFlagsForThisWord |= WPF_LASTHADPUNCT; } /* If the previous word was followed by punctuation or * by a common word, * say so here: */ FlagsForLastWord &= db->WordFlags; if (FlagsForLastWord) { if (WordInfo == &This) { That.Flags |= FlagsForLastWord; That.WordPlace.Flags |= FlagsForLastWord; } else { This.Flags |= FlagsForLastWord; This.WordPlace.Flags |= FlagsForLastWord; } } /* StuffBefore is the number of characters between the end of the * last word and the start of this one. */ if (Start > 1L) { if (Start - LastPos <= 0) { /* save a byte in the index */ WordInfo->WordPlace.StuffBefore = 1; } else if (Start - (LastPos + 1) >= 077) { /* We are going to use the top two bits for flags, * so we can't store a larger number than this. * Actually, that's OK, because users are unlikely to * type 100 blanks in a query.... the time it's likely * is if there are several common words in a row, but * even there 63 seems plenty, and I might reduce it * to 31 later to add another flag. * 16 is getting a bit small, though, as two or three * common (stop) words in a row could easily be that long. 
*/ WordInfo->WordPlace.StuffBefore = 077; } else { WordInfo->WordPlace.StuffBefore = (Start - LastPos); } } else { WordInfo->WordPlace.StuffBefore = 1; /* i.e., the default */ } WordInfo->WordPlace.WordInBlock = (++WordInBlock); WordInfo->WordPlace.BlockInFile = BlockInFile; WordInfo->Word[WordInfo->Length] = '\0'; LastPos = BytesRead - 1; ThisOrThat = !ThisOrThat; /* toggle between 0 and 1. Boring life, really */ break; } } /* forever */ #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_READWORD)) { LQT_Trace(LQTRACE_READWORD, "%s/%d flags 0%o %s", WordInfo->Word, WordInfo->Length, WordInfo->WordPlace.Flags, LQT_WordFlagsToString(db, (t_WordFlags) WordInfo->WordPlace.Flags) ); } #endif return WordInfo; }