/* query.c -- Copyright 1996 Liam R. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* * Match a query including wildcard support * * $Id: query.c,v 1.14 2001/05/31 03:50:13 liam Exp $ * */ #include "error.h" #include "globals.h" /* defines and declarations for database filenames */ #ifndef FILE # include /* stderr, also for fileinfo.h */ #endif #include #ifdef HAVE_SYSV_FCNTL_H # include #endif #ifdef HAVE_FCNTL_H #include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #include #ifdef HAVE_STRING_H # include #else # include #endif #include "emalloc.h" /* for efree() */ #include "fileinfo.h" /* for wordinfo.h */ #include "wordinfo.h" #include "wordrules.h" #include "pblock.h" #include "phrase.h" #include "lqutil.h" #include "liblqtext.h" #include "lqtrace.h" /** **/ #define QT_NORMAL 00 #define QT_WILDCARD 01 #define TermIsPattern(QT) ((QT)->flags & QT_WILDCARD) #define QT_EXPANDED 02 #define TermIsExpanded(QT) ((QT)->flags & QT_EXPANDED) typedef struct { /* this is internal to query.c and can't be accessed except * through the API. */ t_QueryItem *Items; int NumberOfTerms; int NumberOfWordsAfterExpansion; int HasUnknownWords; /* could be flags byte */ } t_QueryData; API t_ExpandedQueryList * LQT_ExpandQueryItem(db, Item) t_LQTEXT_Database *db; t_QueryItem *Item; { t_ExpandedQueryList *Expansion = 0; t_ExpandedQueryList **Expansionp = &Expansion; t_WID WID; t_WID MaxWID; unsigned char *Pattern; int PatternLength; int PrefixLength; long totalWords = 0; unsigned char *PatternData; LQT_Trace(LQTRACE_TERM_EXPANSION, "Expand %*.*s\n", Item->WordInfo->Length, Item->WordInfo->Length, Item->WordInfo->Word ); if (!TermIsPattern(Item)) { Error(E_WARN, "LQT_ExpandQueryItem called for non-pattern \"%*.*s\"", Item->WordInfo->Length, Item->WordInfo->Length, Item->WordInfo->Word ); return (t_ExpandedQueryList *) NULL; } if (TermIsExpanded(Item)) { LQT_Trace(LQTRACE_TERM_EXPANSION, "Expand %*.*s: already expanded\n", Item->WordInfo->Length, Item->WordInfo->Length, Item->WordInfo->Word ); return Item->Expansion; } MaxWID = LQT_GetMaxWID(db); Pattern = (unsigned char *) Item->WordInfo->Word; PatternLength = Item->WordInfo->Length; PatternData = LQT_PrepareWildCardForMatching( db, Pattern, (int) Item->WordInfo->Length, &PrefixLength ); /* mark as expanded: */ Item->flags |= QT_EXPANDED; /* find the first word in widfile matching the given pattern */ WID = LQT_FindFirstWIDMatchingPattern( db, Pattern, PatternLength, PrefixLength, LQT_MatcherForWildCards, PatternData ); if (WID == 0L) { /* no matches at all */ LQT_FinishWildCardAfterMatching(db, PatternData); LQT_Trace(LQTRACE_TERM_EXPANSION, "Expand %*.*s: no match.\n", Item->WordInfo->Length, Item->WordInfo->Length, Item->WordInfo->Word ); return (t_ExpandedQueryList *) 0; } /* step through all of them */ do { t_WordInfo *WP = LQT_WIDToWordInfo(db, WID); WP = LQT_WIDToWordInfo(db, WID); if (!WP) { continue; } if (WP->NumberOfWordPlaces == 0) { LQT_DestroyWordInfo(db, WP); continue; } /* got a plausible entry */ *Expansionp = (t_ExpandedQueryList *) emalloc("query term", sizeof(t_ExpandedQueryList) ); (*Expansionp)->Next = (t_ExpandedQueryList *) NULL; (*Expansionp)->WordInfo = LQT_WIDToWordInfo(db, WID); (*Expansionp)->NumberOfOccurrences = (*Expansionp)->WordInfo->NumberOfWordPlaces; ++totalWords; if (LQT_TraceFlagsSet(LQTRACE_TERM_EXPANSION)) { char *w = LQT_WIDToWord(db, WID); if (!w) { Error(E_FATAL, "LQT_WIDToWord(%ld) failed", WID); } LQT_Trace(LQTRACE_TERM_EXPANSION, "Expand %*.*s: %d: WID %ld: \"%s\"\n", Item->WordInfo->Length, Item->WordInfo->Length, Item->WordInfo->Word, totalWords, WID, w ); } if (Item->WordInfo->Flags || Item->WordInfo->WordPlace.Flags) { /* how could this happen? */ Item->WordInfo->Flags = 0; Item->WordInfo->WordPlace.Flags = 0; } /* TODO: if (too many occurrences) die */ Expansionp = &((*Expansionp)->Next); WID = LQT_FindNextWIDMatchingPattern( db, WID, Pattern, PatternLength, PrefixLength, LQT_MatcherForWildCards, PatternData ); } while (WID != 0 && WID < MaxWID); LQT_FinishWildCardAfterMatching(db, PatternData); Item->totalMatches = totalWords; LQT_Trace(LQTRACE_TERM_EXPANSION, "Expand %*.*s: total different words matching pattern: %ld\n", Item->WordInfo->Length, Item->WordInfo->Length, Item->WordInfo->Word, totalWords ); return Expansion; } /* * LQT_ParseQuery * Retrieval/Matching, Retrieval/Phrases * *

Parses the given string, and returns a Query object. * * The new Query object, or LQT_BadQuery on error. * * LQT_StringToPhrase * */ API t_LQT_Query * LQT_ParseQuery(db, theString) t_LQTEXT_Database *db; char *theString; { t_LQT_Query *theResult = (t_LQT_Query *) emalloc( "new query", sizeof(t_LQT_Query) ); char *Start, *End; int isPattern; t_WordInfo *WordInfo; t_WordInfo *LastWord = 0; t_QueryData *theData; t_QueryItem **Itempp, *theItem = 0; int queryLength; /* reset the word reader: */ (void) LQT_ReadWordFromStringPointer( db, (char **) NULL, (char **) NULL, (char *) NULL, 0 ); queryLength = strlen(theString); Start = theResult->Query = emalloc(theString, queryLength + 2); (void) strcpy(Start, theString); End = &Start[queryLength]; theResult->internalData = (unsigned char *) emalloc( "query data", sizeof(t_QueryData) ); theData = (t_QueryData *) theResult->internalData; theData->Items = (t_QueryItem *) 0; theData->HasUnknownWords = 0; Itempp = &(theData->Items); /* Strategy: * add words, one at a time, but looking for * A trailing * or ? after a word --> wildcard * */ for (;; LastWord = WordInfo) { isPattern = 0; WordInfo = LQT_ReadWordFromStringPointer( db, &Start, (char **) NULL, End, LQT_READWORD_IGNORE_COMMON|LQT_READWORD_WILDCARDS ); if (WordInfo == (t_WordInfo *) NULL) { break; } if (!LastWord) { continue; } /* mark the word as a wildcard if appropriate */ { register char *p; for (p = LastWord->Word; p - LastWord->Word < LastWord->Length; p++ ) { if (*p == '*' || *p == '?') { isPattern = 1; break; } } } if (isPattern) { LQT_Trace(LQTRACE_TERM_EXPANSION, "ParseQuery: wildcard term: %*.*s, F %s", LastWord->Length, LastWord->Length, LastWord->Word, LQT_WordFlagsToString(db, LastWord->WordPlace.Flags) ); theItem = *Itempp = (t_QueryItem *) emalloc( Start, sizeof(t_QueryItem) ); (*Itempp)->Expansion = 0; (*Itempp)->Next = 0; (*Itempp)->totalMatches = 0; (*Itempp)->flags = QT_WILDCARD; (*Itempp)->WordInfo = LQT_MakeWordInfo( db, (t_WID) -1, (int) LastWord->Length, (unsigned char *) LastWord->Word ); (*Itempp)->WordInfo->Flags = LastWord->WordPlace.Flags; ((*Itempp)->WordInfo->WordPlace) = LastWord->WordPlace; /*struct copy*/ theItem->Expansion = LQT_ExpandQueryItem(db, theItem); Itempp = &(*Itempp)->Next; } else { LQT_Trace(LQTRACE_TERM_EXPANSION, "ParseQuery: constant term: %*.*s", LastWord->Length, LastWord->Length, LastWord->Word ); LastWord->WID = LQT_WordToWID(db, LastWord->Word, LastWord->Length); if (LastWord->WID > 0) { t_WordInfo *W = LQT_WIDToWordInfo(db, LastWord->WID); if (W == (t_WordInfo *) 0) { theData->HasUnknownWords++; /* Note: if we're doing precise matching, we could * actually give up here... */ } else { W->WordPlace = LastWord->WordPlace; /* struct copy */ W->WordPlace.Flags = W->Flags; *Itempp = (t_QueryItem *) emalloc( Start, sizeof(t_QueryItem) ); (*Itempp)->Expansion = 0; (*Itempp)->Next = 0; (*Itempp)->totalMatches = 0; (*Itempp)->flags = 0; (*Itempp)->WordInfo = W; (*Itempp)->totalMatches = W->NumberOfWordPlaces; Itempp = &(*Itempp)->Next; } } else { theData->HasUnknownWords++; continue; } } } if (LastWord) { /* mark the word as a wildcard if appropriate */ register char *p; for (p = LastWord->Word; p - LastWord->Word < LastWord->Length; p++ ) { if (*p == '*' || *p == '?') { isPattern = 1; break; } } if (isPattern) { LQT_Trace(LQTRACE_TERM_EXPANSION, "ParseQuery: wildcard term: %*.*s", LastWord->Length, LastWord->Length, LastWord->Word ); theItem = *Itempp = (t_QueryItem *) emalloc( Start, sizeof(t_QueryItem) ); (*Itempp)->Expansion = 0; (*Itempp)->Next = 0; (*Itempp)->totalMatches = 0; (*Itempp)->flags = QT_WILDCARD; (*Itempp)->WordInfo = LQT_MakeWordInfo( db, (t_WID) -1, (int) LastWord->Length, (unsigned char *) LastWord->Word ); (*Itempp)->WordInfo->Flags |= LastWord->Flags; ((*Itempp)->WordInfo->WordPlace) = LastWord->WordPlace; /*struct copy*/ theItem->Expansion = LQT_ExpandQueryItem(db, theItem); Itempp = &(*Itempp)->Next; } else { LQT_Trace(LQTRACE_TERM_EXPANSION, "ParseQuery: constant term: %*.*s", LastWord->Length, LastWord->Length, LastWord->Word ); LastWord->WID = LQT_WordToWID(db, LastWord->Word, LastWord->Length); if (LastWord->WID > 0) { t_WordInfo *W = LQT_WIDToWordInfo(db, LastWord->WID); if (W == (t_WordInfo *) 0) { theData->HasUnknownWords++; /* Note: if we're doing precise matching, we could * actually give up here... */ } else { W->WordPlace = LastWord->WordPlace; /* struct copy */ W->WordPlace.Flags |= W->Flags; *Itempp = (t_QueryItem *) emalloc( Start, sizeof(t_QueryItem) ); (*Itempp)->Expansion = 0; (*Itempp)->Next = 0; (*Itempp)->totalMatches = 0; (*Itempp)->flags = 0; (*Itempp)->WordInfo = W; (*Itempp)->totalMatches = W->NumberOfWordPlaces; Itempp = &(*Itempp)->Next; } } else { theData->HasUnknownWords++; } } } /* if we've got anything, return it */ return theResult; } static t_FID *FilesInShortestEntry = 0; static t_FID *FilesInShortestEntryPointer; PRIVATE int ReadOnlyAcceptWordFunc(db, WID, WordPlace) t_LQTEXT_Database *db; t_WID WID; t_WordPlace *WordPlace; { while (*FilesInShortestEntryPointer < WordPlace->FID) { /* not there yet */ if (*FilesInShortestEntryPointer == (t_FID) 0) { return 0; } ++FilesInShortestEntryPointer; } if (*FilesInShortestEntryPointer > WordPlace->FID) { /* gone past it */ return 0; } else if (*FilesInShortestEntryPointer == WordPlace->FID) { return 1; } else { /* got to the end! */ return 0; } } PRIVATE t_WordInfo * MakeFakeWordInfoForPattern(db, Item) t_LQTEXT_Database *db; t_QueryItem *Item; { t_WordInfo *Result = Item->WordInfo; t_ExpandedQueryList *eItem; unsigned long totalPlaces = 0L; t_WordPlace *Start; if (!TermIsPattern(Item)) { Error(E_WARN|E_INTERNAL, "MakeFakeWordInfoForPattern called for non-pattern \"%*.*s\"", Item->WordInfo->Length, Item->WordInfo->Length, Item->WordInfo->Word ); return Item->WordInfo; } if (!TermIsExpanded(Item)) { Item->Expansion = LQT_ExpandQueryItem(db, Item); } for (eItem = Item->Expansion; eItem; eItem = eItem->Next) { t_WordInfo *W; W = eItem->WordInfo; /* fetch the wordplaces */ if (W->WordPlacesInHere != W->NumberOfWordPlaces) { if (FilesInShortestEntry) { FilesInShortestEntryPointer = FilesInShortestEntry; W->WordPlaces = LQT_GetWordPlacesWhere( db, W->WID, W->WordPlaceStart, (unsigned)WIDBLOCKSIZE - (W->WordPlaceStart - W->DataBlock), W->Offset, &W->NumberOfWordPlaces, ReadOnlyAcceptWordFunc ); } else { W->WordPlaces = LQT_GetWordPlaces( db, W->WID, W->WordPlaceStart, (unsigned)WIDBLOCKSIZE - (W->WordPlaceStart - W->DataBlock), W->Offset, &W->NumberOfWordPlaces ); } } W->Flags = W->WordPlace.Flags = 0; /* There was a bug that made flags non-zero and that * I have not tracked down yet -- need to use codecenter. */ totalPlaces += W->NumberOfWordPlaces; } Start = Result->WordPlaces = (t_WordPlace *) emalloc("new places for query", sizeof(t_WordPlace) * totalPlaces ); for (eItem = Item->Expansion; eItem; eItem = eItem->Next) { /* append the results. We can use bcopy. This is fun. */ bcopy( eItem->WordInfo->WordPlaces, Start, sizeof(t_WordPlace) * eItem->WordInfo->NumberOfWordPlaces ); Start = &Start[eItem->WordInfo->NumberOfWordPlaces]; } if (Start - Result->WordPlaces != totalPlaces) { Error(E_WARN|E_INTERNAL, "MakeFakeWordInfoForPattern: %ld != %ld, oops!??", Start - Result->WordPlaces, totalPlaces ); } Result->NumberOfWordPlaces = totalPlaces; Result->WordPlacesInHere = totalPlaces; Result->Flags = Result->WordPlace.Flags = 0; /* see above */ LQT_SortWordPlaces(db, totalPlaces, Result->WordPlaces); return Result; } API t_Phrase * LQT_QueryToPhraseKludge(db, Query) t_LQTEXT_Database *db; t_LQT_Query *Query; { t_Phrase *Result; t_PhraseItem **ThisWord; t_QueryData *theData; t_QueryItem *Item; t_QueryItem *Smallest = 0; unsigned long smallestCount = 0L; theData = (t_QueryData *) Query->internalData; Result = (t_Phrase *) emalloc("QueryToPhraseKludge", sizeof(t_Phrase)); Result->Next = (t_Phrase *) 0; Result->HasUnknownWords = theData->HasUnknownWords; Result->ModifiedString = Query->Query; *(ThisWord = &Result->Words) = (t_PhraseItem *) 0; /** prepass: find out if there is a fixed term, and, if so, how ** many matches it has **/ for (Item = theData->Items; Item; Item = Item->Next) { if (!TermIsPattern(Item)) { if (Smallest) { if (Item->totalMatches < smallestCount) { smallestCount = Item->totalMatches; Smallest = Item; } } else if (Item->totalMatches > 0) { Smallest = Item; smallestCount = Item->totalMatches; } } } if (FilesInShortestEntry) { efree( (char *) FilesInShortestEntry); FilesInShortestEntry = 0; } if (smallestCount > 0) { t_WordInfo *W = Smallest->WordInfo; /* less indirection! */ FilesInShortestEntry = (t_FID *) emalloc( "FIDS in Query", sizeof(t_FID) * (smallestCount + 1) /* +1 for a trailing sentinel */ ); /* check we have the postings: */ if (Smallest->WordInfo->WordPlacesInHere < Smallest->WordInfo->NumberOfWordPlaces ) { if (W->WordPlaces) { (void) efree((char *) W->WordPlaces); } W->WordPlaces = LQT_GetWordPlaces( db, W->WID, W->WordPlaceStart, (unsigned) WIDBLOCKSIZE - (W->WordPlaceStart - W->DataBlock), W->Offset, &W->NumberOfWordPlaces ); W->WordPlacesInHere = W->NumberOfWordPlaces; } { register int i; int f = 0; t_FID prev = 0; for (i = 0; i < W->NumberOfWordPlaces; i++) { if (W->WordPlaces[i].FID != prev) { prev = FilesInShortestEntry[f] = W->WordPlaces[i].FID; f++; } } FilesInShortestEntry[f] = (t_FID) 0; smallestCount = f; } } for (Item = theData->Items; Item; Item = Item->Next) { t_WordInfo *WordInfo; WordInfo = Item->WordInfo; if (!WordInfo) { Error(E_FATAL|E_INTERNAL, "Query Item has no WordInfo!"); } if (TermIsPattern(Item)) { int Flags = WordInfo->Flags; t_WordInfo *W = MakeFakeWordInfoForPattern(db, Item); if (W) { *ThisWord = (t_PhraseItem *) emalloc("Phrase Item", sizeof(t_PhraseItem)); W->WordPlace = WordInfo->WordPlace; /* struct copy */ W->WordPlace.Flags |= Flags; /* point to the new space */ (*ThisWord)->Word = W; (*ThisWord)->WordStart = 0; (*ThisWord)->Next = (t_PhraseItem *) 0; (*ThisWord)->SearchIndex = 0L; ThisWord = &(*ThisWord)->Next; } else { Result->HasUnknownWords++; } } else if (WordInfo->WID > 0 && WordInfo->NumberOfWordPlaces > 0) { /* normal word with wordinfo */ int Flags = WordInfo->Flags; t_WordInfo *W = Item->WordInfo; *ThisWord = (t_PhraseItem *) emalloc("Phrase Item", sizeof(t_PhraseItem)); W->WordPlace = WordInfo->WordPlace; /* struct copy */ W->WordPlace.Flags |= Flags; /* point to the new space */ (*ThisWord)->Word = W; (*ThisWord)->WordStart = 0; (*ThisWord)->Next = (t_PhraseItem *) 0; (*ThisWord)->SearchIndex = 0L; ThisWord = &(*ThisWord)->Next; } else { Result->HasUnknownWords++; } } /* for */ if (ThisWord == &Result->Words) { /* There were no words in the phrase! */ efree((char *) Result); return (t_Phrase *) 0; } Result->NumberOfMatches = 0; Result->Matches = (t_MatchList *) 0; return Result; }