/* Phrase.c -- Copyright 1989, 1994, 1995, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* * Deal with (WID, FID, Offfset) triples * Liam Quin, September 1989 * * $Id: phrase.c,v 1.39 96/07/04 20:15:29 lee Exp $ * */ #include "error.h" #include "globals.h" /* defines and declarations for database filenames */ #include /* stderr, also for fileinfo.h */ #include #ifdef HAVE_FCNTL_H # ifdef HAVE_SYSV_FCNTL_H # include # endif # include #endif #include #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #include "fileinfo.h" /* for wordinfo.h */ #include "wordinfo.h" #include "pblock.h" #include "phrase.h" #include "wordrules.h" #include "emalloc.h" #include "lqutil.h" #include "liblqtext.h" #include "lqtrace.h" /** Unix system calls that need to be declared: **/ /** Unix/C Library Functions: **/ /** lqtext functions: **/ /** functions within this file that need forward declarations **/ /** **/ static t_FID *FilesInShortestEntry = 0; static t_FID *FilesInShortestEntryPointer; static long PlacesAccepted; #if 0 static long FilesInShortestEntryDeleted = 0; #endif PRIVATE int ContinuesMatch( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_PhraseItem *PreviousWord, t_PhraseItem *Word, register t_WordPlace *First, t_WordPlace *Prev, register t_WordPlace *New, t_Position Position #endif ); PRIVATE int AcceptWordFunc(db, WID, WordPlace) t_LQTEXT_Database *db; t_WID WID; t_WordPlace *WordPlace; { t_FID LastPlausible = (*FilesInShortestEntryPointer); while (*FilesInShortestEntryPointer < WordPlace->FID) { /* not there yet */ if (*FilesInShortestEntryPointer == (t_FID) 0) { return 0; } /* delete the skipped file so we have fewer files next time round, * although the same number of entries in the array. * It's hard to compress the array without introducing an * O(nfile^2) behaviour, slowing matching down somewhat. */ if (*FilesInShortestEntryPointer != LastPlausible) { *FilesInShortestEntryPointer = LastPlausible; } ++FilesInShortestEntryPointer; } if (*FilesInShortestEntryPointer > WordPlace->FID) { /* gone past it */ return 0; } else if (*FilesInShortestEntryPointer == WordPlace->FID) { ++PlacesAccepted; return 1; } else { /* got to the end! */ return 0; } } /* * LQT_MakeMatchesWhere * Retrieval/Matching, Retrieval/Phrases * *

Matches the given phrase, and returns the number of successful * matches. The given AcceptFunction is called for each match; it * must return one of the following flags as defined in * phrase.h: either LQMATCH_ACCEPT, which adds the match to the * result, or LQMATCH_REJECT, which does not add the match to the result. * In addition, either of these flags may be combined (using bitwise or) * with LQMATCH_QUIT, in which case LQT_MakeMatchesWhere will return * the result collected so far and abandon further processing, or * LQMATCH_NEXT_FILE, in which case LQT_MakeMatchesWhere will not * call the AcceptFunction again until a match is found in a document * with a different File Identifier (FID).

*

A NULL AcceptFunction pointer is equivalent to one that always * returns LQMATCH_ACCEPT, except much more efficient.

* * The number of matches accepted. * All matches that are accepted are stored in the given Phrase object. * * LQT_StringToPhrase *
*/ API long LQT_MakeMatchesWhere(db, Phrase, AcceptFunction) t_LQTEXT_Database *db; t_Phrase *Phrase; int (*AcceptFunction)( #ifdef HAVE_PROTO t_LQTEXT_Database *, t_Phrase *, t_Match * #endif ); { /* Each word has a pointer (SearchIndex) to the last Word Place * that was examined. This enables an O(NumberOfWords) search instead * of O(NumberOfWords * NumberOfWords) search. */ unsigned long PIFW; /* Place In First Word */ t_MatchList **MLPP = &(Phrase->Matches); t_Match **MPP; t_Match **OldMatch = (t_Match **) 0; t_WordPlace *pp; t_PhraseItem *Word; t_PhraseItem *PreviousWord = 0; long Result = 0L; t_PhraseItem *LeastWord; int HowGood; long LengthOfFileArray = 0L; t_FID PendingFID = 0; if (!Phrase) { return 0L; } LQT_ResetPhraseMatch(db, Phrase); /* Each iteration over this list either produces a match or rejects a * possible phrase starting place. */ LQT_Trace(LQTRACE_MATCH_PHRASE, "match phrase \"%s\"", Phrase->ModifiedString ); /* A phrase with garbage words can't match anything */ if (Phrase->HasUnknownWords && db->PhraseMatchLevel != PCM_AnyCase) { return 0L; } /* Find the word in the phrase with least matches: */ LeastWord = Phrase->Words; for (Word = Phrase->Words; Word; Word = Word->Next) { if (Word->Word->NumberOfWordPlaces < LeastWord->Word->NumberOfWordPlaces) { LeastWord = Word; } } /* for efficiency, let's see which files we have to fetch */ if (FilesInShortestEntry) { efree((char *) FilesInShortestEntry); } FilesInShortestEntry = (t_FID *) emalloc( "FIDS in Phrase", sizeof(t_FID) * (LeastWord->Word->NumberOfWordPlaces + 1) /* allow +1 for trailing 0 */ ); /* check we have the postings: */ if (LeastWord->Word->WordPlacesInHereWord->NumberOfWordPlaces){ t_WordInfo *W = LeastWord->Word; /* less indirection! */ if (W->WordPlaces) { (void) efree((char *) W->WordPlaces); } W->WordPlaces = LQT_GetWordPlaces( db, W->WID, W->WordPlaceStart, (unsigned) WIDBLOCKSIZE - (W->WordPlaceStart - W->DataBlock), W->Offset, &W->NumberOfWordPlaces ); W->WordPlacesInHere = W->NumberOfWordPlaces; } /* now complete the list of files that are possible candidates */ { register long i, j; t_WordInfo *W = LeastWord->Word; /* less indirection! */ FilesInShortestEntry[i = 0] = W->WordPlaces[0].FID; for (j = 1; j < W->NumberOfWordPlaces; j++) { if (W->WordPlaces[j].FID != FilesInShortestEntry[i]) { i++; FilesInShortestEntry[i] = W->WordPlaces[j].FID; } } FilesInShortestEntry[++i] = (t_FID) 0; LengthOfFileArray = i + 1; FilesInShortestEntry = (t_FID *) erealloc( (char *) FilesInShortestEntry, (unsigned int) (LengthOfFileArray * sizeof(t_FID)) ); } /* Ensure that the matches have been read */ for (Word = Phrase->Words; Word; Word = Word->Next) { if (Word->Word->WordPlacesInHere < Word->Word->NumberOfWordPlaces) { t_WordInfo *W = Word->Word; /* less indirection! */ if (W->WordPlaces) { (void) efree((char *) W->WordPlaces); } FilesInShortestEntryPointer = FilesInShortestEntry; PlacesAccepted = 0L; W->WordPlaces = LQT_GetWordPlacesWhere( db, W->WID, W->WordPlaceStart, (unsigned) WIDBLOCKSIZE - (W->WordPlaceStart - W->DataBlock), W->Offset, &W->NumberOfWordPlaces, AcceptWordFunc ); #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "%s: %ld/%ld wanted\n", W->Word, PlacesAccepted, W->NumberOfWordPlaces ); #endif W->WordPlacesInHere = W->NumberOfWordPlaces = PlacesAccepted; if (!PlacesAccepted) { return (long) 0; } #if 0 /* spot trailing unwanted documents: */ if (*FilesInShortestEntryPointer) { if (*++FilesInShortestEntryPointer) { *FilesInShortestEntryPointer = (t_FID) 0; FilesInShortestEntryDeleted += ( LengthOfFileArray +1 - (FilesInShortestEntryPointer - FilesInShortestEntry) ); /* (+1 for the trailing zero) */ } } if (Word->Next && FilesInShortestEntryDeleted > 1) { register t_FID *WhereWeAreAt; register t_FID *This; WhereWeAreAt = This = FilesInShortestEntry; while (*This) { if (*This == This[1]) { t_FID Bad = (*This); while (*This == Bad) { This++; } continue; } if (This != WhereWeAreAt) { *(WhereWeAreAt++) = *(This++); } else { ++WhereWeAreAt; ++This; } } *WhereWeAreAt = (t_FID) 0; LengthOfFileArray = (WhereWeAreAt - FilesInShortestEntry) + 1; FilesInShortestEntry = (t_FID *) erealloc( FilesInShortestEntry, LengthOfFileArray * sizeof(t_FID) ); } #endif } } /* For each match in the first word in the phrase: */ for (PIFW = 0; PIFW < Phrase->Words->Word->NumberOfWordPlaces; PIFW++) { t_WordPlace *LastTryWord = (t_WordPlace *) 0; /* The idea is that the next two loops are are likely to reduce * considerably the number of places we have to consider in the * case that the first word in the phrase has a lot of matches * and there is a subsequent word with relatively few matches. * Experiments suggest that this is fairly common. * * This is still a nearly (i.e. slightly-better-than) linear * algorithm w.r.t the total number of matches in all of the * words added up. Note that I alter LeastWord->SearchIndex in * one of the two loops that follow, so when WordPlaces from that * word are considered, we don't have to look at any twice. * * In order to do better, one would have to be able to avoid * looking at some or (better!) most of the WordPlaces. * * For example, not fetching so many from disk: * if we didn't do the fetches until we needed to, and we gave * LQT_GetWordPlaces a minimum FID to look for, we might be able * to reduce things by (say) 15%. * If all of the FIDS were stored separately, we would not * have to look at the (Block, Word, Flags, StuffBefore) stuff at * all, and that would be much faster. One way to do that might be * to store the list of FIDs with the word (as now), and perhaps * some flags and the count of words/fid, and to store the rest * in a per-file data structure. * * That would be a major, major hack... * ... sigh. */ while (LeastWord->Word->WordPlaces[LeastWord->SearchIndex].FID < Phrase->Words->Word->WordPlaces[PIFW].FID) { if (++(LeastWord->SearchIndex) >= LeastWord->Word->NumberOfWordPlaces) { goto GiveUp; } } while (Phrase->Words->Word->WordPlaces[PIFW].FID < LeastWord->Word->WordPlaces[LeastWord->SearchIndex].FID) { if (++PIFW >= Phrase->Words->Word->NumberOfWordPlaces) { goto GiveUp; } } /* The following comment tells Sabre_C not to moan about "if (0)" */ /*SUPPRESS558*/ if (0) { GiveUp: break; } MPP = (t_Match **) 0; pp = &Phrase->Words->Word->WordPlaces[Phrase->Words->SearchIndex=PIFW]; /* When we have a partially completed match, * TryWord (declared below) will point to the WordPlace currently * being considered to see if it extends the partial match; * LastTryWord points to the previous WordPlace in the match. */ /* For each word in the phrase: */ for (Word = Phrase->Words; Word; Word = Word->Next) { int GotOne = 0; /* For each occurrence of that word: */ for (; Word->SearchIndex < Word->Word->NumberOfWordPlaces; ++Word->SearchIndex) { register t_WordPlace *TryWord = &Word->Word->WordPlaces[Word->SearchIndex]; if (!LastTryWord) { LastTryWord = TryWord; } /** So: ** | int PIFW = each match in the first word in the phrase ** | t_WordPlace *pp = each match in the phrase ** | t_PhraseItem *Word = each word in the phrase ** | unsigned SearchIndex = each match of that word ** | t_WordPlace *TryWord = each occurrence of that word ** ** Hence, we are comparing pp and TryWord, hoping that each time ** round the (Word) loop we will advance TryWord. ** Once we have decided that TryWord and pp relate to the ** same file and that TryWord is no earlier than pp in the ** file, we must then check that TryWord is advancing the ** chain by comparing it to the previous element in the ** list (LastTryWord). ** ** When we break from this inner list, we must either have ** eliminated this particular (PIFW) as starting a match- ** chain, or have decided that we have extended the ** current match chain (by setting GotOne). **/ if (LastTryWord == TryWord) { /* same word! */ HowGood = LQTp_CheckFlags( db, Word->Word, TryWord, (Word == Phrase->Words) ? e_First : ( ( (Word->Next) ? e_Middle : e_Last) ) ); } else if (pp->FID == TryWord->FID) { HowGood = ContinuesMatch( db, PreviousWord, Word, pp, LastTryWord, TryWord, (Word == Phrase->Words) ? e_First : ( ( (Word->Next) ? e_Middle : e_Last) ) ); } else { /* different FIDs */ HowGood = (pp->FID > TryWord->FID) ? -1 : 1; } switch (HowGood) { case 0: /* G O T C H A !!!! */ /* extend the HitList, since it's OK so far. */ /* allocate a new match: */ if (!MPP) { *MLPP = (t_MatchList *) emalloc("MatchList entry for LQT_MakeMatches", sizeof(t_MatchList) ); (*MLPP)->Match = (t_Match *) 0; OldMatch = MPP = &((*MLPP)->Match); MLPP = &(*MLPP)->Next; *MLPP = (t_MatchList *) 0; } *MPP = (t_Match *) emalloc("Phrase Match", sizeof(t_Match)); (*MPP)->WID = Word->Word->WID; (*MPP)->Where = TryWord; (*MPP)->Next = (t_Match *) 0; MPP = &(*MPP)->Next; GotOne++; break; case 1: /* gone too far */ #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { t_WordInfo WW; WW = *(Word->Word); if (LastTryWord == TryWord) { /* LQT_GenerateWordFromRoot() returns a pointer * to a static buffer, * so I have to use two printf() calls here. */ fprintf(stderr, "Reject(%s (%d) != ", LQT_GenerateWordFromRoot(db, &WW, WW.WordPlace.Flags ), WW.WordPlace.Flags ); fprintf(stderr, "%s (%d)) [flags]\n", LQT_GenerateWordFromRoot( db, &WW, TryWord->Flags ), TryWord->Flags ); } else { fprintf(stderr, "Reject(%s) -- too far, FIDS %ld %ld\n", LQT_GenerateWordFromRoot( db, &WW, WW.WordPlace.Flags ), pp->FID, TryWord->FID ); } } #endif break; case -1: continue; /* not there yet */ default: Error(E_FATAL|E_INTERNAL, "%s: %d HowGood == %d", __FILE__, __LINE__, HowGood ); } /* Remember where we got up to... so that we can extend * the list when we start looking at the next word. */ LastTryWord = TryWord; #ifdef ASCIITRACE if (GotOne && LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { t_WordInfo WW; WW = *(Word->Word); /* LQT_GenerateWordFromRoot() returns a pointer to a * static buffer */ fprintf(stderr, "Partial match for %s ", LQT_GenerateWordFromRoot( db, &WW, Word->Word->WordPlace.Flags ) ); fprintf(stderr, "(%s,%lu,%lu) FID %lu)\n", LQT_GenerateWordFromRoot(db, &WW, TryWord->Flags), TryWord->BlockInFile, TryWord->WordInBlock, TryWord->FID ); } #endif /* If we got to here, we extended the list, which is fine; * otherwise, if we hit a continue, we try to carry on * looking at matches of this word, and if we hit a break * before we set "GotOne", we give up on this match * altogether. */ break; } /* For each occurrence of that word: */ if (!GotOne) { t_Match *MP; /* This word isn't here, so neither is the phrase found * in this file starting here. */ if (MPP) { for (MP = (*OldMatch); MP != (t_Match *) 0; /*void*/) { t_Match *Next = MP->Next; efree((char *) MP); MP = Next; } *OldMatch = (t_Match *) 0; } break; } else if (Word->Next == (t_PhraseItem *) 0) { /* If we've reached the end of the phrase, i.e. if * Word->Next is zero, we have successfully added a new * phrase! */ if (PendingFID) { if (PendingFID > Word->Word->WordPlaces[Word->SearchIndex].FID ) { goto reject; } else { PendingFID = 0; } } if (AcceptFunction) { int action = (*AcceptFunction)(db, Phrase, *OldMatch); if (action & LQMATCH_NEXT_FILE) { PendingFID = Word->Word->WordPlaces[Word->SearchIndex].FID + 1; action |= ~LQMATCH_NEXT_FILE; } switch (action & (~LQMATCH_QUIT)) { case LQMATCH_ACCEPT: /* OK */ Result++; #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "OK, Result now %d", Result ); #endif break; case LQMATCH_REJECT: reject: /* free the match structure */ #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "AcceptFunc rejected match, still %d", Result ); #endif if (MPP) { t_Match *MP = (*OldMatch); while (MP != (t_Match *) 0) { t_Match *Next = MP->Next; efree((char *) MP); MP = Next; } } *OldMatch = (t_Match *) 0; break; default: Error(E_FATAL|E_BUG, "LQT_MakeMatchesWhere: AcceptFunction returned illegal action %d", action ); } if (action & LQMATCH_QUIT) { #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "AcceptFunc returned LQMATCH_QUIT, giving up at %d", Result ); #endif LQT_ResetPhraseMatch(db, Phrase); return Phrase->NumberOfMatches = Result; } } else { /* no AcceptFunction, so accept automatically */ ++Result; #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "Result now %d\n", Result ); #endif } } PreviousWord = Word; } /* end for (each word in the phrase) */ } /* end (for each FID/Offset pair in the first word */ LQT_ResetPhraseMatch(db, Phrase); return Phrase->NumberOfMatches = Result; } /* * LQT_MakeMatches * Retrieval/Matching, Retrieval/Phrases * * This is equivalent to LQT_MakeMatchesWhere with a null AcceptFunction, * and is provided for convenience. * * LQT_StringToPhrase * LQT_MakeMatchesWhere * */ API long LQT_MakeMatches(thedb, Phrase) t_LQTEXT_Database *thedb; t_Phrase *Phrase; { /* Call LQT_MakeMatchesWhere with a NULL callback function: */ return LQT_MakeMatchesWhere(thedb, Phrase, (int (*)())0); } PRIVATE int ContinuesMatch(db, PreviousWord, Word, First, Prev, New, Position) t_LQTEXT_Database *db; t_PhraseItem *PreviousWord; t_PhraseItem *Word; register t_WordPlace *First; t_WordPlace *Prev; register t_WordPlace *New; t_Position Position; { register t_WordInfo *QueryWord = Word->Word; /* Return Value is * -1 --- if New occurs before Prev (and thus isn't part of the match) * 0 --- if it's the next word in the match * +1 --- if we've gone past it * Note: you can use these values in a switch() if you want. */ #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "ContinuesMatch: %*.*s [f 0%o %ld %lu] fby %*.*s [f 0%o %ld %lu]", PreviousWord ? PreviousWord->Word->Length : 0, PreviousWord ? PreviousWord->Word->Length : 0, PreviousWord ? PreviousWord->Word->Word : "", Prev->Flags, Prev->BlockInFile, Prev->WordInBlock, Word->Word->Length, Word->Word->Length, Word->Word->Word, New->Flags, New->BlockInFile, New->WordInBlock ); } #endif /* First check we are looking at the right file: * Have we gone far enough? */ if (New->FID < First->FID) { return -1; /* not far enough */ } else if (New->FID > First->FID) { return 1; /* too far */ } else if (Prev == New) { return 0; } /* Hey everybody, they're the same! * That means that this might be a candidate for a MATCH!!!! */ /* if (SimplyAnywhereWillDo) { OK; break; } */ /* Clearly later words in the phrase can't be in earlier * blocks... * Note that the converse isn't true: if the phrase spans a block * boundary, the earlier word will be in an earlier block than the * next one. */ if (New->BlockInFile < First->BlockInFile) { /* Although we are in the right file, we have not * yet reached the correct offset. */ return -1; } /* If we get to here, * . we are in the right file * . we are at least as far into the file as the start * of the phrase */ /* Now check that we are a reasonable distance past * the preceding word (checking that we are not on the first * match in the list, of course): */ if (New->BlockInFile < Prev->BlockInFile) { /* not gone far enough */ return -1; } if (New->BlockInFile > Prev->BlockInFile + 1) { /* If they are more than one block apart, I * don't believe them to be part of a phrase! */ return 1; } if (New->BlockInFile == Prev->BlockInFile) { long DistanceApartInQuery = 1; if (PreviousWord && PreviousWord->Word) { DistanceApartInQuery = Word->Word->WordPlace.WordInBlock - PreviousWord->Word->WordPlace.WordInBlock; } /* If they are in the same block, one must be * exactly one word beyond the other. I don't * think they can ever be the same, unless there * is a bug somewhere! (Query expansion and anaphoristic * reference handling might change that in the future, though!) * * However, if common words were skipped in the query, * there may be a gap. * * In principle, it's possible for a "common word" -- e.g. a number -- * to span a whole block, and then we can't match words on both * sides of it. Variable sized blocks will cure this, as then a * phrase will never span a block. */ if (New->WordInBlock <= Prev->WordInBlock) { /* too early in the block */ return -1; } switch (db->PhraseMatchLevel) { case PCM_AnyCase: if (New->WordInBlock > Prev->WordInBlock + DistanceApartInQuery + 2) { return 1; /* gone too far */ /* We allow a few words slop in this case, though... */ } break; /* clearly OK */ case PCM_SameCase: case PCM_HalfCase: default: if (New->WordInBlock > Prev->WordInBlock + DistanceApartInQuery) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "ContinuesMatch: reject: WIB: %lu !fby %lu", Prev->WordInBlock, New->WordInBlock ); } #endif return 1; /* gone too far */ } } } else { int MaxWordInBlock = 0; switch (db->PhraseMatchLevel) { case PCM_AnyCase: MaxWordInBlock = 1; break; /* clearly OK */ case PCM_SameCase: case PCM_HalfCase: default: MaxWordInBlock = 0; } /* If there is a block boundary between the two words, * the 2nd word should be word zero in its block. * If we are allowing approx. matches, we'll let an extra * word slip in there. */ if (New->WordInBlock > MaxWordInBlock) { /* the previous word was at the end of a block, * but this one isn't at the start! */ #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "ContinuesMatch: reject: WIB > %d: %ld.%lu !fby %ld.%lu", MaxWordInBlock, Prev->BlockInFile, Prev->WordInBlock, New->BlockInFile, New->WordInBlock ); } #endif return 1; /* gone too far */ } /* they are in adjacent blocks */ if (!(Prev->Flags & WPF_LASTINBLOCK)) { /* there is another word between them, so * we have gone too far. * I went to a lot of effort in addfile.c to * mantain that flag, just for this! */ #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "ContinuesMatch: reject: !LAST: %ld.%lu !fby %ld.%lu", Prev->BlockInFile, Prev->WordInBlock, New->BlockInFile, New->WordInBlock ); } #endif return 1; } } /* Now we check that the word matches the given word * -- in other words, that possessive/plural/case * is correct if required. Do this later as it is * relatively expensive I expect, and we will not * usually care about case. * * Since the word is in the right place, if it fails here there * is no point in looking at the next word in this block! */ return LQTp_CheckFlags(db, QueryWord, New, Position); } LIBRARY int LQTp_CheckFlags(db, QueryWord, New, Position) t_LQTEXT_Database *db; t_WordInfo *QueryWord; t_WordPlace *New; t_Position Position; { int StuffBeforeInQuery, StuffBeforeInNew; #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "LQTp_CheckFlags %*.*s (%s Qf=0%ocmp f=0%o", QueryWord->Length, QueryWord->Length, QueryWord->Word, Position == e_First ? "First" : ( (Position == e_Middle ) ? "Middle" : "Last" ), QueryWord->WordPlace.Flags, New->Flags ); } #endif /* Phrase matching levels currently have silly names, sorry. */ switch (db->PhraseMatchLevel) { default: /* defensive! */ Error(E_FATAL|E_INTERNAL|E_BUG, "%d %s: unknown phrase match level %d", __LINE__, __FILE__, db->PhraseMatchLevel ); case PCM_AnyCase: break; /* clearly OK */ case PCM_SameCase: /* precise matching */ StuffBeforeInQuery = LQT_WPF_STUFF_BEFORE(&QueryWord->WordPlace); if (StuffBeforeInQuery > 1 || Position != e_First) { int Difference; StuffBeforeInNew = LQT_WPF_STUFF_BEFORE(New); Difference = StuffBeforeInQuery - StuffBeforeInNew; if (Difference < -2 || Difference > 4) { /* allow stuff before to be too small on the first word; * if Difference > 0, the query has more space &| punctuation * at the start than the indexed text, so this isn't a * match. */ if (Position != e_First || Difference > 0) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "reject %*.*s: stuff before Q %d Try %d", QueryWord->Length, QueryWord->Length, QueryWord->Word, StuffBeforeInQuery, StuffBeforeInNew ); } #endif return -1; /* give up on this match */ } } } /** for things before the current word, be "heuristic" if ** it's the first word in the phrase, as the user can't have ** known about the context of the putative match, only its contents. **/ /* Skipped common words: */ if (Position == e_First) { /* It's the first word in the query, so only check for things * before this word if they were specified: */ if ((LQT_WPF_LASTHADLETTERS(&QueryWord->WordPlace)) && !(LQT_WPF_LASTHADLETTERS(New))) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "reject %*.*s: last had letters(q but not w)", QueryWord->Length, QueryWord->Length, QueryWord->Word ); } #endif return -1; /* give up on this match */ } if (LQT_WPF_LASTWASCOMMON(&QueryWord->WordPlace) && !LQT_WPF_LASTWASCOMMON(New)) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "reject %*.*s: skipped common q but not w", QueryWord->Length, QueryWord->Length, QueryWord->Word ); } #endif return -1; } if (Position != e_Last) { if ((LQT_WPF_NEXTHASPUNCT(&QueryWord->WordPlace)) && !(LQT_WPF_NEXTHASPUNCT(New))) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "reject %*.*s: next has punct q but not w", QueryWord->Length, QueryWord->Length, QueryWord->Word ); } #endif return -1; /* give up on this match */ } } } else if (Position == e_Last) { if (LQT_WPF_NEXTHASPUNCT(&QueryWord->WordPlace) && !LQT_WPF_NEXTHASPUNCT(New)) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "reject %*.*s: NEXTHASPUNCT q but not w", QueryWord->Length, QueryWord->Length, QueryWord->Word ); } #endif } if (LQT_WPF_NEXTISCOMMON(&QueryWord->WordPlace) && !LQT_WPF_NEXTISCOMMON(New) ) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { StuffBeforeInNew = LQT_WPF_STUFF_BEFORE(New); LQT_Trace(LQTRACE_MATCH_PHRASE, "reject %*.*s: NEXTISCOMMON w but not not q", QueryWord->Length, QueryWord->Length, QueryWord->Word, StuffBeforeInQuery, StuffBeforeInNew ); } #endif return -1; } } else { /* middle of query */ if (LQT_WPF_NEXTHASPUNCT(New) != LQT_WPF_NEXTHASPUNCT(&QueryWord->WordPlace)) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "reject %*.*s: NEXTHASPUNCT w != q", QueryWord->Length, QueryWord->Length, QueryWord->Word ); } #endif return -1; } if (LQT_WPF_NEXTISCOMMON(New) != LQT_WPF_NEXTISCOMMON(&QueryWord->WordPlace)) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { StuffBeforeInNew = LQT_WPF_STUFF_BEFORE(New); LQT_Trace(LQTRACE_MATCH_PHRASE, "reject %*.*s: NEXTISCOMMON w != q", QueryWord->Length, QueryWord->Length, QueryWord->Word, StuffBeforeInQuery, StuffBeforeInNew ); } #endif return -1; } } /* All the non-context-sensitive flags at once: */ if ((QueryWord->WordPlace.Flags & (WPF_WASPLURAL|WPF_UPPERCASE)) != (New->Flags & (WPF_UPPERCASE|WPF_WASPLURAL))) { /* The cases are different, no good */ #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "reject %*.*s: plural/case q != w", QueryWord->Length, QueryWord->Length, QueryWord->Word ); } #endif return -1; /* give up on this match */ } /* Finally, the least reliable, so always heuristic: */ if ((LQT_WPF_POSSESSIVE(&QueryWord->WordPlace)) && !(LQT_WPF_POSSESSIVE(New))) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_MATCH_PHRASE)) { LQT_Trace(LQTRACE_MATCH_PHRASE, "reject %*.*s: possessive q but not w", QueryWord->Length, QueryWord->Length, QueryWord->Word ); } #endif return -1; /* give up on this match */ } break; case PCM_HalfCase: /* In this case, we are lax about things, but if the * user typed plural/possessive/capital, we only * match one with the same attribute. */ if ((LQT_WPF_UPPERCASE(&QueryWord->WordPlace)) && !(LQT_WPF_UPPERCASE(New))) { #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "Reject [uppercase]"); #endif return -1; /* give up on this match */ } /* plurals: this should be separate */ if ((LQT_WPF_WASPLURAL(&QueryWord->WordPlace)) && !(LQT_WPF_WASPLURAL(New))) { #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "Reject [plural]"); #endif return -1; /* give up on this match */ } /* Now, what about skipped common words? */ if ((LQT_WPF_NEXTISCOMMON(&QueryWord->WordPlace)) && !(LQT_WPF_NEXTISCOMMON(New))) { #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "Reject [next-is-common]"); #endif return -1; /* give up on this match */ } /* Stuff before, if given, must be present: */ StuffBeforeInQuery = LQT_WPF_STUFF_BEFORE(&QueryWord->WordPlace); if (StuffBeforeInQuery > 1) { int Difference; StuffBeforeInNew = LQT_WPF_STUFF_BEFORE(New); Difference = StuffBeforeInQuery - StuffBeforeInNew; if (Difference < -2 || Difference > 4) { /* allow stuff before to be too small on the first word; * if Difference > 0, the query has more space &| punctuation * at the start than the indexed text, so this isn't a * match. */ if (Position != e_First || Difference > 0) { #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "Reject [Stuff Before %d != Q%d]", StuffBeforeInNew, StuffBeforeInQuery ); #endif return -1; /* give up on this match */ } } } if (LQT_WPF_NEXTHASPUNCT(&QueryWord->WordPlace) && !LQT_WPF_NEXTHASPUNCT(New)) { #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "Reject [punctuation %s %o != %s Q%o]", (LQT_WPF_NEXTHASPUNCT(New)) ? "yes" : "no", New->Flags, (LQT_WPF_NEXTHASPUNCT(&QueryWord->WordPlace)) ? "yes" : "no", QueryWord->WordPlace.Flags ); #endif return -1; /* give up on this match */ } if (LQT_WPF_POSSESSIVE(&QueryWord->WordPlace) && !LQT_WPF_POSSESSIVE(New)) { #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "Reject [possessive]"); #endif return -1; /* give up on this match */ } break; } /* If we got here... * we couldn't find anything wrong. */ #ifdef ASCIITRACE LQT_Trace(LQTRACE_MATCH_PHRASE, "Accept"); #endif return 0; /* It's all OK! */ }