/* wordtable.c -- Copyright 1989, 1993, 1994, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file ../COPYRIGHT for full details. */ /* Symbol Table Interface to text retrieval database. * Handles both the internal and external indexes. * * This originally used a linked list. Converting to a hash table reduced * the time to index comp.os.vms from nearly an hour to one and a half * minutes... * * Liam Quin, 1989 and later */ /* * $Id: wordtable.c,v 2.42 2001/05/31 03:50:13 liam Exp $ */ #ifndef lint static char *Rcs = "$Id: wordtable.c,v 2.42 2001/05/31 03:50:13 liam Exp $"; #endif #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include #include #include #ifdef HAVE_SYSV_FCNTL_H # include #endif #ifdef HAVE_FCNTL_H # include /* for O_RDWR wtc */ #endif #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #include "smalldb.h" #include "fileinfo.h" #include "wordinfo.h" #include "wordplace.h" #include "pblock.h" #include "wordrules.h" #include "emalloc.h" #include "addfile.h" #include "lqutil.h" #include "liblqtext.h" #include "lqtrace.h" typedef struct s_HashEl { t_WID WID; int PlacesUsed; t_WordPlace *Places; unsigned char WordLength; char Word[1]; } t_HashEl; #ifndef HASHSIZ # define HASHSIZ 32768 /* MUST be a power of two */ #endif /*!HASHSIZ*/ #ifndef MAXWORDSINCACHE # define MAXWORDSINCACHE (HASHSIZ * 10) #endif int MaxWordsInCache = MAXWORDSINCACHE; /** System calls and library functions used in this file: **/ /** Lqtext calls */ /** calls within this file **/ PRIVATE void NewEntry( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_HashEl *HashEl #endif ); PRIVATE void UpdateEntry( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_HashEl *HashEl #endif ); void DumpCache( /* see ../h/addfile.h */ #ifdef HAVE_PROTO t_LQTEXT_Database *db, int CallFree #endif ); PRIVATE void FastDump( #ifdef HAVE_PROTO t_LQTEXT_Database *db, int 
CallFree #endif ); PRIVATE void DumpOldest( #ifdef HAVE_PROTO t_LQTEXT_Database *db, int CallFree #endif ); /** System calls: */ /** Library Functions: */ /**/ PRIVATE int HashSize = HASHSIZ; /* MUST be a power of two */ API int SetHashSize(Options, theNewSize) t_lqdbOptions *Options; int theNewSize; { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "Hash table size changed from %ld to %ld", HashSize, theNewSize ); return HashSize = theNewSize; } #define NPLACES 2 #define NPLACESBIGINCR 6 #define NPLACESHUGEINCR (NPLACESBIGINCR * 4) #define NPLACESMAXINCR (NPLACESHUGEINCR * 5) /* This is small to optimise the common case -- by far the majority of * words are used less than 10 times. In the cases where we've gone * wrong, well, there'll be a few thousand. We add slowly until we * get to NPLACE * 3, and then we go up in NPLACESMAXINCR lumps. */ PRIVATE t_HashEl **SymbolTable; PRIVATE int LastEl = 0; PRIVATE int WordsInCache = 0; PRIVATE int DumpThresh = DUMP_FAST_THRESH; PRIVATE int SlotsUsed = 0; #ifdef USE_RADIX_MAP_FOR_SHORT_WORDS typedef struct { t_WID WID; int PlacesUsed; t_WordPlace *Places; } t_ShortWordHashEl; PRIVATE int UseShortWordMap = 1; PRIVATE int SWM_StartCount, SWM_EndCount; PRIVATE unsigned long SWM_TotalShortPlaces; PRIVATE t_ShortWordHashEl **ShortWords; PRIVATE int *OneLetterMap; PRIVATE int *TwoLetterMap; #define TwoWordSlot(W) \ (OneLetterMap[(W)->Word[0]] * SWM_EndCount) + \ (TwoLetterMap[(W)->Length == 1 ? 0 : (W)->Word[1]]) PRIVATE void InitialiseOneLetterMap(db) t_LQTEXT_Database *db; { register int i; int nStart, nWithin; int pos; if (!UseShortWordMap || OneLetterMap != 0) { Error(E_FATAL|E_BUG|E_INTERNAL... ); } OneLetterMap = (int *) malloc(sizeof(int) * 256); TwoLetterMap = (int *) malloc(sizeof(int) * 256); TwoLetterMap[0] = 0; /* for representing 1-char words, see below... */ /* first, count the number of characters that are allowed at * the start of a word and in the middle or end. 
*/ for (nStart = nWithin = i = 0; i < 256; i++) { OneLetterMap[i] = TwoLetterMap[i] = 0; if (LQT_STARTS_WORD(db, i)) { OneLetterMap[i] = nStart; ++nStart; } if (LQT_WITHIN_OR_ENDS_WORD(db, i)) { ++nWithin; TwoLetterMap[i] = nWithin; } } /* As well as every character legal within a word, we allow * 0 for a terminator. ASUUMPTION: a null character (not byte) is * not allowed within a word */ if (nStart > 1000 || nWithin > 1000 || nStart * nWithin > MaxWordsInCache / 4 ) { UseShortWordMap = 0; return; } SWM_StartCount = nStart; /* to allow for the 0, so that we can represent 1-character words * with a 0 in the second place, we add 1 to the number of * characters that can end a word: */ SWM_EndCount = nWithin + 1; ShortWords = (t_ShortWordHashEl **) ecalloc( "Short Word Table", sizeof(t_ShortWordHashEl) * SWM_StartCount * SWM_EndCount ); } #endif void SetDumpThresh(Options, Thresh) t_lqdbOptions *Options; int Thresh; { /* Set the threshhold for fast dumping. * If a word has less than this many occurrences in the cache, it gets * written out. -1 disables this feature, and 0 uses the default. 
*/ DumpThresh = Thresh; if (!DumpThresh) { DumpThresh = DUMP_FAST_THRESH; } } PRIVATE void InitHash(db) t_LQTEXT_Database *db; { long tmp = 1; while (tmp < HashSize) { tmp <<= 1; } HashSize = tmp; if (HashSize < 1) { Error(E_FATAL, "InitHash: hash size (%d/%d) is too small!\n", HashSize, MaxWordsInCache); } SymbolTable = (t_HashEl **) ecalloc( "AddWord symbol table", HashSize, sizeof(t_HashEl *) ); LastEl = HashSize; /* Used as a sentinel */ #ifdef ASCIITRACE LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "allocated %ld hash slots for up to %ld words", HashSize, MaxWordsInCache ); LQT_Trace(LQTRACE_DEBUG, "Sizes: hash: %ud; hash[2] %ud", sizeof(t_HashEl), sizeof(t_HashEl[2]) ); #endif } #ifndef Hash PRIVATE INLINE int Hash(Word, Length) register char *Word; register int Length; { register unsigned long n = 0; #define HASHC n = *Word++ + 65599 * n #ifndef NODUFF /* clever stuff for speedup... dmr-approved!... */ if (Length > 0) { register int loop = (Length + 8 - 1) >> 3; switch(Length & 07) { case 0: do { HASHC; case 7: HASHC; case 6: HASHC; case 5: HASHC; case 4: HASHC; case 3: HASHC; case 2: HASHC; case 1: HASHC; } while (--loop); } } #else /* NODUFF */ while (Length--) { HASHC; } #endif /* NODUFF */ return n & (HashSize - 1); } #endif /* Hash */ PRIVATE char FirstTimeRound = 1; PRIVATE t_FID CurrentFID = 0; static int LargestNumberOfPlacesInOneSlot = 1; /*extern*/ void AddWord(db, WordInfo) t_LQTEXT_Database *db; t_WordInfo *WordInfo; { register t_HashEl *HashEl = (t_HashEl *) 0; int Slot, FirstSlot; static t_FID LastFID = 0; int ProbeCount = 0; register t_HashEl **Slotp; if (!WordInfo || !WordInfo->Word || !WordInfo->Word[0]) { Error(E_WARN, "Null word in AddWord(0x%x)", WordInfo); return; } #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) { LQT_Trace(LQTRACE_WORDINFO, " <%s/%d, b %d, w %lu f %d s %d> ", WordInfo->Word, WordInfo->Length, WordInfo->WordPlace.BlockInFile, WordInfo->WordPlace.WordInBlock, WordInfo->WordPlace.Flags, 
WordInfo->WordPlace.StuffBefore ); } #endif if (!LastEl) { InitHash(); if (FirstTimeRound) { /* Special check to save looking up the WIDs first time round */ t_WID W = LQT_GetMaxWID(db); if (W == 0L) { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "first ever run, allocating WIDs." ); FirstTimeRound = 1; /* actually it's already 1 here */ } else { FirstTimeRound = 0; } } } else { if (MUST_DUMP(WordsInCache, MaxWordsInCache, SlotsUsed, HashSize)) { DumpCache(db, DUMP_CACHE|DUMP_FAST); } if (WordInfo->WordPlace.FID != LastFID) { if (WordInfo->WordPlace.FID == 0L) { Error(E_BUG, "AddWord: FID 0 for \"%s\"", WordInfo->Word); } else if (LastFID != 0) { /* quick opportunity to get rid of words that * were only in the previous file -- this takes advantage * of the fact that vocabulary is document-specific, * i.e. you tend to get words used in one file and not * another. */ CurrentFID = WordInfo->WordPlace.FID; if (MUST_FILE_DUMP( WordsInCache, MaxWordsInCache, SlotsUsed, HashSize )) { DumpCache(db, DUMP_CACHE|DUMP_FAST); } } LastFID = WordInfo->WordPlace.FID; } } #ifdef USE_RADIX_MAP_FOR_SHORT_WORDS /* There are approx. 61 possible first letters, and * approx 68 possible 2nd letters in ISO 8859-1 with the default * wordrules. If we avoid hashing for 1 and 2 character strings, * which are fairly frequent, we will get a performance improvement. * We can also save al ittle on storage, by using a different data * structure, and by not storing the 1- or 2- char word. * * I wish it was practical to do this for 3- and 4-char strings, as * "the" and "and" are usually extremely common, and saving a few * million hashings can't be bad. */ if (WordInfo->Length <= 2) { TODO fixme use both characters! 
FirstSlot = Slot = TwoWordSlot(WordInfo); Slotp = &OneLetter[Slot]; if (!*Slotp) { HashEl = *Slotp = (t_ShortWordHashEl *) emalloc( "wordtable::AddWord.TwoWordSlotHashEl", sizeof(t_ShortWordHashEl) ); HashEl->PlacesUsed = 0; HashEl->Places = (t_WordPlace *) emalloc( "SetSWElEmpty.Places", sizeof(t_WordPlace) * NPLACES ); if (FirstTimeRound) { /* No point looking for the WID, it won't be there; * code in DumpCache() will assign a new one. * LQT_WordToWID() will always return zero on a new index. */ HashEl->WID = 0; } else { HashEl->WID = LQT_WordToWID( db, WordInfo->Word, WordInfo->Length ); } } HashEl = *Slotp; goto OK; } else { FirstSlot = Slot = Hash(WordInfo->Word, WordInfo->Length); Slotp = &SymbolTable[Slot]; } #else FirstSlot = Slot = Hash(WordInfo->Word, WordInfo->Length); Slotp = &SymbolTable[Slot]; #endif for (ProbeCount = 0; ProbeCount < MAXPROBES; ProbeCount++) { if (*Slotp == (t_HashEl *) NULL) { /* make a new element */ HashEl = *Slotp = (t_HashEl *) emalloc( "wordtable::AddWord.HashEl", sizeof(t_HashEl) + WordInfo->Length ); HashEl->PlacesUsed = 0; HashEl->Places = (t_WordPlace *) emalloc( "SetElEmpty.Places", sizeof(t_WordPlace) * NPLACES ); (void) strncpy(&HashEl->Word[0], WordInfo->Word, WordInfo->Length); HashEl->Word[WordInfo->Length] = '\0'; HashEl->WordLength = (unsigned char) WordInfo->Length; if (FirstTimeRound) { /* No point looking for the WID, it won't be there; * code in DumpCache() will assign a new one. * LQT_WordToWID() will always return zero on a new index. 
*/ HashEl->WID = 0; } else { HashEl->WID = LQT_WordToWID(db, HashEl->Word, WordInfo->Length); } ++SlotsUsed; break; } else if ( /* first compare the first letter (26 possibilities, not all * equiprobable, but more than 10 likely ones; well, actually * it's more than 26, because of numbers, accented characers * and punctuation -- over 60 characters with ISO8859-1) */ ((*Slotp)->Word[0] == WordInfo->Word[0]) && /* next compare the lengths: there are (maxwordlen - minwordlen) * possible values, but they are not uniformly distributed -- * there are many more short words than long words -- so we * check the length only _after_ the first character: */ ((*Slotp)->WordLength == WordInfo->Length) && /* well, if the 1st letter and the length are the same, * compare the last letter, and then the whole word, but * obviously only if it's longer than one char: */ ( WordInfo->Length == 1 || ( ( (*Slotp)->Word[WordInfo->Length] == WordInfo->Word[WordInfo->Length] ) && ( /* We've compared the 1st and last 2 chars, and * if there are more, we'll look at those now: */ WordInfo->Length > 2 || !strcmp((*Slotp)->Word, WordInfo->Word) ) ) ) ) { HashEl = (*Slotp); if (HashEl->PlacesUsed == 0) { if (!HashEl->Places) { HashEl->Places = (t_WordPlace *) emalloc( "SetElEmpty.Places", sizeof(t_WordPlace) * NPLACES ); } ++SlotsUsed; } break; } else if ((*Slotp)->PlacesUsed == 0) { /* We left a hash element here to reduce calls to malloc * and to WordToWid, but someone else got here first... * This can only happen if there is a hash collision, because * if the word doesn't belong here, it gets removed by * DumpFaster(). * */ if (ProbeCount >= 3) { /* OK, it probably isn't in there... * so let's put it here... */ HashEl = (*Slotp); if (HashEl->WordLength < WordInfo->Length || HashEl->WordLength > WordInfo->Length + 3 ) { /* The string is stuck at the end of the struct, * so if the new string is longer, we have to allocate * a new struct. 
*/ (void) efree(HashEl); HashEl = *Slotp = (t_HashEl *) emalloc( "wordtable::AddWord.HashEl", sizeof(t_HashEl) + WordInfo->Length ); HashEl->PlacesUsed = 0; } (void) strncpy(&HashEl->Word[0], WordInfo->Word, WordInfo->Length); HashEl->Word[WordInfo->Length] = '\0'; /* Note: this is OK because there is 1 extra byte * in the struct, since Word is 1 byte long in the * declaration... so we have room for the \0. * But I am not sure that it is needed. */ HashEl->Places = (t_WordPlace *) emalloc( "SetElEmpty.Places", sizeof(t_WordPlace) * NPLACES ); HashEl->WordLength = (unsigned char) WordInfo->Length; /* If we put a shorter word here than was there before, * we are more likely to call free & malloc again next * time, as we've now forgotten that it was longer. * But we used to do free & malloc _every_ time, so * this is still a win. */ if (FirstTimeRound) { /* No point looking for the WID, it won't be there; * code in DumpCache() will assign a new one. * LQT_WordToWID() will always return zero on a new index. */ HashEl->WID = 0; } else { HashEl->WID = LQT_WordToWID(db, HashEl->Word, WordInfo->Length); } ++SlotsUsed; break; } } if ((Slot += 17) >= HashSize) { Slot &= (HashSize - 1); } /** Note: hashsize is a power of 2, so we can step through it ** in any prime number, but doing so in little chunks is best ** because we don't hit so many pages, but at the same time ** step over clusters. I hope. 
**/ /** If the hash table is full, no point looking further: **/ if (ProbeCount + 1 >= MAXPROBES) { #ifdef ASCIITRACE LQT_Trace(LQTRACE_DEBUG, "(%d probes didn't find a slot for %s)", ProbeCount, WordInfo->Word ); #endif CurrentFID = WordInfo->FID; DumpCache(db, DUMP_CACHE|DUMP_FAST); /* try again: */ AddWord(db, WordInfo); return; } Slotp = &SymbolTable[Slot]; } OK: /* check to see if there is already an occurrence of this word * for this file: */ /* If we get here, all we need to do is add the WordPlace */ /* Rather than keep around an integer saying how many places we've * allocated, we remember how many we have used (had to do that anyway), * and whenever we're about to use a place that wouldn't have been * allocated, we allocate more. * There is a tradeoff here. The word distribution is exponential, with * most words occuring fewer than 10 times, and a very few words that * occur tens of thousands of times; there are also quite a few in the * low hundreds, usually. So the following code starts off with low * numbers of places, and then starts adding more & more. * The goal is to minimise wasted memory but at the same time to * avoid extra calls to malloc. 
*/ { long newValue = 0; switch (HashEl->PlacesUsed) { case NPLACES: newValue = NPLACES * 2; break; case NPLACES * 2: newValue = NPLACES * 3; break; case NPLACES * 3: newValue = NPLACES * 4; break; case NPLACES * 4: newValue = NPLACES * 6; break; case NPLACES * 6: newValue = NPLACES * 10; break; case NPLACES * 10: newValue = NPLACES * 14; break; case NPLACES * 14: newValue = NPLACES * 18; break; case NPLACES * 18: newValue = NPLACES * 24; break; case NPLACES * 24: newValue = NPLACES * 30; break; case NPLACES * 30: newValue = NPLACES * 40; break; case NPLACES * 40: newValue = NPLACES * 150; break; case NPLACES * 150: newValue = NPLACES * 150 * 2; break; default: if ( HashEl->PlacesUsed > NPLACES * 150 && ((HashEl->PlacesUsed % (NPLACES * 150)) == 0) ) { newValue = HashEl->PlacesUsed + (NPLACES * 150); } } if (newValue) { HashEl->Places = (t_WordPlace *) erealloc( (char *) HashEl->Places, sizeof(t_WordPlace) * newValue ); } } HashEl->Places[HashEl->PlacesUsed++] = WordInfo->WordPlace; WordsInCache++; #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) { LQT_Trace(LQTRACE_WORDINFO, "Slot %d Word %s len %d places %d", Slot, SymbolTable[Slot]->Word, WordInfo->Length, SymbolTable[Slot]->PlacesUsed ); } #endif return; } PRIVATE void DumpOneEntry(db, theHashElp, CallFree) t_LQTEXT_Database *db; t_HashEl **theHashElp; int CallFree; { if ((*theHashElp)->PlacesUsed > 0) { /* We are going to make a new index entry for the word. * There are two cases -- depending on whether the word * is already indexed or not. * In the former case we must merge the new information. * In the latter case we don't have to read the old info, * but we must make a new entry in the WID Index. 
*/ WordsInCache -= (*theHashElp)->PlacesUsed; if ((*theHashElp)->PlacesUsed >= LargestNumberOfPlacesInOneSlot) { /* well, it's certainly wrong now, so guess: */ LargestNumberOfPlacesInOneSlot = 1; } if ((*theHashElp)->WID == 0) { NewEntry(db, *theHashElp); } else { UpdateEntry(db, *theHashElp); } --SlotsUsed; (*theHashElp)->PlacesUsed = 0; /* Always free the memory used by the places, since * it can be quite large. */ if ((*theHashElp)->Places) { efree((char *) (*theHashElp)->Places); (*theHashElp)->Places = 0; } /* But don't bother with the * small hash table entry if we're about to exit. */ if (CallFree & DUMP_NOFREE) { return; } /* Actually, we'll free the table entry anyway to save RAM: */ efree(*theHashElp); *theHashElp = 0; /* but leave the word there */ } else { /* It was an empty unused slot, with just a word there. * We probably need the slot, so we should free it. */ if ((*theHashElp)->Places) { efree((char *) (*theHashElp)->Places); (*theHashElp)->Places = 0; } if (CallFree & DUMP_NOFREE) { return; } efree(*theHashElp); *theHashElp = 0; } } /* The problem with DumpOldest is that it has no easy way of detecting * collisions. * * Suppose that we are about to write out the contents of slot N, * containing matches for some word A. * If there was another word B that hashes to the same slot N. * Since slot N was used, the matches for B are in another slot. * Searches for those matches succeed because AddWord first looks * in slot N, then in N + 1 (say), and so on. * But if we delete the word in slot N, AddWord will stop at * slot N when looking for B, and will notice that it's empty, and * assume there was no collision. * Oops. * * DumpFaster is supposed to sort this out, but I need to test * this in conjunction with DumpOldest. 
/* Evict "stale" cache entries: words whose most recent occurrence is
 * several files old, or rare words not seen in the current file, are
 * written out and removed.  Also recomputes LargestNumberOfPlacesInOneSlot
 * over the survivors.
 * NOTE(review): LastCalledFor is compared against CurrentFID but never
 * assigned afterwards, so the early-return only fires while both are 0 --
 * looks like an unfinished optimisation; confirm before relying on it.
 */
PRIVATE void DumpOldest(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    static t_FID LastCalledFor = 0;
    register int i;
    register t_HashEl *HashEl;
    int Progress = 0;	/* trace-only: 20%-step progress reporting */

    LQT_Trace(LQTRACE_DEBUG, "oldest dump: %d/%d words, %d/%d slots",
	WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );

    if (LastCalledFor == CurrentFID) {
	if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	    LQT_Trace(LQTRACE_DEBUG, "-- but still in same file");
	}
	return;
    }

    LargestNumberOfPlacesInOneSlot = 0;

    for (i = 0; i != LastEl; i++) {
	long separation;

	if (!SymbolTable[i]) {
	    continue;
	}
	if (!SymbolTable[i]->PlacesUsed) {
	    continue;	/* word-only placeholder; nothing to write */
	}
	HashEl = SymbolTable[i];

	/* The theory is that words that haven't been found in the most
	 * recently added couple of files might never be found again --
	 * so we might as well get rid of them.
	 *
	 * If the word only occurs once, and not in the current file, we
	 * can also dump it, for the same reason.
	 */
	separation = CurrentFID - HashEl->Places[HashEl->PlacesUsed - 1].FID;
	if (separation > 4 || HashEl->PlacesUsed <= separation) {
	    DumpOneEntry(db, &SymbolTable[i], CallFree);
	} else if (HashEl->PlacesUsed > LargestNumberOfPlacesInOneSlot) {
	    LargestNumberOfPlacesInOneSlot = HashEl->PlacesUsed;
	}

	if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	    if (i >= Progress * (HashSize / 5)) {
		LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
		    "progress: %d%% %d words/%d slots",
		    Progress * 20, WordsInCache, SlotsUsed
		);
		++Progress;
	    }
	}
    }
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"cache now %d/%d, slots %d/%d]",
	WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );
    FirstTimeRound = 0;
}

/* Evict all cache entries with MORE than MaxAllowedHits occurrences
 * (the big ones), relocating smaller displaced entries back to their
 * home slot where possible so that later probes still find them.
 */
PRIVATE void FasterDump(db, CallFree, MaxAllowedHits)
    t_LQTEXT_Database *db;
    int CallFree;
    int MaxAllowedHits;
{
    register int i;
    register t_HashEl *HashEl;
    int Progress = 0;
    int ReportFrequency = HashSize / 5;
    int Slot = 0;

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"faster dump nocc > %d, from %d/%d words, %d/%d slots",
	MaxAllowedHits, WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );

    /** We go through the hash table an element at a time.
     ** We clear all slots that contain more than MaxAllowedHits
     ** entries.
     ** Other slots are tested to see if they are in the right
     ** place or not.  If not, we see if we can put them in the
     ** right slot (because it's empty or because we're going to
     ** dump that slot, so we can make it empty).
     ** If the entry is in the wrong place and we can't easily
     ** correct that, we write it out.
     **/
    LargestNumberOfPlacesInOneSlot = 0;

    for (i = 0; i != LastEl; i++) {
	int ForceDump;

	if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	    if (i >= Progress * ReportFrequency) {
		LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
		    "progress: %d%% %d words/%d slots",
		    Progress * 20, WordsInCache, SlotsUsed
		);
		++Progress;
	    }
	}

	if (!SymbolTable[i]) {
	    continue;
	}
	HashEl = SymbolTable[i];
	ForceDump = 0;

	if (HashEl->PlacesUsed <= MaxAllowedHits) {
	    /* We're not going to dump this entry.
	     * If there was a collision (two or more entries have the same
	     * hash code), we mustn't simply delete the entry at the hashed
	     * slot, because then we would no longer find this one.
	     */
	    Slot = Hash(HashEl->Word, (int) HashEl->WordLength);
	    if (Slot != i) {
		/* It's in the wrong place... see if we can put it back: */
		if (SymbolTable[Slot] == 0) {
		    /* In this case we can simply put it in the right place*/
		    SymbolTable[Slot] = HashEl;
		    SymbolTable[i] = 0;
		    continue;
		} else if (SymbolTable[Slot]->PlacesUsed > MaxAllowedHits) {
		    /* In this case the entry at the other location is
		     * going to be dumped, so we can swap them even though
		     * that puts the other entry temporarily in the wrong
		     * place.  We're about to delete it anyway.
		     */
		    register t_HashEl *tmp = SymbolTable[Slot];

		    SymbolTable[Slot] = HashEl;
		    HashEl = SymbolTable[i] = tmp;
		    /* Liam: why can we do this?
		     * what if there was a secondary collision with
		     * the word in the other hash slot??
		     */
		} else {
		    /* In this case, it's in the wrong place, and the
		     * entry in the other slot isn't going to be deleted.
		     * But it might be moved.  This (we hope) is rare
		     * enough that we handle it by giving up and
		     * dumping it anyway:
		     */
		    ForceDump = 1;
		}
		if (!SymbolTable[i]) {
		    continue; /* dealt with slot i. */
		}
	    }
	}

	if (HashEl->PlacesUsed > MaxAllowedHits || ForceDump) {
#ifdef ASCIITRACE
	    /* ASSERT: HashEl == &SymbolTable[i];
	     * If not, we don't know which symbol table slot to set to zero
	     * after doing the dump.
	     * NOTE(review): the format string has five conversions but six
	     * arguments follow (SymbolTable[i] has no matching %x) -- the
	     * trailing Slot is never printed; confirm intended format.
	     */
	    if (HashEl != SymbolTable[i]) {
		Error(E_BUG,
		    "%s: %d: HashEl 0x%x != SymbolTable[%d], Slot %d",
		    __FILE__, __LINE__, HashEl, i, SymbolTable[i], Slot
		);
	    }
#endif
	    DumpOneEntry(db, &SymbolTable[i], CallFree);
	} else if (HashEl &&
	    HashEl->PlacesUsed > LargestNumberOfPlacesInOneSlot
	) {
	    LargestNumberOfPlacesInOneSlot = HashEl->PlacesUsed;
	}
    }
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"cache now %d/%d, slots %d/%d]",
	WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );
    FirstTimeRound = 0;
}

/* Evict all cache entries with FEWER than (or exactly) DumpThresh
 * occurrences -- the mirror image of FasterDump -- again relocating
 * displaced survivors to their home slot where possible.
 */
PRIVATE void FastDump(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    register int i;
    register t_HashEl *HashEl;
    int Progress = 0;

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"fast dump nocc < %d, from %d/%d words, %d/%d slots",
	DumpThresh, WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );

    /** We go through the hash table an element at a time.
     ** We clear all slots that contain fewer than DumpThresh
     ** entries.
     ** Other slots are tested to see if they are in the right
     ** place or not.  If not, we see if we can put them in the
     ** right slot (because it's empty or because we're going to
     ** dump that slot, so we can make it empty).
     ** If the entry is in the wrong place and we can't easily
     ** correct that, we write it out.
     **/
    for (i = 0; i != LastEl; i++) {
	int ForceDump;

	if (!SymbolTable[i]) continue;

	HashEl = SymbolTable[i];
	ForceDump = 0;

	if (HashEl->PlacesUsed > DumpThresh) {
	    /* We're not going to dump this entry.
	     * If there was a collision (two or more entries have the same
	     * hash code), we mustn't simply delete the entry at the hashed
	     * slot, because then we would no longer find this one.
	     */
	    int Slot = Hash(HashEl->Word, (int) HashEl->WordLength);

	    if (Slot != i) {
		/* It's in the wrong place... see if we can put it back: */
		if (SymbolTable[Slot] == 0) {
		    /* In this case we can simply put it in the right place*/
		    SymbolTable[Slot] = HashEl;
		    SymbolTable[i] = 0;
		} else if (SymbolTable[Slot]->PlacesUsed <= DumpThresh) {
		    /* In this case the entry at the other location is
		     * going to be dumped, so we can swap them even though
		     * that puts the other entry temporarily in the wrong
		     * place.  We're about to delete it anyway.
		     */
		    register t_HashEl *tmp = SymbolTable[Slot];

		    SymbolTable[Slot] = HashEl;
		    HashEl = SymbolTable[i] = tmp;
		} else {
		    /* In this case, it's in the wrong place, and the
		     * entry in the other slot isn't going to be deleted.
		     * But it might be moved.  This (we hope) is rare
		     * enough that we handle it by giving up and
		     * dumping it anyway:
		     */
		    ForceDump = 1;
		}
		if (!SymbolTable[i]) {
		    continue; /* dealt with slot i. */
		}
	    }
	}
	if (HashEl->PlacesUsed <= DumpThresh || ForceDump) {
	    DumpOneEntry(db, &SymbolTable[i], CallFree);
	}

	if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	    if (i >= Progress * (HashSize / 5)) {
		LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
		    "progress: %d%% %d words/%d slots",
		    Progress * 20, WordsInCache, SlotsUsed
		);
		++Progress;
	    }
	}
    }
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"cache now %d/%d, slots %d/%d",
	WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );
    FirstTimeRound = 0;
}

/* Flush the word cache to disk.
 * A partial (DUMP_FAST, non-DUMP_SYNC) flush runs a small state machine
 * alternating FastDump (evict rare words, with a rising threshold) and
 * FasterDump (evict common words, with a falling cap) until the
 * MUST_DUMP_MORE predicate is satisfied.  A full flush writes everything
 * out, flushes all the lower-level caches, and re-acquires write access.
 */
LIBRARY void DumpCache(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"Cache write from %d/%d words, %d/%d slots",
	WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );

    if (DumpThresh == -1) {
	/* fast dumping disabled: force a full synchronous flush */
	CallFree |= DUMP_SYNC;
    } else if ((CallFree & DUMP_SYNC) == 0 || (CallFree & DUMP_FAST) != 0) {
	int SaveThresh = DumpThresh;	/* restored before returning */
	int Multiplier = 2;
	int MaxAllowed = 0;
	int State = 1;
	int needFaster = 0;

	/* ensure that we always do something useful:*/
	if (WordsInCache) {
	    DumpOldest(db, CallFree);
	}
	if (WordsInCache) {
	    MaxAllowed = LargestNumberOfPlacesInOneSlot / 4;
	    FasterDump(db, CallFree, MaxAllowed);
	}

	/* now try and get below the dump threshhold: */
	while (MUST_DUMP_MORE(
	    WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
	)) {
	    switch (State) {
	    case 0: case 5:
		MaxAllowed /= 4; /* should really use sqrt() */
		FasterDump(db, CallFree, MaxAllowed);
		needFaster = 0;
		State = 1;
		break;
	    case 1:
		if (MUST_FREE_MORE_SLOTS(
		    WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
		)) {
		    FastDump(db, CallFree);
		    needFaster = 1;
		    DumpThresh *= Multiplier;
		    ++Multiplier;
		} else {
		    MaxAllowed /= 4; /* should really use sqrt() */
		    FasterDump(db, CallFree, MaxAllowed);
		    needFaster = 0;
		}
		State = 2;
		break;
	    case 2: case 4:
		MaxAllowed /= 4; /* should really use sqrt() */
		FasterDump(db, CallFree, MaxAllowed);
		needFaster = 0;
		State++;
		break;
	    case 3:
		FastDump(db, CallFree);
		needFaster = 1;
		DumpThresh *= Multiplier;
		++Multiplier;
		State++;
		/* no break: State 4 runs on the next loop iteration */
	    }
	}
	if (needFaster && WordsInCache) {
	    MaxAllowed /= 2; /* should really use sqrt() */
	    FasterDump(db, CallFree, MaxAllowed);
	}
	DumpThresh = SaveThresh;
	LargestNumberOfPlacesInOneSlot = 1;
	return;
    }

    /* Full flush from here on. */
    if (WordsInCache && LargestNumberOfPlacesInOneSlot > 4) {
	/* Dump the larger entries first, in the hopes that they will get
	 * contiguous blocks appended to their existing entries:
	 */
	FasterDump(db, CallFree, (int) (LargestNumberOfPlacesInOneSlot / 4));
	LargestNumberOfPlacesInOneSlot = 0;
    }

    /* Now dump all remaining entries: */
    if (WordsInCache) {
	FasterDump(db, CallFree, 0);
    }

#ifdef ASCIITRACE
    if (WordsInCache || SlotsUsed) {
	Error(E_BUG, "Cache, %d words left, %d slots [%s:%d]",
	    WordsInCache, SlotsUsed, __FILE__, __LINE__
	);
    }
#endif
    WordsInCache = 0;

    /* allow other processes a chance to do some reading: */
    /* TODO: use LQT_Sync */
    (void) LQT_WriteCurrentMaxWID(db);
    (void) LQT_SyncAndCloseAllKeyValueDatabases(db);
    (void) LQTpFlushWIDCache(db);
    (void) LQTp_FlushLastBlockCache(db);
    (void) LQT_FlushBlockCache(db);

    /* Closing the database loses us our write access...
     * so ask for it back again:
     */
    LQT_ObtainWriteAccess(db);

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "Cache now empty");

    /* The first time round we don't bother looking up words to see
     * if they already have a WID, since if the database is new, no
     * WIDs have been assigned yet.  This is a big win if you add a lot
     * of files in the first go, and a lose otherwise, I suspect.
     * Anyway, after this time, we'll look WIDS up:
     */
    FirstTimeRound = 0;
}

/* Write a word that is not yet in the index: allocate a WID, build a
 * pblock (the on-disk occurrence-list image), and insert it into the
 * WID index.  The pblock buffer is cached in a static between calls,
 * unless it grew beyond MAX_KEEP bytes.
 */
PRIVATE void NewEntry(db, HashEl)
    t_LQTEXT_Database *db;
    t_HashEl *HashEl;
{
    static t_pblock *pblock = 0;
    static unsigned int pblockSize = 0;
#define MAX_KEEP 2000 /* don't keep it around if it's bigger than this */
    register int i, k;
    t_WordInfo *WordInfo;

    /* TODO: add MightNeedToSort check */
    /* TODO: allow pblock to be reclaimed when memory is low */

    /** make a WIDIndex entry and mark it as invalid (NOTDONE) */
    /* In order to do this, we must make a "pblock", a structure that
     * reflects the physical database.  This is fairly low-level stuff
     * for efficiency's sake...
     *
     * For even more speed, we keep the last pblock we used around, if it
     * is not too large.
     */
    {
	unsigned int newSize = sizeof(t_pblock) +
	    HashEl->PlacesUsed * sizeof(t_WordPlace);

	if (newSize > pblockSize || !pblockSize) {
	    /* allocate a pblock structure.  These are rather devious things,
	     * a structure with an array tacked onto the end.
	     */
	    if (pblockSize && pblock != (t_pblock *) 0) {
		efree((char *) pblock);
	    }
	    pblockSize = newSize;
	    pblock = (t_pblock *) emalloc(
		"wordtable::NewEntry.pblock", pblockSize
	    );
	}
    }
    pblock->ChainStart = 0L; /* address on disk -- not there yet, so 0! */
    pblock->NumberOfWordPlaces = HashEl->PlacesUsed;

    /* fill in the WordPlaces */
    for (i = k = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[k] = HashEl->Places[i]; /* struct copy */
	++k;
    }

    /* if k is 0, they are all deleted, but that's unaccepatable.
     * help!
     */
    if (k == 0) {
#ifdef ASCIITRACE
	Error(E_WARN|E_INTERNAL, "no places to add for %s", HashEl->Word);
#endif
	if (pblockSize > MAX_KEEP) {
	    efree((char *) pblock);
	    pblock = 0;
	    pblockSize = 0;
	}
	return;
    }

    if (HashEl->WID == 0) {
	/* first time this word has ever been seen: allocate its WID */
	HashEl->WID = LQT_WriteWordAndWID(
	    db, HashEl->Word, HashEl->WordLength,
	    LQTp_GetMaxOrAllocateWID(db, 0)
	);
    }

    /* Now fill in enough of WordInfo to let us use the low-level routines: */
    WordInfo = LQT_MakeWordInfo(
	db, HashEl->WID, HashEl->WordLength, (unsigned char *) HashEl->Word
    );
    pblock->WID = HashEl->WID;
    WordInfo->Offset = 0L;
    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock(db)) {
	(void) LQT_MakeWordInfoBlock(db, WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere != pblock->NumberOfWordPlaces) {
	/* In this case, it didn't all fit into the WID index block: */
	(void) LQT_Writepblock(db, WordInfo, pblock);
    }

    if (LQT_PutWordInfoIntoIndex(db, WordInfo, pblock->ChainStart) < 0) {
	Error(E_SYS|E_FATAL,
	    "NewEntry: Couldn't insert \"%s\" in database at 0x%lx",
	    WordInfo->Word, pblock->ChainStart);
    }

    /** reclaim storage */
    LQT_DestroyWordInfo(db, WordInfo);

    if (pblockSize > MAX_KEEP) {
	efree((char *) pblock);
	pblock = 0;
	pblockSize = 0;
    }
}

/* Merge new occurrences of an already-indexed word into the database:
 * either append to the word's existing on-disk chain (when it has one),
 * or read back the old places, merge, sort if needed, and rewrite.
 */
PRIVATE void UpdateEntry(db, HashEl)
    t_LQTEXT_Database *db;
    t_HashEl *HashEl;
{
    register int i;
    t_pblock *pblock;
    t_WordInfo *WordInfo;
    int MightNeedToSort = 0;

#ifdef ASCIITRACE
    if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) {
	LQT_Trace(LQTRACE_WORDINFO,
	    "UpdateEntry(%s/WID %ld, wordlen %d)",
	    HashEl->Word, HashEl->WID, HashEl->WordLength
	);
    }
#endif

    /** get the old entry */
    if (!HashEl->WID || !(WordInfo = LQT_WIDToWordInfo(db, HashEl->WID))) {
	/* fall back to creating it from scratch */
	Error(E_BUG, "Word %s WID %ld went away!", HashEl->Word, HashEl->WID);
	NewEntry(db, HashEl);
	return;
    }

#ifdef ASCIITRACE
    if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) {
	LQT_fprintWordInfo(db, stderr, WordInfo, "UpdateEntry");
    }
#endif

    if (WordInfo->Offset) {
	/* the word has an overflow chain on disk: just append to it */
	(void) LQT_AddWordPlaces(
	    db, HashEl->Places, HashEl->WID, WordInfo->Offset,
	    (unsigned long) HashEl->PlacesUsed
	);
	(void) LQT_DestroyWordInfo(db, WordInfo);
	return;
    }

    if (WordInfo->WordPlacesInHere == WordInfo->NumberOfWordPlaces) {
	/* everything fits in the WID index block; no separate pblock */
	pblock = (t_pblock *) 0;
    } else {
	pblock = LQT_Getpblock(db, WordInfo);
    }

    if (pblock) {
	/* grow the existing pblock to hold the new places as well */
	pblock = (t_pblock *) erealloc((char *) pblock,
	    sizeof(t_pblock) +
	    (pblock->NumberOfWordPlaces + HashEl->PlacesUsed) *
	    sizeof(t_WordPlace));
    } else {
	pblock = (t_pblock *) emalloc(
	    "wordtable:NewEntry.pblock",
	    sizeof(t_pblock) +
	    (WordInfo->WordPlacesInHere + HashEl->PlacesUsed) *
	    sizeof(t_WordPlace)
	);
	pblock->NumberOfWordPlaces = 0;

	if (WordInfo->WordPlacesInHere < WordInfo->NumberOfWordPlaces) {
	    if (WordInfo->WordPlaceStart) {
		WordInfo->WordPlaces = LQT_GetWordPlaces(
		    db, WordInfo->WID, WordInfo->WordPlaceStart,
		    WIDBLOCKSIZE -
			(WordInfo->WordPlaceStart - WordInfo->DataBlock),
		    0L, &WordInfo->NumberOfWordPlaces
		);
	    }
	}

	/* Assert: the wordplaces in WordInfo are sorted */
	for (i = 0; i < WordInfo->NumberOfWordPlaces; i++) {
	    pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
		WordInfo->WordPlaces[i]; /* structure copy */
	}
    }

    /* delete the old entry from disk */
    if (WordInfo->Offset) {
	/* Remove the old information from disk.
	 * This isn't as bad as it sounds, as it will be at the start
	 * of the freelist, so when we write it out again it will be
	 * in the buffer cache...  But it would still be faster to append.
	 */
	LQT_DeleteWordPlaces(db, WordInfo->Offset, WordInfo->WID);
    }

    pblock->WID = HashEl->WID;
    WordInfo->Offset = pblock->ChainStart = 0L; /* it's invalid now... */

    /* Merge the WordPlaces */
    /* Assert: we need only compare the last old entry and the
     * first new one to see if we might need a sort.  Note that
     * there must _be_ entries in pblock, as otherwise we'd have called
     * NewEntry() and not UpdateEntry().
     */
    if (pblock->WordPlaces[pblock->NumberOfWordPlaces - 1].FID >=
	HashEl->Places[0].FID) {
	MightNeedToSort = 1;
    }

    for (i = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
	    HashEl->Places[i]; /* copy the struct: */
	/* TODO: call qcmp to check for sorting (actually only need to
	 * check the FIDs of the new entries)
	 */
    }

    if (MightNeedToSort) {
	LQT_SortWordPlaces(db, pblock->NumberOfWordPlaces, pblock->WordPlaces);
    }

    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock(db)) {
	(void) LQT_MakeWordInfoBlock(db, WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere == pblock->NumberOfWordPlaces) {
	/* In this case it all fits into the WID index */
	pblock->ChainStart = 0L;
    } else {
	(void) LQT_Writepblock(db, WordInfo, pblock);
    }

    if (LQT_PutWordInfoIntoIndex(db, WordInfo, pblock->ChainStart) < 0) {
	Error(E_FATAL|E_SYS,
	    "UpdateEntry: Couldn't update \"%s\" in database at 0x%lx",
	    WordInfo->Word, pblock->ChainStart
	);
    }

    /** reclaim storage */
    if (pblock) {
	(void) efree((char *)pblock);
    }
    (void) LQT_DestroyWordInfo(db, WordInfo);
}