/* wordtable.c -- Copyright 1989, 1993, 1994, 1996 Liam R. E. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file ../COPYRIGHT for full details.
 */

/* Symbol Table Interface to text retrieval database.
 * Handles both the internal and external indexes.
 *
 * This originally used a linked list.  Converting to a hash table reduced
 * the time to index comp.os.vms from nearly an hour to one and a half
 * minutes...
 *
 * Liam Quin, 1989
 */

/*
 * $Id: wordtable2.c,v 1.1 1996/08/14 17:03:46 lee Exp $
 */

#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>

#ifdef HAVE_SYSV_FCNTL_H
# include <sys/fcntl.h>
#endif
#ifdef HAVE_FCNTL_H
# include <fcntl.h> /* for O_RDWR etc. */
#endif

#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif

#include "smalldb.h"
#include "fileinfo.h"
#include "wordinfo.h"
#include "wordplace.h"
#include "pblock.h"
#include "wordrules.h"
#include "emalloc.h"
#include "addfile.h"
#include "lqutil.h"
#include "liblqtext.h"
#include "lqtrace.h"

typedef struct s_HashEl {
    t_WID WID;
    int PlacesUsed;
    t_WordPlace *Places;
    unsigned char WordLength;
    char Word[1];	/* the word itself is allocated on the end
			 * of the struct, hence the extra bytes in
			 * the emalloc() call in AddWord()
			 */
} t_HashEl;

struct s_TrieNode;
typedef struct s_TrieNode *t_TrieChildren[256];

typedef struct s_TrieNode {
    t_HashEl *This;
    t_TrieChildren *Children;
    unsigned long ChildCount;
} t_TrieNode;

PRIVATE t_HashEl **
FindInTrie(Word, Length, Parentp)
    char *Word;
    int Length;
    t_TrieNode **Parentp;
{
    if (Length <= 0) {
	Error(E_BUG|E_WARN,
	    "%s: %d: got to the end of the trie!",
	    __FILE__, __LINE__
	);
	return (t_HashEl **) 0;
    } else {
	t_TrieNode **Next;

	if (!*Parentp) {
	    *Parentp = (t_TrieNode *) emalloc(
		"Parent Trie Node", sizeof(t_TrieNode)
	    );
	    (*Parentp)->This = (t_HashEl *) 0;
	    (*Parentp)->Children = (t_TrieChildren *) 0;
	    (*Parentp)->ChildCount = (unsigned long) 0;
	}

	if (!(*Parentp)->Children) {
	    (*Parentp)->Children = (t_TrieChildren *) ecalloc(
		"Trie Children", 1, sizeof(t_TrieChildren)
	    );
	}

	Next = &(*(*Parentp)->Children)[(unsigned char) *Word];

	if (!*Next) {
	    *Next = (t_TrieNode *) emalloc("Trie Node", sizeof(t_TrieNode));
	    (*Next)->This = (t_HashEl *) 0;
	    (*Next)->Children = (t_TrieChildren *) 0;
	    (*Next)->ChildCount = (unsigned long) 0;
	    /* The parent now has a new child: */
	    (*Parentp)->ChildCount++;
	}

	if (Length == 1) {
	    return &(*Next)->This;
	} else {
	    return FindInTrie(&Word[1], Length - 1, Next);
	}
    }
}

static t_TrieNode *HashTable = 0;

API void
LQT_AddOccurrenceToIndex(db, WordInfo)
    t_LQTEXT_Database *db;
    t_WordInfo *WordInfo;
{
    t_HashEl **Hpp;

    Hpp = FindInTrie(WordInfo->Word, WordInfo->Length, &HashTable);
    /* TODO: add a place */
}

/* dumping the cache */

/* We have to walk the trie.
 * [A] Good candidates for dumping and deleting from the trie are:
 *  (1) words that have not been updated since we last dumped the cache;
 *  (2) words that have not been updated recently and that occur infrequently
 *
 * [B] Good candidates for dumping but retaining the child node are:
 *  (1) words that occur many, many times (e.g. and, the... in English)
 *	We should retain the occurrences for the current file, if there
 *	are not too many of them, so we write them all out in one lump and
 *	avoid the overhead of a repeated 3-byte header...
 *
 * [C] Poor candidates for dumping:
 *  (1) words that occurred recently -- chances are that we'll see more
 *	occurrences of them soon.  If there are lots of them, though, they
 *	fall into [B](1) above, and should be dumped.
 *
 * As we go, we trim the trie, deleting empty leaf nodes.
 */
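/* A minimal sketch of the depth-first trim described above.  This is
 * illustrative only: TrimTrie() is a hypothetical helper, not part of
 * the lqtext library, and it assumes a node is "empty" when it holds
 * no t_HashEl and no live children.  Children are visited before the
 * parent, so a subtree emptied by the recursion can itself be freed;
 * a caller might invoke it as TrimTrie(&HashTable) after dumping the
 * payloads.
 */
#ifdef TRIE_TRIM_EXAMPLE
PRIVATE int
TrimTrie(Nodep)
    t_TrieNode **Nodep;	/* in/out: set to 0 if the node is freed */
{
    t_TrieNode *Node = *Nodep;
    unsigned long Live = 0;
    int i;

    if (!Node) {
	return 1; /* an absent node counts as empty */
    }

    if (Node->Children) {
	/* children first, so empty leaves go before their parents: */
	for (i = 0; i < 256; i++) {
	    if (!TrimTrie(&(*Node->Children)[i])) {
		Live++;
	    }
	}
	if (Live == 0) {
	    efree((char *) Node->Children);
	    Node->Children = (t_TrieChildren *) 0;
	}
	Node->ChildCount = Live;
    }

    if (Live == 0 && Node->This == (t_HashEl *) 0) {
	efree((char *) Node);
	*Nodep = (t_TrieNode *) 0;
	return 1; /* freed: tell the caller this slot is now empty */
    }
    return 0;
}
#endif /* TRIE_TRIM_EXAMPLE */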
/* This trie-based version of DumpCache is not yet implemented; guard it
 * out so that it doesn't collide with the hash-table version later in
 * this file, which is the one that is actually compiled:
 */
#ifdef USE_TRIE_CACHE
LIBRARY void
DumpCache(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    /* depth-first traversal of the tree, so that we always dump
     * children of a node before the node itself.  That way, when we
     * leave a node, we check if it's empty and, if so, delete it
     */
}
#endif /* USE_TRIE_CACHE */

#ifndef HASHSIZ
# define HASHSIZ 32768 /* MUST be a power of two */
#endif /*!HASHSIZ*/

#ifndef MAXWORDSINCACHE
# define MAXWORDSINCACHE (HASHSIZ * 10)
#endif

int MaxWordsInCache = MAXWORDSINCACHE;

/** System calls and library functions used in this file: **/

/** Lqtext calls */

/** calls within this file **/
PRIVATE void NewEntry(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    t_HashEl *HashEl
#endif
);

PRIVATE void UpdateEntry(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    t_HashEl *HashEl
#endif
);

PRIVATE void FasterDump( /* not defined in this file; the signature is
			  * assumed from the calls in DumpCache() below */
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    int CallFree,
    int MaxAllowed
#endif
);

void DumpCache( /* see ../h/addfile.h */
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    int CallFree
#endif
);

/**/

extern char *progname;

PRIVATE int HashSize = HASHSIZ; /* MUST be a power of two */

API int
SetHashSize(db, theNewSize)
    t_LQTEXT_Database *db;
    int theNewSize;
{
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"Hash table size changed from %ld to %ld",
	(long) HashSize, (long) theNewSize
    );
    return HashSize = theNewSize;
}

#define NPLACES 2
#define NPLACESBIGINCR 8
#define NPLACESHUGEINCR 64 /* SPARC: most efficient if this is a power of 2 */
/* This is small to optimise the common case -- by far the majority of
 * words are used fewer than 10 times.  In the cases where we've gone
 * wrong, well, there'll be a few thousand.  We add slowly until we
 * get to NPLACES * 3, and then we go up in NPLACESBIGINCR lumps.
 */

PRIVATE t_HashEl **SymbolTable;
PRIVATE int LastEl = 0;
PRIVATE int WordsInCache = 0;
PRIVATE int DumpThresh = DUMP_FAST_THRESH;
PRIVATE int SlotsUsed = 0;

void
SetDumpThresh(db, Thresh)
    t_LQTEXT_Database *db;
    int Thresh;
{
    /* Set the threshold for fast dumping.
     * If a word has fewer than this many occurrences in the cache, it
     * gets written out.  -1 disables this feature, and 0 uses the default.
     */
    DumpThresh = Thresh;

    if (!DumpThresh) {
	DumpThresh = DUMP_FAST_THRESH;
    }
}

PRIVATE void
InitHash(db)
    t_LQTEXT_Database *db;
{
    long tmp = 1;

    /* round HashSize up to the next power of two: */
    while (tmp < HashSize) {
	tmp <<= 1;
    }
    HashSize = tmp;

    if (HashSize < 1) {
	Error(E_FATAL,
	    "InitHash: hash size (%d/%d) is too small!\n",
	    HashSize, MaxWordsInCache
	);
    }

    SymbolTable = (t_HashEl **) ecalloc(
	"AddWord symbol table", HashSize, sizeof(t_HashEl *)
    );

    LastEl = HashSize; /* Used as a sentinel */

#ifdef ASCIITRACE
    if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	    "allocated %ld hash slots for up to %ld words",
	    (long) HashSize, (long) MaxWordsInCache
	);
    }
#endif
}

#ifndef Hash
PRIVATE INLINE int
Hash(Word, Length)
    register char *Word;
    register int Length;
{
    register unsigned long n = 0;

#ifndef NODUFF /* clever stuff for speedup... dmr-approved!... */
#define HASHC n = *Word++ + 65599 * n

    if (Length > 0) {
	register int loop = (Length + 8 - 1) >> 3;

	switch (Length & (8 - 1)) {
	case 0:	do {	HASHC;
	case 7:		HASHC;
	case 6:		HASHC;
	case 5:		HASHC;
	case 4:		HASHC;
	case 3:		HASHC;
	case 2:		HASHC;
	case 1:		HASHC;
		} while (--loop);
	}
    }
#else /* NODUFF */
    while (Length--) {
	n = *Word++ + 65599 * n;
    }
#endif /* NODUFF */

    /* Since HashSize is a power of two, these are equivalent,
     * but on systems where integer division and modulus are done
     * by the FPU, one may be a lot faster than the other.
     */
    return n & (HashSize - 1);
    /** return n % HashSize; **/
}
#endif /* Hash */
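/* Hash() above relies on the identity n % 2^k == n & (2^k - 1) for
 * unsigned n, which holds for any power-of-two table size.  The helper
 * below is a standalone illustration of that claim, not part of the
 * library (hence the guard); it returns nonzero if mask and modulus
 * agree for a run of sample values.
 */
#ifdef HASH_MASK_EXAMPLE
PRIVATE int
CheckMaskIdentity()
{
    unsigned long Size = HASHSIZ;	/* a power of two, like HashSize */
    unsigned long n;

    for (n = 0; n < 100000; n += 7) {
	if ((n & (Size - 1)) != (n % Size)) {
	    return 0;	/* cannot happen while Size is a power of two */
	}
    }
    return 1;
}
#endif /* HASH_MASK_EXAMPLE */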
PRIVATE char FirstTimeRound = 1;
PRIVATE t_FID CurrentFID = 0;

static int LargestNumberOfPlacesInOneSlot = 1;

/*extern*/ void
AddWord(db, WordInfo)
    t_LQTEXT_Database *db;
    t_WordInfo *WordInfo;
{
    register t_HashEl *HashEl = (t_HashEl *) 0;
    int Slot, FirstSlot;
    static t_FID LastFID = 0;
    int ProbeCount = 0;
    register t_HashEl **Slotp;

    if (!WordInfo || !WordInfo->Word || !WordInfo->Word[0]) {
	Error(E_WARN, "Null word in AddWord(0x%x)", WordInfo);
	return;
    }

#ifdef ASCIITRACE
    LQT_Trace(LQTRACE_WORDINFO,
	" <%s/%d, b %d, w %lu f %d s %d> ",
	WordInfo->Word, WordInfo->Length,
	WordInfo->WordPlace.BlockInFile,
	WordInfo->WordPlace.WordInBlock,
	WordInfo->WordPlace.Flags,
	WordInfo->WordPlace.StuffBefore
    );
#endif

    if (!LastEl) {
	InitHash(db);

	if (FirstTimeRound) {
	    /* Special check to save looking up the WIDs first time round */
	    t_WID W = LQT_GetMaxWID(db);

	    if (W == 0L) {
		LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
		    "first ever run, allocating WIDs."
		);
		FirstTimeRound = 1; /* actually it's already 1 here */
	    } else {
		FirstTimeRound = 0;
	    }
	}
    } else {
	FirstTimeRound = 0;

	if (MUST_DUMP(WordsInCache, MaxWordsInCache, SlotsUsed, HashSize)) {
	    DumpCache(db, DUMP_CACHE|DUMP_FAST);
	}

	if (WordInfo->WordPlace.FID != LastFID) {
	    if (WordInfo->WordPlace.FID == 0L) {
		Error(E_BUG, "AddWord: FID 0 for \"%s\"", WordInfo->Word);
	    } else if (LastFID != 0) {
		/* quick opportunity to get rid of words that
		 * were only in the previous file -- this takes advantage
		 * of the fact that vocabulary is document-specific,
		 * i.e. you tend to get words used in one file and not
		 * another.
		 */
		CurrentFID = WordInfo->WordPlace.FID;

		if (MUST_FILE_DUMP(
		    WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
		)) {
		    DumpCache(db, DUMP_CACHE|DUMP_FAST);
		}
	    }
	    LastFID = WordInfo->WordPlace.FID;
	}
    }

    FirstSlot = Slot = Hash(WordInfo->Word, WordInfo->Length);
    Slotp = &SymbolTable[Slot];

    for (ProbeCount = 0; ProbeCount < MAXPROBES; ProbeCount++) {
	if (*Slotp == (t_HashEl *) NULL) {
	    /* make a new element */
	    HashEl = *Slotp = (t_HashEl *) emalloc(
		"wordtable::AddWord.HashEl",
		sizeof(t_HashEl) + WordInfo->Length
	    );
	    HashEl->PlacesUsed = 0;
	    HashEl->Places = (t_WordPlace *) emalloc(
		"SetElEmpty.Places", sizeof(t_WordPlace) * NPLACES
	    );
	    (void) strncpy(&HashEl->Word[0], WordInfo->Word, WordInfo->Length);
	    HashEl->Word[WordInfo->Length] = '\0';
	    HashEl->WordLength = (unsigned char) WordInfo->Length;

	    if (FirstTimeRound) {
		/* No point looking for the WID, it won't be there;
		 * code in DumpCache() will assign a new one.
		 * LQT_WordToWID() will always return zero on a new index.
		 */
		HashEl->WID = 0;
	    } else {
		HashEl->WID = LQT_WordToWID(db, HashEl->Word, WordInfo->Length);
	    }
	    ++SlotsUsed;
	    break;
	} else if (
	    /* first compare the first letter (26 possibilities, not all
	     * equiprobable, but more than 10 likely ones):
	     */
	    ((*Slotp)->Word[0] == WordInfo->Word[0]) &&
	    /* next compare the lengths, (maxwordlen - minwordlen) possible
	     * values, so more likely to be the same:
	     */
	    ((*Slotp)->WordLength == WordInfo->Length) &&
	    /* well, if the 1st letter and the length are the same,
	     * compare the whole word:
	     */
	    !strcmp((*Slotp)->Word, WordInfo->Word)
	) {
	    HashEl = (*Slotp);

	    if (Slot != FirstSlot) {
		if (
		    HashEl->PlacesUsed >
		    SymbolTable[FirstSlot]->PlacesUsed + 2
		) {
		    /* Swap so we find it first time next time round.
		     * This is inefficient if the words occur approximately
		     * alternately with equal frequency.
		     * The +2 requirement stops this happening every time
		     * round even in the worst case.
		     */
		    t_HashEl *tmp = SymbolTable[FirstSlot];

		    HashEl = SymbolTable[FirstSlot] = (*Slotp);
		    *Slotp = tmp;
		    Slotp = &SymbolTable[Slot = FirstSlot];
		}
	    }
	    break;
	}

	if ((Slot += 17) >= HashSize) {
	    Slot &= (HashSize - 1);
	}
	/** Note: HashSize is a power of 2, so we can step through it
	 ** in any prime number, but doing so in little chunks is best
	 ** because we don't hit so many pages, but at the same time
	 ** step over clusters.  I hope.
	 **/

	/** If the hash table is full, no point looking further: **/
	if (ProbeCount + 1 >= MAXPROBES) {
#ifdef ASCIITRACE
	    LQT_Trace(LQTRACE_DEBUG,
		"(%d probes didn't find a slot for %s)",
		ProbeCount, WordInfo->Word
	    );
#endif
	    CurrentFID = WordInfo->WordPlace.FID;
	    DumpCache(db, DUMP_CACHE|DUMP_FAST);
	    Slot = FirstSlot;
	    ProbeCount = 0;
	}
	Slotp = &SymbolTable[Slot];
    }

    /* check to see if there is already an occurrence of this word
     * for this file:
     */

    /* If we get here, all we need to do is add the WordPlace */
    {
	long newValue = 0;

	switch (HashEl->PlacesUsed) {
	case NPLACES:		  newValue = NPLACES * 2;	  break;
	case NPLACES * 2:	  newValue = NPLACES * 3;	  break;
	case NPLACES * 3:	  newValue = NPLACESBIGINCR;	  break;
	case NPLACESBIGINCR:	  newValue = NPLACESBIGINCR * 2;  break;
	case NPLACESBIGINCR * 2:  newValue = NPLACESBIGINCR * 3;  break;
	case NPLACESBIGINCR * 3:  newValue = NPLACESHUGEINCR;	  break;
	case NPLACESHUGEINCR:	  newValue = NPLACESHUGEINCR * 2; break;
	case NPLACESHUGEINCR * 2: newValue = NPLACESHUGEINCR * 3; break;
	default:
	    if (
		HashEl->PlacesUsed > NPLACESHUGEINCR &&
		((HashEl->PlacesUsed % NPLACESHUGEINCR) == 0)
	    ) {
		newValue = HashEl->PlacesUsed + NPLACESHUGEINCR;
	    }
	}

	if (newValue) {
	    HashEl->Places = (t_WordPlace *) erealloc(
		(char *) HashEl->Places,
		sizeof(t_WordPlace) * newValue
	    );
	}
    }

    HashEl->Places[HashEl->PlacesUsed++] = WordInfo->WordPlace;

    if (HashEl->PlacesUsed > LargestNumberOfPlacesInOneSlot) {
	LargestNumberOfPlacesInOneSlot = HashEl->PlacesUsed;
    }

    WordsInCache++;

#ifdef ASCIITRACE
    if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) {
	LQT_Trace(LQTRACE_WORDINFO,
	    "Slot %d Word %s len %d places %d",
	    Slot, SymbolTable[Slot]->Word,
	    WordInfo->Length, SymbolTable[Slot]->PlacesUsed
	);
    }
#endif
    return;
}
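/* The switch at the end of AddWord above implements a growth schedule
 * for a Places array: 2, 4, 6, 8, 16, 24, 64, 128, 192, then +64 each
 * time after that.  The function below restates that schedule as a
 * standalone helper; it is illustrative only (not used by the library)
 * and assumes it is called only when n is exactly at a schedule point.
 */
#ifdef PLACES_GROWTH_EXAMPLE
PRIVATE long
NextPlacesSize(n)
    int n;	/* current capacity, exactly at a schedule boundary */
{
    if (n < NPLACES * 3)
	return n + NPLACES;		/* 2 -> 4 -> 6 */
    if (n == NPLACES * 3)
	return NPLACESBIGINCR;		/* 6 -> 8 */
    if (n < NPLACESBIGINCR * 3)
	return n + NPLACESBIGINCR;	/* 8 -> 16 -> 24 */
    if (n == NPLACESBIGINCR * 3)
	return NPLACESHUGEINCR;		/* 24 -> 64 */
    if (n % NPLACESHUGEINCR == 0)
	return n + NPLACESHUGEINCR;	/* 64 -> 128 -> 192 -> ... */
    return 0;	/* not at a boundary: no growth needed yet */
}
#endif /* PLACES_GROWTH_EXAMPLE */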
PRIVATE void
DumpOneEntry(db, theHashEl, WordLength)
    t_LQTEXT_Database *db;
    t_HashEl *theHashEl;
    int WordLength; /* avoids extra calls to strlen() */
{
    /* We are going to make a new index entry for the word.
     * There are two cases -- depending on whether the word
     * is already indexed or not.
     * In the former case we must merge the new information.
     * In the latter case we don't have to read the old info,
     * but we must make a new entry in the WID Index.
     */

    WordsInCache -= theHashEl->PlacesUsed;

    if (theHashEl->PlacesUsed >= LargestNumberOfPlacesInOneSlot) {
	/* well, it's certainly wrong now, so guess: */
	LargestNumberOfPlacesInOneSlot = 1;
    }

    if (theHashEl->WID == 0) {
	NewEntry(db, theHashEl);
    } else {
	UpdateEntry(db, theHashEl);
    }

    /* Reclaim storage */
    /* Note: no point testing CallFree, as it's always set if
     * we're in here.  There's no point using the partial cache
     * dump routine if you're not going to free the slots, as then
     * you get left with a corrupt symbol table!  It's only useful
     * for the final total cache dump.
     */
    efree((char *) theHashEl->Places);
    efree((char *) theHashEl);
}
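/* AddWord above resolves collisions by open addressing with a fixed
 * stride of 17.  Because HashSize is a power of two and 17 is odd, the
 * stride is coprime with the table size, so the probe sequence visits
 * every slot exactly once before repeating.  The function below is a
 * standalone check of that claim (illustrative only; ProbeCycleLength
 * is not a library function):
 */
#ifdef PROBE_EXAMPLE
PRIVATE int
ProbeCycleLength(Size)
    int Size;	/* a power of two, as HashSize is */
{
    int Slot = 0;
    int Steps = 0;

    do {
	/* the same stepping code AddWord uses: */
	if ((Slot += 17) >= Size) {
	    Slot &= (Size - 1);
	}
	Steps++;
    } while (Slot != 0);

    return Steps;	/* equals Size whenever Size is a power of two */
}
#endif /* PROBE_EXAMPLE */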
PRIVATE void
FastDump(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    register int i;
    register t_HashEl *HashEl;
    int Progress = 0;

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"fast dump nocc < %d, from %d/%d words, %d/%d slots",
	DumpThresh,
	WordsInCache, MaxWordsInCache,
	SlotsUsed, HashSize
    );

    /** We go through the hash table an element at a time.
     ** We clear all slots that contain no more than DumpThresh
     ** entries.
     ** Other slots are tested to see if they are in the right
     ** place or not.  If not, we see if we can put them in the
     ** right slot (because it's empty or because we're going to
     ** dump that slot, so we can make it empty).
     ** If the entry is in the wrong place and we can't easily
     ** correct that, we write it out.
     **/
    for (i = 0; i != LastEl; i++) {
	int ForceDump;

	if (!SymbolTable[i]) continue;

	HashEl = SymbolTable[i];
	ForceDump = 0;

	if (HashEl->PlacesUsed > DumpThresh) {
	    /* We're not going to dump this entry.
	     * If there was a collision (two or more entries have the same
	     * hash code), we mustn't simply delete the entry at the hashed
	     * slot, because then we would no longer find this one.
	     */
	    int Slot = Hash(HashEl->Word, (int) HashEl->WordLength);

	    if (Slot != i) {
		/* It's in the wrong place...
		 * See if we can put it back:
		 */
		if (SymbolTable[Slot] == 0) {
		    /* In this case we can simply put it in the right place */
		    SymbolTable[Slot] = HashEl;
		    SymbolTable[i] = 0;
		} else if (SymbolTable[Slot]->PlacesUsed <= DumpThresh) {
		    /* In this case the entry at the other location is
		     * going to be dumped, so we can swap them even though
		     * that puts the other entry temporarily in the wrong
		     * place.  We're about to delete it anyway.
		     */
		    register t_HashEl *tmp = SymbolTable[Slot];

		    SymbolTable[Slot] = HashEl;
		    HashEl = SymbolTable[i] = tmp;
		} else {
		    /* In this case, it's in the wrong place, and the
		     * entry in the other slot isn't going to be deleted.
		     * But it might be moved.  This (we hope) is rare
		     * enough that we handle it by giving up and
		     * dumping it anyway:
		     */
		    ForceDump = 1;
		}

		if (!SymbolTable[i]) {
		    continue; /* dealt with slot i. */
		}
	    }
	}

	if (HashEl->PlacesUsed <= DumpThresh || ForceDump) {
	    DumpOneEntry(db, HashEl, HashEl->WordLength);
	    SymbolTable[i] = 0; /* freed by DumpOneEntry */
	    --SlotsUsed;
	}

	if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	    if (i >= Progress * (HashSize / 10)) {
		LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
		    "progress: %d%%", Progress * 10
		);
		++Progress;
	    }
	}
    }

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"cache now %d/%d, slots %d/%d",
	WordsInCache, MaxWordsInCache,
	SlotsUsed, HashSize
    );
    FirstTimeRound = 0;
}
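/* The relocation policy in FastDump above has three cases for a kept
 * entry found away from its home slot.  The helper below restates that
 * decision as a pure function over the home slot's occupant; it is a
 * sketch for illustration only, not part of the library:
 */
#ifdef RELOCATE_EXAMPLE
typedef enum {
    RELOC_MOVE,	/* home slot empty: move the entry home */
    RELOC_SWAP,	/* home slot holds a dumpable entry: swap them */
    RELOC_DUMP	/* home slot busy with a keeper: give up and dump */
} t_RelocAction;

PRIVATE t_RelocAction
ClassifyHomeSlot(Home, Thresh)
    t_HashEl *Home;	/* current occupant of the entry's home slot */
    int Thresh;		/* the dump threshold */
{
    if (!Home)
	return RELOC_MOVE;
    if (Home->PlacesUsed <= Thresh)
	return RELOC_SWAP;
    return RELOC_DUMP;
}
#endif /* RELOCATE_EXAMPLE */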
LIBRARY void
DumpCache(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"Cache write from %d/%d words, %d/%d slots",
	WordsInCache, MaxWordsInCache,
	SlotsUsed, HashSize
    );

    if (DumpThresh == -1) {
	CallFree |= DUMP_SYNC;
    } else if ((CallFree & DUMP_SYNC) == 0 || (CallFree & DUMP_FAST) != 0) {
	int SaveThresh = DumpThresh;
	int Multiplier = 2;
	int MaxAllowed = LargestNumberOfPlacesInOneSlot / 4;
	int State = 1; /* ensure that we always do something useful */

#ifdef USE_DUMPOLDEST /* not ready yet */
	DumpOldest(db, CallFree);
#endif
	FasterDump(db, CallFree, MaxAllowed);

	/* now try and get below the dump threshold: */
	while (MUST_DUMP_MORE(
	    WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
	)) {
	    switch (State) {
	    case 0:
		MaxAllowed /= 4; /* should really use sqrt() */
		FasterDump(db, CallFree, MaxAllowed);
		State = 1;
		break;
	    case 1:
		if (MUST_FREE_MORE_SLOTS(
		    WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
		)) {
		    FastDump(db, CallFree);
		    DumpThresh *= Multiplier;
		    ++Multiplier;
		} else {
		    MaxAllowed /= 4; /* should really use sqrt() */
		    FasterDump(db, CallFree, MaxAllowed);
		}
		State = 2;
		break;
	    case 2:
		MaxAllowed /= 4; /* should really use sqrt() */
		FasterDump(db, CallFree, MaxAllowed);
		State = 0;
		break;
	    }
	}
	DumpThresh = SaveThresh;
	LargestNumberOfPlacesInOneSlot = 1;
	(void) LQTpFlushWIDCache(db);
	return;
    }

    /* Dump the larger entries first, in the hopes that they will get
     * contiguous blocks appended to their existing entries:
     */
    FasterDump(db, CallFree, (int) (LargestNumberOfPlacesInOneSlot / 4));
    LargestNumberOfPlacesInOneSlot = 0;

    /* Now dump all remaining entries: */
    FasterDump(db, CallFree, 0);

#ifdef ASCIITRACE
    if (WordsInCache || SlotsUsed) {
	Error(E_BUG,
	    "Cache, %d words left, %d slots [%s:%d]",
	    WordsInCache, SlotsUsed, __FILE__, __LINE__
	);
    }
#endif
    WordsInCache = 0;

    /* allow other processes a chance to do some reading: */
    /* TODO: use LQT_Sync */
    (void) LQT_WriteCurrentMaxWID(db);
    (void) LQT_SyncAndCloseAllKeyValueDatabases(db);
    (void) LQTpFlushWIDCache(db);
    (void) LQTp_FlushLastBlockCache(db);
    (void) LQT_FlushBlockCache(db);

    /* Closing the database loses us our write access...
     * so ask for it back again:
     */
    LQT_ObtainWriteAccess(db);

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "Cache now empty");

    /* The first time round we don't bother looking up words to see
     * if they already have a WID, since if the database is new, no
     * WIDs have been assigned yet.  This is a big win if you add a lot
     * of files in the first go, and a lose otherwise, I suspect.
     * Anyway, after this time, we'll look WIDs up:
     */
    FirstTimeRound = 0;
}
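/* NewEntry() and UpdateEntry() below both build a t_pblock, a structure
 * with a t_WordPlace array "tacked onto the end" and sized at allocation
 * time.  A minimal sketch of the idiom follows; it is illustrative only,
 * and mentions just the t_pblock fields this file itself uses
 * (ChainStart, NumberOfWordPlaces, WID, WordPlaces):
 */
#ifdef PBLOCK_EXAMPLE
PRIVATE t_pblock *
MakeExamplePblock(nPlaces)
    int nPlaces;
{
    /* One allocation covers the fixed header fields plus room for
     * nPlaces trailing t_WordPlace elements; the WordPlaces member
     * is then indexed past its declared size, as in NewEntry():
     */
    t_pblock *p = (t_pblock *) emalloc(
	"example pblock",
	sizeof(t_pblock) + nPlaces * sizeof(t_WordPlace)
    );

    p->ChainStart = 0L;		/* not on disk yet */
    p->NumberOfWordPlaces = 0;	/* filled in by the caller */
    return p;
}
#endif /* PBLOCK_EXAMPLE */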
PRIVATE void
NewEntry(db, HashEl)
    t_LQTEXT_Database *db;
    t_HashEl *HashEl;
{
    t_pblock *pblock = 0;
    register int i, k;
    t_WordInfo *WordInfo;

    /* TODO: add MightNeedToSort check */

    /** make a WIDIndex entry and mark it as invalid (NOTDONE) */

    /* In order to do this, we must make a "pblock", a structure that
     * reflects the physical database.  This is fairly low-level stuff
     * for efficiency's sake...
     */

    /* allocate a pblock structure.  These are rather devious things, a
     * structure with an array tacked onto the end.
     */
    pblock = (t_pblock *) emalloc(
	"wordtable::NewEntry.pblock",
	sizeof(t_pblock) + HashEl->PlacesUsed * sizeof(t_WordPlace)
    );
    pblock->ChainStart = 0L; /* address on disk -- not there yet, so 0! */
    pblock->NumberOfWordPlaces = HashEl->PlacesUsed;

    /* fill in the WordPlaces */
    for (i = k = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[k] = HashEl->Places[i]; /* struct copy */
	++k;
    }

    /* if k is 0, they are all deleted, but that's unacceptable.
     * help!
     */
    if (k == 0) {
#ifdef ASCIITRACE
	Error(E_WARN|E_INTERNAL, "no places to add for %s", HashEl->Word);
#endif
	efree((char *) pblock);
	return;
    }

    if (HashEl->WID == 0) {
	HashEl->WID = LQT_WriteWordAndWID(
	    db, HashEl->Word, HashEl->WordLength,
	    LQTp_GetMaxOrAllocateWID(db, 0)
	);
    }

    /* Now fill in enough of WordInfo to let us use the low-level routines: */
    WordInfo = LQT_MakeWordInfo(
	db, HashEl->WID, HashEl->WordLength, (unsigned char *) HashEl->Word
    );

    pblock->WID = HashEl->WID;
    WordInfo->Offset = 0L;
    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock(db)) {
	(void) LQT_MakeWordInfoBlock(db, WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere != pblock->NumberOfWordPlaces) {
	/* In this case, it didn't all fit into the WID index block: */
	(void) LQT_Writepblock(db, WordInfo, pblock);
    }

    if (LQT_PutWordInfoIntoIndex(db, WordInfo, pblock->ChainStart) < 0) {
	Error(E_SYS|E_FATAL,
	    "NewEntry: Couldn't insert \"%s\" in database at 0x%lx",
	    WordInfo->Word, pblock->ChainStart
	);
    }

    /** reclaim storage */
    if (pblock) {
	(void) efree((char *) pblock);
    }
    LQT_DestroyWordInfo(db, WordInfo);
}
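/* UpdateEntry() below appends the cached places for a word to the
 * places already stored for it.  Both lists are individually sorted,
 * so a full sort is needed only when the two ranges can interleave,
 * and comparing the last old FID with the first new FID is enough to
 * detect that.  A standalone restatement of the test (illustrative
 * only; MergedListNeedsSort is not a library function):
 */
#ifdef MERGE_CHECK_EXAMPLE
PRIVATE int
MergedListNeedsSort(OldPlaces, nOld, NewPlaces, nNew)
    t_WordPlace *OldPlaces;	/* sorted places already on disk */
    int nOld;
    t_WordPlace *NewPlaces;	/* sorted places from the cache */
    int nNew;
{
    if (nOld == 0 || nNew == 0) {
	return 0; /* one list is empty: concatenation stays sorted */
    }
    /* appending preserves order only if every old FID precedes
     * every new FID:
     */
    return OldPlaces[nOld - 1].FID >= NewPlaces[0].FID;
}
#endif /* MERGE_CHECK_EXAMPLE */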
PRIVATE void
UpdateEntry(db, HashEl)
    t_LQTEXT_Database *db;
    t_HashEl *HashEl;
{
    register int i;
    t_pblock *pblock;
    t_WordInfo *WordInfo;
    int MightNeedToSort = 0;

#ifdef ASCIITRACE
    LQT_Trace(LQTRACE_WORDINFO,
	"UpdateEntry(%s/WID %ld, wordlen %d)",
	HashEl->Word, HashEl->WID, HashEl->WordLength
    );
#endif

    /** get the old entry */
    if (!HashEl->WID || !(WordInfo = LQT_WIDToWordInfo(db, HashEl->WID))) {
	Error(E_BUG, "Word %s WID %ld went away!", HashEl->Word, HashEl->WID);
	NewEntry(db, HashEl);
	return;
    }

#ifdef ASCIITRACE
    if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) {
	LQT_fprintWordInfo(db, stderr, WordInfo, "UpdateEntry");
    }
#endif

    if (WordInfo->Offset) {
	(void) LQT_AddWordPlaces(
	    db,
	    HashEl->Places,
	    HashEl->WID,
	    WordInfo->Offset,
	    (unsigned long) HashEl->PlacesUsed
	);
	(void) LQT_DestroyWordInfo(db, WordInfo);
	return;
    }

    if (WordInfo->WordPlacesInHere == WordInfo->NumberOfWordPlaces) {
	pblock = (t_pblock *) 0;
    } else {
	pblock = LQT_Getpblock(db, WordInfo);
    }

    if (pblock) {
	pblock = (t_pblock *) erealloc((char *) pblock,
	    sizeof(t_pblock) +
	    (pblock->NumberOfWordPlaces + HashEl->PlacesUsed) *
		sizeof(t_WordPlace));
    } else {
	pblock = (t_pblock *) emalloc(
	    "wordtable::UpdateEntry.pblock",
	    sizeof(t_pblock) +
	    (WordInfo->WordPlacesInHere + HashEl->PlacesUsed) *
		sizeof(t_WordPlace)
	);
	pblock->NumberOfWordPlaces = 0;

	if (WordInfo->WordPlacesInHere < WordInfo->NumberOfWordPlaces) {
	    if (WordInfo->WordPlaceStart) {
		WordInfo->WordPlaces = LQT_GetWordPlaces(
		    db,
		    WordInfo->WID,
		    WordInfo->WordPlaceStart,
		    WIDBLOCKSIZE -
			(WordInfo->WordPlaceStart - WordInfo->DataBlock),
		    0L,
		    &WordInfo->NumberOfWordPlaces
		);
	    }
	}

	/* Assert: the wordplaces in WordInfo are sorted */
	for (i = 0; i < WordInfo->NumberOfWordPlaces; i++) {
	    pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
		WordInfo->WordPlaces[i]; /* structure copy */
	}
    }

    /* delete the old entry from disk */
    if (WordInfo->Offset) {
	/* Remove the old information from disk.
	 * This isn't as bad as it sounds, as it will be at the start
	 * of the freelist, so when we write it out again it will be
	 * in the buffer cache...  But it would still be faster to append.
	 */
	LQT_DeleteWordPlaces(db, WordInfo->Offset, WordInfo->WID);
    }

    pblock->WID = HashEl->WID;
    WordInfo->Offset = pblock->ChainStart = 0L; /* it's invalid now... */

    /* Merge the WordPlaces */
    /* Assert: we need only compare the last old entry and the
     * first new one to see if we might need a sort.  Note that
     * there must _be_ entries in pblock, as otherwise we'd have called
     * NewEntry() and not UpdateEntry().
     */
    if (pblock->WordPlaces[pblock->NumberOfWordPlaces - 1].FID >=
	HashEl->Places[0].FID) {
	MightNeedToSort = 1;
    }

    for (i = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
	    HashEl->Places[i]; /* copy the struct */
	/* TODO: call qcmp to check for sorting (actually only need to
	 * check the FIDs of the new entries)
	 */
    }

    if (MightNeedToSort) {
	LQT_SortWordPlaces(db, pblock->NumberOfWordPlaces, pblock->WordPlaces);
    }

    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock(db)) {
	(void) LQT_MakeWordInfoBlock(db, WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere == pblock->NumberOfWordPlaces) {
	/* In this case it all fits into the WID index */
	pblock->ChainStart = 0L;
    } else {
	(void) LQT_Writepblock(db, WordInfo, pblock);
    }

    if (LQT_PutWordInfoIntoIndex(db, WordInfo, pblock->ChainStart) < 0) {
	Error(E_FATAL|E_SYS,
	    "UpdateEntry: Couldn't update \"%s\" in database at 0x%lx",
	    WordInfo->Word, pblock->ChainStart
	);
    }

    /** reclaim storage */
    if (pblock) {
	(void) efree((char *) pblock);
    }
    (void) LQT_DestroyWordInfo(db, WordInfo);
}
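/* Typical use of this module, as a sketch: callers feed one t_WordInfo
 * per word occurrence to AddWord(), then flush with DumpCache() when
 * the batch is done.  Opening the t_LQTEXT_Database and parsing words
 * happen outside this file, so both are taken as given here.  This is
 * illustrative only: the flag choice and the assumption that only the
 * Word, Length and WordPlace members of t_WordInfo need to be filled
 * in are inferred from the code above, not from the library headers.
 */
#ifdef WORDTABLE_USAGE_EXAMPLE
void
IndexOneWord(db, Word, Length, Place)
    t_LQTEXT_Database *db;	/* an open, writable database */
    char *Word;			/* already case-folded as required */
    int Length;
    t_WordPlace *Place;		/* where the word occurred */
{
    t_WordInfo W;

    W.Word = Word;	/* assuming a plain char pointer member,
			 * as the strncpy() in AddWord() suggests */
    W.Length = Length;
    W.WordPlace = *Place;	/* struct copy */

    AddWord(db, &W);
}

void
FinishIndexing(db)
    t_LQTEXT_Database *db;
{
    /* without DUMP_FAST, so the full-dump path that empties and
     * frees every slot is taken (a guess from DumpCache() above):
     */
    DumpCache(db, DUMP_CACHE);
}
#endif /* WORDTABLE_USAGE_EXAMPLE */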