/* wordtable.c -- Copyright 1989, 1993, 1994, 1996 Liam R. E. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file ../COPYRIGHT for full details.
 */

/* Symbol Table Interface to text retrieval database.
 * Handles both the internal and external indexes.
 *
 * This originally used a linked list.  Converting to a hash table reduced
 * the time to index comp.os.vms from nearly an hour to one and a half
 * minutes...
 *
 * Liam Quin, 1989
 */

/*
 * $Id: wordtable2.c,v 1.1 1996/08/14 17:03:46 lee Exp $
 */

#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>

#ifdef HAVE_SYSV_FCNTL_H
# include <sys/fcntl.h>
#endif
#ifdef HAVE_FCNTL_H
# include <fcntl.h> /* for O_RDWR etc. */
#endif

#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif

#include "smalldb.h"
#include "fileinfo.h"
#include "wordinfo.h"
#include "wordplace.h"
#include "pblock.h"
#include "wordrules.h"
#include "emalloc.h"
#include "addfile.h"
#include "lqutil.h"
#include "liblqtext.h"
#include "lqtrace.h"

typedef struct s_HashEl {
    t_WID WID;
    int PlacesUsed;
    t_WordPlace *Places;
    unsigned char WordLength;
    char Word[1];	/* the word itself is allocated on the end
			 * of the struct, hence the extra bytes in
			 * the emalloc() call in AddWord()
			 */
} t_HashEl;

struct s_TrieNode;
typedef struct s_TrieNode *t_TrieChildren[256];

typedef struct s_TrieNode {
    t_HashEl *This;
    t_TrieChildren *Children;
    unsigned long ChildCount;
} t_TrieNode;

PRIVATE t_HashEl **
FindInTrie(Word, Length, Parentp)
    char *Word;
    int Length;
    t_TrieNode **Parentp;
{
    if (Length <= 0) {
	Error(E_BUG|E_WARN,
	    "%s: %d: got to the end of the trie!",
	    __FILE__, __LINE__
	);
	return (t_HashEl **) 0;
    } else {
	t_TrieNode **Next;

	if (!*Parentp) {
	    *Parentp = (t_TrieNode *) emalloc(
		"Parent Trie Node", sizeof(t_TrieNode)
	    );
	    (*Parentp)->This = (t_HashEl *) 0;
	    (*Parentp)->Children = (t_TrieChildren *) 0;
	    (*Parentp)->ChildCount = (unsigned long) 0;
	}

	if (!(*Parentp)->Children) {
	    (*Parentp)->Children = (t_TrieChildren *) ecalloc(
		"Trie Children", 1, sizeof(t_TrieChildren)
	    );
	}

	Next = &(*(*Parentp)->Children)[(unsigned char) *Word];

	if (!*Next) {
	    *Next = (t_TrieNode *) emalloc("Trie Node", sizeof(t_TrieNode));
	    (*Next)->This = (t_HashEl *) 0;
	    (*Next)->Children = (t_TrieChildren *) 0;
	    (*Next)->ChildCount = (unsigned long) 0;
	    /* The parent now has a new child: */
	    (*Parentp)->ChildCount++;
	}

	if (Length == 1) {
	    return &(*Next)->This;
	} else {
	    return FindInTrie(&Word[1], Length - 1, Next);
	}
    }
}

static t_TrieNode *HashTable = 0;

API void
LQT_AddOccurrenceToIndex(db, WordInfo)
    t_LQTEXT_Database *db;
    t_WordInfo *WordInfo;
{
    t_HashEl **Hpp;

    Hpp = FindInTrie(WordInfo->Word, WordInfo->Length, &HashTable);
    /* TODO: add a place */
}

/* dumping the cache */

/* We have to walk the trie.
 * [A] Good candidates for dumping and deleting from the trie are:
 *  (1) words that have not been updated since we last dumped the cache;
 *  (2) words that have not been updated recently and that occur infrequently
 *
 * [B] Good candidates for dumping but retaining the child node are:
 *  (1) words that occur many, many times (e.g. and, the... in English)
 *	We should retain the occurrences for the current file, if there
 *	are not too many of them, so we write them all out in one lump and
 *	avoid the overhead of a repeated 3-byte header...
 *
 * [C] Poor candidates for dumping:
 *  (1) words that occurred recently -- chances are that we'll see more
 *	occurrences of them soon.  If there are lots of them, though, they
 *	fall into [B](1) above, and should be dumped.
 *
 * As we go, we trim the trie, deleting empty leaf nodes.
 */
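/* A minimal sketch of the depth-first trim described above.  This is
 * illustrative only: TrimTrie() is a hypothetical helper, not part of
 * the lqtext library, and it assumes a node is "empty" when it holds
 * no t_HashEl and no live children.  Children are visited before the
 * parent, so a subtree emptied by the recursion can itself be freed;
 * a caller might invoke it as TrimTrie(&HashTable) after dumping the
 * payloads.
 */
#ifdef TRIE_TRIM_EXAMPLE
PRIVATE int
TrimTrie(Nodep)
    t_TrieNode **Nodep;	/* in/out: set to 0 if the node is freed */
{
    t_TrieNode *Node = *Nodep;
    unsigned long Live = 0;
    int i;

    if (!Node) {
	return 1; /* an absent node counts as empty */
    }

    if (Node->Children) {
	/* children first, so empty leaves go before their parents: */
	for (i = 0; i < 256; i++) {
	    if (!TrimTrie(&(*Node->Children)[i])) {
		Live++;
	    }
	}
	if (Live == 0) {
	    efree((char *) Node->Children);
	    Node->Children = (t_TrieChildren *) 0;
	}
	Node->ChildCount = Live;
    }

    if (Live == 0 && Node->This == (t_HashEl *) 0) {
	efree((char *) Node);
	*Nodep = (t_TrieNode *) 0;
	return 1; /* freed: tell the caller this slot is now empty */
    }
    return 0;
}
#endif /* TRIE_TRIM_EXAMPLE */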
/* This trie-based version of DumpCache is not yet implemented; guard it
 * out so that it doesn't collide with the hash-table version later in
 * this file, which is the one that is actually compiled:
 */
#ifdef USE_TRIE_CACHE
LIBRARY void
DumpCache(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    /* depth-first traversal of the tree, so that we always dump
     * children of a node before the node itself.  That way, when we
     * leave a node, we check if it's empty and, if so, delete it
     */
}
#endif /* USE_TRIE_CACHE */

#ifndef HASHSIZ
# define HASHSIZ 32768 /* MUST be a power of two */
#endif /*!HASHSIZ*/

#ifndef MAXWORDSINCACHE
# define MAXWORDSINCACHE (HASHSIZ * 10)
#endif

int MaxWordsInCache = MAXWORDSINCACHE;

/** System calls and library functions used in this file: **/

/** Lqtext calls */

/** calls within this file **/
PRIVATE void NewEntry(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    t_HashEl *HashEl
#endif
);

PRIVATE void UpdateEntry(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    t_HashEl *HashEl
#endif
);

PRIVATE void FasterDump( /* not defined in this file; the signature is
			  * assumed from the calls in DumpCache() below */
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    int CallFree,
    int MaxAllowed
#endif
);

void DumpCache( /* see ../h/addfile.h */
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    int CallFree
#endif
);

/**/

extern char *progname;

PRIVATE int HashSize = HASHSIZ; /* MUST be a power of two */

API int
SetHashSize(db, theNewSize)
    t_LQTEXT_Database *db;
    int theNewSize;
{
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"Hash table size changed from %ld to %ld",
	(long) HashSize, (long) theNewSize
    );
    return HashSize = theNewSize;
}

#define NPLACES 2
#define NPLACESBIGINCR 8
#define NPLACESHUGEINCR 64 /* SPARC: most efficient if this is a power of 2 */
/* This is small to optimise the common case -- by far the majority of
 * words are used fewer than 10 times.  In the cases where we've gone
 * wrong, well, there'll be a few thousand.  We add slowly until we
 * get to NPLACES * 3, and then we go up in NPLACESBIGINCR lumps.
 */

PRIVATE t_HashEl **SymbolTable;
PRIVATE int LastEl = 0;
PRIVATE int WordsInCache = 0;
PRIVATE int DumpThresh = DUMP_FAST_THRESH;
PRIVATE int SlotsUsed = 0;

void
SetDumpThresh(db, Thresh)
    t_LQTEXT_Database *db;
    int Thresh;
{
    /* Set the threshold for fast dumping.
     * If a word has fewer than this many occurrences in the cache, it
     * gets written out.  -1 disables this feature, and 0 uses the default.
     */
    DumpThresh = Thresh;

    if (!DumpThresh) {
	DumpThresh = DUMP_FAST_THRESH;
    }
}

PRIVATE void
InitHash(db)
    t_LQTEXT_Database *db;
{
    long tmp = 1;

    /* round HashSize up to the next power of two: */
    while (tmp < HashSize) {
	tmp <<= 1;
    }
    HashSize = tmp;

    if (HashSize < 1) {
	Error(E_FATAL,
	    "InitHash: hash size (%d/%d) is too small!\n",
	    HashSize, MaxWordsInCache
	);
    }

    SymbolTable = (t_HashEl **) ecalloc(
	"AddWord symbol table", HashSize, sizeof(t_HashEl *)
    );

    LastEl = HashSize; /* Used as a sentinel */

#ifdef ASCIITRACE
    if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	    "allocated %ld hash slots for up to %ld words",
	    (long) HashSize, (long) MaxWordsInCache
	);
    }
#endif
}

#ifndef Hash
PRIVATE INLINE int
Hash(Word, Length)
    register char *Word;
    register int Length;
{
    register unsigned long n = 0;

#ifndef NODUFF /* clever stuff for speedup... dmr-approved!... */
#define HASHC n = *Word++ + 65599 * n

    if (Length > 0) {
	register int loop = (Length + 8 - 1) >> 3;

	switch (Length & (8 - 1)) {
	case 0:	do {	HASHC;
	case 7:		HASHC;
	case 6:		HASHC;
	case 5:		HASHC;
	case 4:		HASHC;
	case 3:		HASHC;
	case 2:		HASHC;
	case 1:		HASHC;
		} while (--loop);
	}
    }
#else /* NODUFF */
    while (Length--) {
	n = *Word++ + 65599 * n;
    }
#endif /* NODUFF */

    /* Since HashSize is a power of two, these are equivalent,
     * but on systems where integer division and modulus are done
     * by the FPU, one may be a lot faster than the other.
     */
    return n & (HashSize - 1);
    /** return n % HashSize; **/
}
#endif /* Hash */
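/* Hash() above relies on the identity n % 2^k == n & (2^k - 1) for
 * unsigned n, which holds for any power-of-two table size.  The helper
 * below is a standalone illustration of that claim, not part of the
 * library (hence the guard); it returns nonzero if mask and modulus
 * agree for a run of sample values.
 */
#ifdef HASH_MASK_EXAMPLE
PRIVATE int
CheckMaskIdentity()
{
    unsigned long Size = HASHSIZ;	/* a power of two, like HashSize */
    unsigned long n;

    for (n = 0; n < 100000; n += 7) {
	if ((n & (Size - 1)) != (n % Size)) {
	    return 0;	/* cannot happen while Size is a power of two */
	}
    }
    return 1;
}
#endif /* HASH_MASK_EXAMPLE */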
PRIVATE char FirstTimeRound = 1;
PRIVATE t_FID CurrentFID = 0;

static int LargestNumberOfPlacesInOneSlot = 1;

/*extern*/ void
AddWord(db, WordInfo)
    t_LQTEXT_Database *db;
    t_WordInfo *WordInfo;
{
    register t_HashEl *HashEl = (t_HashEl *) 0;
    int Slot, FirstSlot;
    static t_FID LastFID = 0;
    int ProbeCount = 0;
    register t_HashEl **Slotp;

    if (!WordInfo || !WordInfo->Word || !WordInfo->Word[0]) {
	Error(E_WARN, "Null word in AddWord(0x%x)", WordInfo);
	return;
    }

#ifdef ASCIITRACE
    LQT_Trace(LQTRACE_WORDINFO,
	" <%s/%d, b %d, w %lu f %d s %d> ",
	WordInfo->Word, WordInfo->Length,
	WordInfo->WordPlace.BlockInFile,
	WordInfo->WordPlace.WordInBlock,
	WordInfo->WordPlace.Flags,
	WordInfo->WordPlace.StuffBefore
    );
#endif

    if (!LastEl) {
	InitHash(db);

	if (FirstTimeRound) {
	    /* Special check to save looking up the WIDs first time round */
	    t_WID W = LQT_GetMaxWID(db);

	    if (W == 0L) {
		LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
		    "first ever run, allocating WIDs."
		);
		FirstTimeRound = 1; /* actually it's already 1 here */
	    } else {
		FirstTimeRound = 0;
	    }
	}
    } else {
	FirstTimeRound = 0;

	if (MUST_DUMP(WordsInCache, MaxWordsInCache, SlotsUsed, HashSize)) {
	    DumpCache(db, DUMP_CACHE|DUMP_FAST);
	}

	if (WordInfo->WordPlace.FID != LastFID) {
	    if (WordInfo->WordPlace.FID == 0L) {
		Error(E_BUG, "AddWord: FID 0 for \"%s\"", WordInfo->Word);
	    } else if (LastFID != 0) {
		/* quick opportunity to get rid of words that
		 * were only in the previous file -- this takes advantage
		 * of the fact that vocabulary is document-specific,
		 * i.e. you tend to get words used in one file and not
		 * another.
		 */
		CurrentFID = WordInfo->WordPlace.FID;

		if (MUST_FILE_DUMP(
		    WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
		)) {
		    DumpCache(db, DUMP_CACHE|DUMP_FAST);
		}
	    }
	    LastFID = WordInfo->WordPlace.FID;
	}
    }

    FirstSlot = Slot = Hash(WordInfo->Word, WordInfo->Length);
    Slotp = &SymbolTable[Slot];

    for (ProbeCount = 0; ProbeCount < MAXPROBES; ProbeCount++) {
	if (*Slotp == (t_HashEl *) NULL) {
	    /* make a new element */
	    HashEl = *Slotp = (t_HashEl *) emalloc(
		"wordtable::AddWord.HashEl",
		sizeof(t_HashEl) + WordInfo->Length
	    );
	    HashEl->PlacesUsed = 0;
	    HashEl->Places = (t_WordPlace *) emalloc(
		"SetElEmpty.Places", sizeof(t_WordPlace) * NPLACES
	    );
	    (void) strncpy(&HashEl->Word[0], WordInfo->Word, WordInfo->Length);
	    HashEl->Word[WordInfo->Length] = '\0';
	    HashEl->WordLength = (unsigned char) WordInfo->Length;

	    if (FirstTimeRound) {
		/* No point looking for the WID, it won't be there;
		 * code in DumpCache() will assign a new one.
		 * LQT_WordToWID() will always return zero on a new index.
		 */
		HashEl->WID = 0;
	    } else {
		HashEl->WID = LQT_WordToWID(db, HashEl->Word, WordInfo->Length);
	    }
	    ++SlotsUsed;
	    break;
	} else if (
	    /* first compare the first letter (26 possibilities, not all
	     * equiprobable, but more than 10 likely ones):
	     */
	    ((*Slotp)->Word[0] == WordInfo->Word[0]) &&
	    /* next compare the lengths, (maxwordlen - minwordlen) possible
	     * values, so more likely to be the same:
	     */
	    ((*Slotp)->WordLength == WordInfo->Length) &&
	    /* well, if the 1st letter and the length are the same,
	     * compare the whole word:
	     */
	    !strcmp((*Slotp)->Word, WordInfo->Word)
	) {
	    HashEl = (*Slotp);

	    if (Slot != FirstSlot) {
		if (
		    HashEl->PlacesUsed >
		    SymbolTable[FirstSlot]->PlacesUsed + 2
		) {
		    /* Swap so we find it first time next time round.
		     * This is inefficient if the words occur approximately
		     * alternately with equal frequency.
		     * The +2 requirement stops this happening every time
		     * round even in the worst case.
		     */
		    t_HashEl *tmp = SymbolTable[FirstSlot];

		    HashEl = SymbolTable[FirstSlot] = (*Slotp);
		    *Slotp = tmp;
		    Slotp = &SymbolTable[Slot = FirstSlot];
		}
	    }
	    break;
	}

	if ((Slot += 17) >= HashSize) {
	    Slot &= (HashSize - 1);
	}
	/** Note: HashSize is a power of 2, so we can step through it
	 ** in any prime number, but doing so in little chunks is best
	 ** because we don't hit so many pages, but at the same time
	 ** step over clusters.  I hope.
	 **/

	/** If the hash table is full, no point looking further: **/
	if (ProbeCount + 1 >= MAXPROBES) {
#ifdef ASCIITRACE
	    LQT_Trace(LQTRACE_DEBUG,
		"(%d probes didn't find a slot for %s)",
		ProbeCount, WordInfo->Word
	    );
#endif
	    CurrentFID = WordInfo->WordPlace.FID;
	    DumpCache(db, DUMP_CACHE|DUMP_FAST);
	    Slot = FirstSlot;
	    ProbeCount = 0;
	}
	Slotp = &SymbolTable[Slot];
    }

    /* check to see if there is already an occurrence of this word
     * for this file:
     */

    /* If we get here, all we need to do is add the WordPlace */
    {
	long newValue = 0;

	switch (HashEl->PlacesUsed) {
	case NPLACES:		  newValue = NPLACES * 2;	  break;
	case NPLACES * 2:	  newValue = NPLACES * 3;	  break;
	case NPLACES * 3:	  newValue = NPLACESBIGINCR;	  break;
	case NPLACESBIGINCR:	  newValue = NPLACESBIGINCR * 2;  break;
	case NPLACESBIGINCR * 2:  newValue = NPLACESBIGINCR * 3;  break;
	case NPLACESBIGINCR * 3:  newValue = NPLACESHUGEINCR;	  break;
	case NPLACESHUGEINCR:	  newValue = NPLACESHUGEINCR * 2; break;
	case NPLACESHUGEINCR * 2: newValue = NPLACESHUGEINCR * 3; break;
	default:
	    if (
		HashEl->PlacesUsed > NPLACESHUGEINCR &&
		((HashEl->PlacesUsed % NPLACESHUGEINCR) == 0)
	    ) {
		newValue = HashEl->PlacesUsed + NPLACESHUGEINCR;
	    }
	}

	if (newValue) {
	    HashEl->Places = (t_WordPlace *) erealloc(
		(char *) HashEl->Places,
		sizeof(t_WordPlace) * newValue
	    );
	}
    }

    HashEl->Places[HashEl->PlacesUsed++] = WordInfo->WordPlace;

    if (HashEl->PlacesUsed > LargestNumberOfPlacesInOneSlot) {
	LargestNumberOfPlacesInOneSlot = HashEl->PlacesUsed;
    }

    WordsInCache++;

#ifdef ASCIITRACE
    if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) {
	LQT_Trace(LQTRACE_WORDINFO,
	    "Slot %d Word %s len %d places %d",
	    Slot, SymbolTable[Slot]->Word,
	    WordInfo->Length, SymbolTable[Slot]->PlacesUsed
	);
    }
#endif
    return;
}
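/* The switch at the end of AddWord above implements a growth schedule
 * for a Places array: 2, 4, 6, 8, 16, 24, 64, 128, 192, then +64 each
 * time after that.  The function below restates that schedule as a
 * standalone helper; it is illustrative only (not used by the library)
 * and assumes it is called only when n is exactly at a schedule point.
 */
#ifdef PLACES_GROWTH_EXAMPLE
PRIVATE long
NextPlacesSize(n)
    int n;	/* current capacity, exactly at a schedule boundary */
{
    if (n < NPLACES * 3)
	return n + NPLACES;		/* 2 -> 4 -> 6 */
    if (n == NPLACES * 3)
	return NPLACESBIGINCR;		/* 6 -> 8 */
    if (n < NPLACESBIGINCR * 3)
	return n + NPLACESBIGINCR;	/* 8 -> 16 -> 24 */
    if (n == NPLACESBIGINCR * 3)
	return NPLACESHUGEINCR;		/* 24 -> 64 */
    if (n % NPLACESHUGEINCR == 0)
	return n + NPLACESHUGEINCR;	/* 64 -> 128 -> 192 -> ... */
    return 0;	/* not at a boundary: no growth needed yet */
}
#endif /* PLACES_GROWTH_EXAMPLE */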
PRIVATE void
DumpOneEntry(db, theHashEl, WordLength)
    t_LQTEXT_Database *db;
    t_HashEl *theHashEl;
    int WordLength; /* avoids extra calls to strlen() */
{
    /* We are going to make a new index entry for the word.
     * There are two cases -- depending on whether the word
     * is already indexed or not.
     * In the former case we must merge the new information.
     * In the latter case we don't have to read the old info,
     * but we must make a new entry in the WID Index.
     */

    WordsInCache -= theHashEl->PlacesUsed;

    if (theHashEl->PlacesUsed >= LargestNumberOfPlacesInOneSlot) {
	/* well, it's certainly wrong now, so guess: */
	LargestNumberOfPlacesInOneSlot = 1;
    }

    if (theHashEl->WID == 0) {
	NewEntry(db, theHashEl);
    } else {
	UpdateEntry(db, theHashEl);
    }

    /* Reclaim storage */
    /* Note: no point testing CallFree, as it's always set if
     * we're in here.  There's no point using the partial cache
     * dump routine if you're not going to free the slots, as then
     * you get left with a corrupt symbol table!  It's only useful
     * for the final total cache dump.
     */
    efree((char *) theHashEl->Places);
    efree((char *) theHashEl);
}
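/* AddWord above resolves collisions by open addressing with a fixed
 * stride of 17.  Because HashSize is a power of two and 17 is odd, the
 * stride is coprime with the table size, so the probe sequence visits
 * every slot exactly once before repeating.  The function below is a
 * standalone check of that claim (illustrative only; ProbeCycleLength
 * is not a library function):
 */
#ifdef PROBE_EXAMPLE
PRIVATE int
ProbeCycleLength(Size)
    int Size;	/* a power of two, as HashSize is */
{
    int Slot = 0;
    int Steps = 0;

    do {
	/* the same stepping code AddWord uses: */
	if ((Slot += 17) >= Size) {
	    Slot &= (Size - 1);
	}
	Steps++;
    } while (Slot != 0);

    return Steps;	/* equals Size whenever Size is a power of two */
}
#endif /* PROBE_EXAMPLE */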
PRIVATE void
FastDump(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    register int i;
    register t_HashEl *HashEl;
    int Progress = 0;

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"fast dump nocc < %d, from %d/%d words, %d/%d slots",
	DumpThresh,
	WordsInCache, MaxWordsInCache,
	SlotsUsed, HashSize
    );

    /** We go through the hash table an element at a time.
     ** We clear all slots that contain no more than DumpThresh
     ** entries.
     ** Other slots are tested to see if they are in the right
     ** place or not.  If not, we see if we can put them in the
     ** right slot (because it's empty or because we're going to
     ** dump that slot, so we can make it empty).
     ** If the entry is in the wrong place and we can't easily
     ** correct that, we write it out.
     **/
    for (i = 0; i != LastEl; i++) {
	int ForceDump;

	if (!SymbolTable[i]) continue;

	HashEl = SymbolTable[i];
	ForceDump = 0;

	if (HashEl->PlacesUsed > DumpThresh) {
	    /* We're not going to dump this entry.
	     * If there was a collision (two or more entries have the same
	     * hash code), we mustn't simply delete the entry at the hashed
	     * slot, because then we would no longer find this one.
	     */
	    int Slot = Hash(HashEl->Word, (int) HashEl->WordLength);

	    if (Slot != i) {
		/* It's in the wrong place...
		 * See if we can put it back:
		 */
		if (SymbolTable[Slot] == 0) {
		    /* In this case we can simply put it in the right place */
		    SymbolTable[Slot] = HashEl;
		    SymbolTable[i] = 0;
		} else if (SymbolTable[Slot]->PlacesUsed <= DumpThresh) {
		    /* In this case the entry at the other location is
		     * going to be dumped, so we can swap them even though
		     * that puts the other entry temporarily in the wrong
		     * place.  We're about to delete it anyway.
		     */
		    register t_HashEl *tmp = SymbolTable[Slot];

		    SymbolTable[Slot] = HashEl;
		    HashEl = SymbolTable[i] = tmp;
		} else {
		    /* In this case, it's in the wrong place, and the
		     * entry in the other slot isn't going to be deleted.
		     * But it might be moved.  This (we hope) is rare
		     * enough that we handle it by giving up and
		     * dumping it anyway:
		     */
		    ForceDump = 1;
		}

		if (!SymbolTable[i]) {
		    continue; /* dealt with slot i. */
		}
	    }
	}

	if (HashEl->PlacesUsed <= DumpThresh || ForceDump) {
	    DumpOneEntry(db, HashEl, HashEl->WordLength);
	    SymbolTable[i] = 0; /* freed by DumpOneEntry */
	    --SlotsUsed;
	}

	if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	    if (i >= Progress * (HashSize / 10)) {
		LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
		    "progress: %d%%", Progress * 10
		);
		++Progress;
	    }
	}
    }

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"cache now %d/%d, slots %d/%d",
	WordsInCache, MaxWordsInCache,
	SlotsUsed, HashSize
    );
    FirstTimeRound = 0;
}
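/* The relocation policy in FastDump above has three cases for a kept
 * entry found away from its home slot.  The helper below restates that
 * decision as a pure function over the home slot's occupant; it is a
 * sketch for illustration only, not part of the library:
 */
#ifdef RELOCATE_EXAMPLE
typedef enum {
    RELOC_MOVE,	/* home slot empty: move the entry home */
    RELOC_SWAP,	/* home slot holds a dumpable entry: swap them */
    RELOC_DUMP	/* home slot busy with a keeper: give up and dump */
} t_RelocAction;

PRIVATE t_RelocAction
ClassifyHomeSlot(Home, Thresh)
    t_HashEl *Home;	/* current occupant of the entry's home slot */
    int Thresh;		/* the dump threshold */
{
    if (!Home)
	return RELOC_MOVE;
    if (Home->PlacesUsed <= Thresh)
	return RELOC_SWAP;
    return RELOC_DUMP;
}
#endif /* RELOCATE_EXAMPLE */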
LIBRARY void
DumpCache(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"Cache write from %d/%d words, %d/%d slots",
	WordsInCache, MaxWordsInCache,
	SlotsUsed, HashSize
    );

    if (DumpThresh == -1) {
	CallFree |= DUMP_SYNC;
    } else if ((CallFree & DUMP_SYNC) == 0 || (CallFree & DUMP_FAST) != 0) {
	int SaveThresh = DumpThresh;
	int Multiplier = 2;
	int MaxAllowed = LargestNumberOfPlacesInOneSlot / 4;
	int State = 1; /* ensure that we always do something useful */

#ifdef USE_DUMPOLDEST /* not ready yet */
	DumpOldest(db, CallFree);
#endif
	FasterDump(db, CallFree, MaxAllowed);

	/* now try and get below the dump threshold: */
	while (MUST_DUMP_MORE(
	    WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
	)) {
	    switch (State) {
	    case 0:
		MaxAllowed /= 4; /* should really use sqrt() */
		FasterDump(db, CallFree, MaxAllowed);
		State = 1;
		break;
	    case 1:
		if (MUST_FREE_MORE_SLOTS(
		    WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
		)) {
		    FastDump(db, CallFree);
		    DumpThresh *= Multiplier;
		    ++Multiplier;
		} else {
		    MaxAllowed /= 4; /* should really use sqrt() */
		    FasterDump(db, CallFree, MaxAllowed);
		}
		State = 2;
		break;
	    case 2:
		MaxAllowed /= 4; /* should really use sqrt() */
		FasterDump(db, CallFree, MaxAllowed);
		State = 0;
		break;
	    }
	}
	DumpThresh = SaveThresh;
	LargestNumberOfPlacesInOneSlot = 1;
	(void) LQTpFlushWIDCache(db);
	return;
    }

    /* Dump the larger entries first, in the hopes that they will get
     * contiguous blocks appended to their existing entries:
     */
    FasterDump(db, CallFree, (int) (LargestNumberOfPlacesInOneSlot / 4));
    LargestNumberOfPlacesInOneSlot = 0;

    /* Now dump all remaining entries: */
    FasterDump(db, CallFree, 0);

#ifdef ASCIITRACE
    if (WordsInCache || SlotsUsed) {
	Error(E_BUG,
	    "Cache, %d words left, %d slots [%s:%d]",
	    WordsInCache, SlotsUsed, __FILE__, __LINE__
	);
    }
#endif
    WordsInCache = 0;

    /* allow other processes a chance to do some reading: */
    /* TODO: use LQT_Sync */
    (void) LQT_WriteCurrentMaxWID(db);
    (void) LQT_SyncAndCloseAllKeyValueDatabases(db);
    (void) LQTpFlushWIDCache(db);
    (void) LQTp_FlushLastBlockCache(db);
    (void) LQT_FlushBlockCache(db);

    /* Closing the database loses us our write access...
     * so ask for it back again:
     */
    LQT_ObtainWriteAccess(db);

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "Cache now empty");

    /* The first time round we don't bother looking up words to see
     * if they already have a WID, since if the database is new, no
     * WIDs have been assigned yet.  This is a big win if you add a lot
     * of files in the first go, and a lose otherwise, I suspect.
     * Anyway, after this time, we'll look WIDs up:
     */
    FirstTimeRound = 0;
}
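/* NewEntry() and UpdateEntry() below both build a t_pblock, a structure
 * with a t_WordPlace array "tacked onto the end" and sized at allocation
 * time.  A minimal sketch of the idiom follows; it is illustrative only,
 * and mentions just the t_pblock fields this file itself uses
 * (ChainStart, NumberOfWordPlaces, WID, WordPlaces):
 */
#ifdef PBLOCK_EXAMPLE
PRIVATE t_pblock *
MakeExamplePblock(nPlaces)
    int nPlaces;
{
    /* One allocation covers the fixed header fields plus room for
     * nPlaces trailing t_WordPlace elements; the WordPlaces member
     * is then indexed past its declared size, as in NewEntry():
     */
    t_pblock *p = (t_pblock *) emalloc(
	"example pblock",
	sizeof(t_pblock) + nPlaces * sizeof(t_WordPlace)
    );

    p->ChainStart = 0L;		/* not on disk yet */
    p->NumberOfWordPlaces = 0;	/* filled in by the caller */
    return p;
}
#endif /* PBLOCK_EXAMPLE */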
PRIVATE void
NewEntry(db, HashEl)
    t_LQTEXT_Database *db;
    t_HashEl *HashEl;
{
    t_pblock *pblock = 0;
    register int i, k;
    t_WordInfo *WordInfo;

    /* TODO: add MightNeedToSort check */

    /** make a WIDIndex entry and mark it as invalid (NOTDONE) */

    /* In order to do this, we must make a "pblock", a structure that
     * reflects the physical database.  This is fairly low-level stuff
     * for efficiency's sake...
     */

    /* allocate a pblock structure.  These are rather devious things, a
     * structure with an array tacked onto the end.
     */
    pblock = (t_pblock *) emalloc(
	"wordtable::NewEntry.pblock",
	sizeof(t_pblock) + HashEl->PlacesUsed * sizeof(t_WordPlace)
    );
    pblock->ChainStart = 0L; /* address on disk -- not there yet, so 0! */
    pblock->NumberOfWordPlaces = HashEl->PlacesUsed;

    /* fill in the WordPlaces */
    for (i = k = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[k] = HashEl->Places[i]; /* struct copy */
	++k;
    }

    /* if k is 0, they are all deleted, but that's unacceptable.
     * help!
     */
    if (k == 0) {
#ifdef ASCIITRACE
	Error(E_WARN|E_INTERNAL, "no places to add for %s", HashEl->Word);
#endif
	efree((char *) pblock);
	return;
    }

    if (HashEl->WID == 0) {
	HashEl->WID = LQT_WriteWordAndWID(
	    db, HashEl->Word, HashEl->WordLength,
	    LQTp_GetMaxOrAllocateWID(db, 0)
	);
    }

    /* Now fill in enough of WordInfo to let us use the low-level routines: */
    WordInfo = LQT_MakeWordInfo(
	db, HashEl->WID, HashEl->WordLength, (unsigned char *) HashEl->Word
    );

    pblock->WID = HashEl->WID;
    WordInfo->Offset = 0L;
    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock(db)) {
	(void) LQT_MakeWordInfoBlock(db, WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere != pblock->NumberOfWordPlaces) {
	/* In this case, it didn't all fit into the WID index block: */
	(void) LQT_Writepblock(db, WordInfo, pblock);
    }

    if (LQT_PutWordInfoIntoIndex(db, WordInfo, pblock->ChainStart) < 0) {
	Error(E_SYS|E_FATAL,
	    "NewEntry: Couldn't insert \"%s\" in database at 0x%lx",
	    WordInfo->Word, pblock->ChainStart
	);
    }

    /** reclaim storage */
    if (pblock) {
	(void) efree((char *) pblock);
    }
    LQT_DestroyWordInfo(db, WordInfo);
}
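/* UpdateEntry() below appends the cached places for a word to the
 * places already stored for it.  Both lists are individually sorted,
 * so a full sort is needed only when the two ranges can interleave,
 * and comparing the last old FID with the first new FID is enough to
 * detect that.  A standalone restatement of the test (illustrative
 * only; MergedListNeedsSort is not a library function):
 */
#ifdef MERGE_CHECK_EXAMPLE
PRIVATE int
MergedListNeedsSort(OldPlaces, nOld, NewPlaces, nNew)
    t_WordPlace *OldPlaces;	/* sorted places already on disk */
    int nOld;
    t_WordPlace *NewPlaces;	/* sorted places from the cache */
    int nNew;
{
    if (nOld == 0 || nNew == 0) {
	return 0; /* one list is empty: concatenation stays sorted */
    }
    /* appending preserves order only if every old FID precedes
     * every new FID:
     */
    return OldPlaces[nOld - 1].FID >= NewPlaces[0].FID;
}
#endif /* MERGE_CHECK_EXAMPLE */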
PRIVATE void
UpdateEntry(db, HashEl)
    t_LQTEXT_Database *db;
    t_HashEl *HashEl;
{
    register int i;
    t_pblock *pblock;
    t_WordInfo *WordInfo;
    int MightNeedToSort = 0;

#ifdef ASCIITRACE
    LQT_Trace(LQTRACE_WORDINFO,
	"UpdateEntry(%s/WID %ld, wordlen %d)",
	HashEl->Word, HashEl->WID, HashEl->WordLength
    );
#endif

    /** get the old entry */
    if (!HashEl->WID || !(WordInfo = LQT_WIDToWordInfo(db, HashEl->WID))) {
	Error(E_BUG, "Word %s WID %ld went away!", HashEl->Word, HashEl->WID);
	NewEntry(db, HashEl);
	return;
    }

#ifdef ASCIITRACE
    if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) {
	LQT_fprintWordInfo(db, stderr, WordInfo, "UpdateEntry");
    }
#endif

    if (WordInfo->Offset) {
	(void) LQT_AddWordPlaces(
	    db,
	    HashEl->Places,
	    HashEl->WID,
	    WordInfo->Offset,
	    (unsigned long) HashEl->PlacesUsed
	);
	(void) LQT_DestroyWordInfo(db, WordInfo);
	return;
    }

    if (WordInfo->WordPlacesInHere == WordInfo->NumberOfWordPlaces) {
	pblock = (t_pblock *) 0;
    } else {
	pblock = LQT_Getpblock(db, WordInfo);
    }

    if (pblock) {
	pblock = (t_pblock *) erealloc((char *) pblock,
	    sizeof(t_pblock) +
	    (pblock->NumberOfWordPlaces + HashEl->PlacesUsed) *
		sizeof(t_WordPlace));
    } else {
	pblock = (t_pblock *) emalloc(
	    "wordtable::UpdateEntry.pblock",
	    sizeof(t_pblock) +
	    (WordInfo->WordPlacesInHere + HashEl->PlacesUsed) *
		sizeof(t_WordPlace)
	);
	pblock->NumberOfWordPlaces = 0;

	if (WordInfo->WordPlacesInHere < WordInfo->NumberOfWordPlaces) {
	    if (WordInfo->WordPlaceStart) {
		WordInfo->WordPlaces = LQT_GetWordPlaces(
		    db,
		    WordInfo->WID,
		    WordInfo->WordPlaceStart,
		    WIDBLOCKSIZE -
			(WordInfo->WordPlaceStart - WordInfo->DataBlock),
		    0L,
		    &WordInfo->NumberOfWordPlaces
		);
	    }
	}

	/* Assert: the wordplaces in WordInfo are sorted */
	for (i = 0; i < WordInfo->NumberOfWordPlaces; i++) {
	    pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
		WordInfo->WordPlaces[i]; /* structure copy */
	}
    }

    /* delete the old entry from disk */
    if (WordInfo->Offset) {
	/* Remove the old information from disk.
	 * This isn't as bad as it sounds, as it will be at the start
	 * of the freelist, so when we write it out again it will be
	 * in the buffer cache...  But it would still be faster to append.
	 */
	LQT_DeleteWordPlaces(db, WordInfo->Offset, WordInfo->WID);
    }

    pblock->WID = HashEl->WID;
    WordInfo->Offset = pblock->ChainStart = 0L; /* it's invalid now... */

    /* Merge the WordPlaces */
    /* Assert: we need only compare the last old entry and the
     * first new one to see if we might need a sort.  Note that
     * there must _be_ entries in pblock, as otherwise we'd have called
     * NewEntry() and not UpdateEntry().
     */
    if (pblock->WordPlaces[pblock->NumberOfWordPlaces - 1].FID >=
	HashEl->Places[0].FID) {
	MightNeedToSort = 1;
    }

    for (i = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
	    HashEl->Places[i]; /* copy the struct */
	/* TODO: call qcmp to check for sorting (actually only need to
	 * check the FIDs of the new entries)
	 */
    }

    if (MightNeedToSort) {
	LQT_SortWordPlaces(db, pblock->NumberOfWordPlaces, pblock->WordPlaces);
    }

    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock(db)) {
	(void) LQT_MakeWordInfoBlock(db, WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere == pblock->NumberOfWordPlaces) {
	/* In this case it all fits into the WID index */
	pblock->ChainStart = 0L;
    } else {
	(void) LQT_Writepblock(db, WordInfo, pblock);
    }

    if (LQT_PutWordInfoIntoIndex(db, WordInfo, pblock->ChainStart) < 0) {
	Error(E_FATAL|E_SYS,
	    "UpdateEntry: Couldn't update \"%s\" in database at 0x%lx",
	    WordInfo->Word, pblock->ChainStart
	);
    }

    /** reclaim storage */
    if (pblock) {
	(void) efree((char *) pblock);
    }
    (void) LQT_DestroyWordInfo(db, WordInfo);
}
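/* Typical use of this module, as a sketch: callers feed one t_WordInfo
 * per word occurrence to AddWord(), then flush with DumpCache() when
 * the batch is done.  Opening the t_LQTEXT_Database and parsing words
 * happen outside this file, so both are taken as given here.  This is
 * illustrative only: the flag choice and the assumption that only the
 * Word, Length and WordPlace members of t_WordInfo need to be filled
 * in are inferred from the code above, not from the library headers.
 */
#ifdef WORDTABLE_USAGE_EXAMPLE
void
IndexOneWord(db, Word, Length, Place)
    t_LQTEXT_Database *db;	/* an open, writable database */
    char *Word;			/* already case-folded as required */
    int Length;
    t_WordPlace *Place;		/* where the word occurred */
{
    t_WordInfo W;

    W.Word = Word;	/* assuming a plain char pointer member,
			 * as the strncpy() in AddWord() suggests */
    W.Length = Length;
    W.WordPlace = *Place;	/* struct copy */

    AddWord(db, &W);
}

void
FinishIndexing(db)
    t_LQTEXT_Database *db;
{
    /* without DUMP_FAST, so the full-dump path that empties and
     * frees every slot is taken (a guess from DumpCache() above):
     */
    DumpCache(db, DUMP_CACHE);
}
#endif /* WORDTABLE_USAGE_EXAMPLE */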