/* wordtable.c -- Copyright 1989, 1993, 1994, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file ../COPYRIGHT for full details. */ /* Symbol Table Interface to text retrieval database. * Handles both the internal and external indexes. * * This originally used a linked list. Converting to a hash table reduced * the time to index comp.os.vms from nearly an hour to one and a half * minutes... * * Liam Quin, 1989 and later */ /* * $Id: wordtable.c,v 2.42 2001/05/31 03:50:13 liam Exp $ */ #ifndef lint static char *Rcs = "$Id: wordtable.c,v 2.42 2001/05/31 03:50:13 liam Exp $"; #endif #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include #include #include #ifdef HAVE_SYSV_FCNTL_H # include #endif #ifdef HAVE_FCNTL_H # include /* for O_RDWR wtc */ #endif #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #include "smalldb.h" #include "fileinfo.h" #include "wordinfo.h" #include "wordplace.h" #include "pblock.h" #include "wordrules.h" #include "emalloc.h" #include "addfile.h" #include "lqutil.h" #include "liblqtext.h" #include "lqtrace.h" typedef struct s_HashEl { t_WID WID; int PlacesUsed; t_WordPlace *Places; unsigned char WordLength; char Word[1]; } t_HashEl; #ifndef HASHSIZ # define HASHSIZ 32768 /* MUST be a power of two */ #endif /*!HASHSIZ*/ #ifndef MAXWORDSINCACHE # define MAXWORDSINCACHE (HASHSIZ * 10) #endif int MaxWordsInCache = MAXWORDSINCACHE; /** System calls and library functions used in this file: **/ /** Lqtext calls */ /** calls within this file **/ PRIVATE void NewEntry( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_HashEl *HashEl #endif ); PRIVATE void UpdateEntry( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_HashEl *HashEl #endif ); void DumpCache( /* see ../h/addfile.h */ #ifdef HAVE_PROTO t_LQTEXT_Database *db, int CallFree #endif ); PRIVATE void FastDump( #ifdef HAVE_PROTO t_LQTEXT_Database *db, int 
CallFree #endif ); PRIVATE void DumpOldest( #ifdef HAVE_PROTO t_LQTEXT_Database *db, int CallFree #endif ); /** System calls: */ /** Library Functions: */ /**/ PRIVATE int HashSize = HASHSIZ; /* MUST be a power of two */ API int SetHashSize(Options, theNewSize) t_lqdbOptions *Options; int theNewSize; { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "Hash table size changed from %ld to %ld", HashSize, theNewSize ); return HashSize = theNewSize; } #define NPLACES 2 #define NPLACESBIGINCR 6 #define NPLACESHUGEINCR (NPLACESBIGINCR * 4) #define NPLACESMAXINCR (NPLACESHUGEINCR * 5) /* This is small to optimise the common case -- by far the majority of * words are used less than 10 times. In the cases where we've gone * wrong, well, there'll be a few thousand. We add slowly until we * get to NPLACE * 3, and then we go up in NPLACESMAXINCR lumps. */ PRIVATE t_HashEl **SymbolTable; PRIVATE int LastEl = 0; PRIVATE int WordsInCache = 0; PRIVATE int DumpThresh = DUMP_FAST_THRESH; PRIVATE int SlotsUsed = 0; #ifdef USE_RADIX_MAP_FOR_SHORT_WORDS typedef struct { t_WID WID; int PlacesUsed; t_WordPlace *Places; } t_ShortWordHashEl; PRIVATE int UseShortWordMap = 1; PRIVATE int SWM_StartCount, SWM_EndCount; PRIVATE unsigned long SWM_TotalShortPlaces; PRIVATE t_ShortWordHashEl **ShortWords; PRIVATE int *OneLetterMap; PRIVATE int *TwoLetterMap; #define TwoWordSlot(W) \ (OneLetterMap[(W)->Word[0]] * SWM_EndCount) + \ (TwoLetterMap[(W)->Length == 1 ? 0 : (W)->Word[1]]) PRIVATE void InitialiseOneLetterMap(db) t_LQTEXT_Database *db; { register int i; int nStart, nWithin; int pos; if (!UseShortWordMap || OneLetterMap != 0) { Error(E_FATAL|E_BUG|E_INTERNAL... ); } OneLetterMap = (int *) malloc(sizeof(int) * 256); TwoLetterMap = (int *) malloc(sizeof(int) * 256); TwoLetterMap[0] = 0; /* for representing 1-char words, see below... */ /* first, count the number of characters that are allowed at * the start of a word and in the middle or end. 
*/ for (nStart = nWithin = i = 0; i < 256; i++) { OneLetterMap[i] = TwoLetterMap[i] = 0; if (LQT_STARTS_WORD(db, i)) { OneLetterMap[i] = nStart; ++nStart; } if (LQT_WITHIN_OR_ENDS_WORD(db, i)) { ++nWithin; TwoLetterMap[i] = nWithin; } } /* As well as every character legal within a word, we allow * 0 for a terminator. ASUUMPTION: a null character (not byte) is * not allowed within a word */ if (nStart > 1000 || nWithin > 1000 || nStart * nWithin > MaxWordsInCache / 4 ) { UseShortWordMap = 0; return; } SWM_StartCount = nStart; /* to allow for the 0, so that we can represent 1-character words * with a 0 in the second place, we add 1 to the number of * characters that can end a word: */ SWM_EndCount = nWithin + 1; ShortWords = (t_ShortWordHashEl **) ecalloc( "Short Word Table", sizeof(t_ShortWordHashEl) * SWM_StartCount * SWM_EndCount ); } #endif void SetDumpThresh(Options, Thresh) t_lqdbOptions *Options; int Thresh; { /* Set the threshhold for fast dumping. * If a word has less than this many occurrences in the cache, it gets * written out. -1 disables this feature, and 0 uses the default. 
*/ DumpThresh = Thresh; if (!DumpThresh) { DumpThresh = DUMP_FAST_THRESH; } } PRIVATE void InitHash(db) t_LQTEXT_Database *db; { long tmp = 1; while (tmp < HashSize) { tmp <<= 1; } HashSize = tmp; if (HashSize < 1) { Error(E_FATAL, "InitHash: hash size (%d/%d) is too small!\n", HashSize, MaxWordsInCache); } SymbolTable = (t_HashEl **) ecalloc( "AddWord symbol table", HashSize, sizeof(t_HashEl *) ); LastEl = HashSize; /* Used as a sentinel */ #ifdef ASCIITRACE LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "allocated %ld hash slots for up to %ld words", HashSize, MaxWordsInCache ); LQT_Trace(LQTRACE_DEBUG, "Sizes: hash: %ud; hash[2] %ud", sizeof(t_HashEl), sizeof(t_HashEl[2]) ); #endif } #ifndef Hash PRIVATE INLINE int Hash(Word, Length) register char *Word; register int Length; { register unsigned long n = 0; #define HASHC n = *Word++ + 65599 * n #ifndef NODUFF /* clever stuff for speedup... dmr-approved!... */ if (Length > 0) { register int loop = (Length + 8 - 1) >> 3; switch(Length & 07) { case 0: do { HASHC; case 7: HASHC; case 6: HASHC; case 5: HASHC; case 4: HASHC; case 3: HASHC; case 2: HASHC; case 1: HASHC; } while (--loop); } } #else /* NODUFF */ while (Length--) { HASHC; } #endif /* NODUFF */ return n & (HashSize - 1); } #endif /* Hash */ PRIVATE char FirstTimeRound = 1; PRIVATE t_FID CurrentFID = 0; static int LargestNumberOfPlacesInOneSlot = 1; /*extern*/ void AddWord(db, WordInfo) t_LQTEXT_Database *db; t_WordInfo *WordInfo; { register t_HashEl *HashEl = (t_HashEl *) 0; int Slot, FirstSlot; static t_FID LastFID = 0; int ProbeCount = 0; register t_HashEl **Slotp; if (!WordInfo || !WordInfo->Word || !WordInfo->Word[0]) { Error(E_WARN, "Null word in AddWord(0x%x)", WordInfo); return; } #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) { LQT_Trace(LQTRACE_WORDINFO, " <%s/%d, b %d, w %lu f %d s %d> ", WordInfo->Word, WordInfo->Length, WordInfo->WordPlace.BlockInFile, WordInfo->WordPlace.WordInBlock, WordInfo->WordPlace.Flags, 
WordInfo->WordPlace.StuffBefore ); } #endif if (!LastEl) { InitHash(); if (FirstTimeRound) { /* Special check to save looking up the WIDs first time round */ t_WID W = LQT_GetMaxWID(db); if (W == 0L) { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "first ever run, allocating WIDs." ); FirstTimeRound = 1; /* actually it's already 1 here */ } else { FirstTimeRound = 0; } } } else { if (MUST_DUMP(WordsInCache, MaxWordsInCache, SlotsUsed, HashSize)) { DumpCache(db, DUMP_CACHE|DUMP_FAST); } if (WordInfo->WordPlace.FID != LastFID) { if (WordInfo->WordPlace.FID == 0L) { Error(E_BUG, "AddWord: FID 0 for \"%s\"", WordInfo->Word); } else if (LastFID != 0) { /* quick opportunity to get rid of words that * were only in the previous file -- this takes advantage * of the fact that vocabulary is document-specific, * i.e. you tend to get words used in one file and not * another. */ CurrentFID = WordInfo->WordPlace.FID; if (MUST_FILE_DUMP( WordsInCache, MaxWordsInCache, SlotsUsed, HashSize )) { DumpCache(db, DUMP_CACHE|DUMP_FAST); } } LastFID = WordInfo->WordPlace.FID; } } #ifdef USE_RADIX_MAP_FOR_SHORT_WORDS /* There are approx. 61 possible first letters, and * approx 68 possible 2nd letters in ISO 8859-1 with the default * wordrules. If we avoid hashing for 1 and 2 character strings, * which are fairly frequent, we will get a performance improvement. * We can also save al ittle on storage, by using a different data * structure, and by not storing the 1- or 2- char word. * * I wish it was practical to do this for 3- and 4-char strings, as * "the" and "and" are usually extremely common, and saving a few * million hashings can't be bad. */ if (WordInfo->Length <= 2) { TODO fixme use both characters! 
FirstSlot = Slot = TwoWordSlot(WordInfo); Slotp = &OneLetter[Slot]; if (!*Slotp) { HashEl = *Slotp = (t_ShortWordHashEl *) emalloc( "wordtable::AddWord.TwoWordSlotHashEl", sizeof(t_ShortWordHashEl) ); HashEl->PlacesUsed = 0; HashEl->Places = (t_WordPlace *) emalloc( "SetSWElEmpty.Places", sizeof(t_WordPlace) * NPLACES ); if (FirstTimeRound) { /* No point looking for the WID, it won't be there; * code in DumpCache() will assign a new one. * LQT_WordToWID() will always return zero on a new index. */ HashEl->WID = 0; } else { HashEl->WID = LQT_WordToWID( db, WordInfo->Word, WordInfo->Length ); } } HashEl = *Slotp; goto OK; } else { FirstSlot = Slot = Hash(WordInfo->Word, WordInfo->Length); Slotp = &SymbolTable[Slot]; } #else FirstSlot = Slot = Hash(WordInfo->Word, WordInfo->Length); Slotp = &SymbolTable[Slot]; #endif for (ProbeCount = 0; ProbeCount < MAXPROBES; ProbeCount++) { if (*Slotp == (t_HashEl *) NULL) { /* make a new element */ HashEl = *Slotp = (t_HashEl *) emalloc( "wordtable::AddWord.HashEl", sizeof(t_HashEl) + WordInfo->Length ); HashEl->PlacesUsed = 0; HashEl->Places = (t_WordPlace *) emalloc( "SetElEmpty.Places", sizeof(t_WordPlace) * NPLACES ); (void) strncpy(&HashEl->Word[0], WordInfo->Word, WordInfo->Length); HashEl->Word[WordInfo->Length] = '\0'; HashEl->WordLength = (unsigned char) WordInfo->Length; if (FirstTimeRound) { /* No point looking for the WID, it won't be there; * code in DumpCache() will assign a new one. * LQT_WordToWID() will always return zero on a new index. 
*/ HashEl->WID = 0; } else { HashEl->WID = LQT_WordToWID(db, HashEl->Word, WordInfo->Length); } ++SlotsUsed; break; } else if ( /* first compare the first letter (26 possibilities, not all * equiprobable, but more than 10 likely ones; well, actually * it's more than 26, because of numbers, accented characers * and punctuation -- over 60 characters with ISO8859-1) */ ((*Slotp)->Word[0] == WordInfo->Word[0]) && /* next compare the lengths: there are (maxwordlen - minwordlen) * possible values, but they are not uniformly distributed -- * there are many more short words than long words -- so we * check the length only _after_ the first character: */ ((*Slotp)->WordLength == WordInfo->Length) && /* well, if the 1st letter and the length are the same, * compare the last letter, and then the whole word, but * obviously only if it's longer than one char: */ ( WordInfo->Length == 1 || ( ( (*Slotp)->Word[WordInfo->Length] == WordInfo->Word[WordInfo->Length] ) && ( /* We've compared the 1st and last 2 chars, and * if there are more, we'll look at those now: */ WordInfo->Length > 2 || !strcmp((*Slotp)->Word, WordInfo->Word) ) ) ) ) { HashEl = (*Slotp); if (HashEl->PlacesUsed == 0) { if (!HashEl->Places) { HashEl->Places = (t_WordPlace *) emalloc( "SetElEmpty.Places", sizeof(t_WordPlace) * NPLACES ); } ++SlotsUsed; } break; } else if ((*Slotp)->PlacesUsed == 0) { /* We left a hash element here to reduce calls to malloc * and to WordToWid, but someone else got here first... * This can only happen if there is a hash collision, because * if the word doesn't belong here, it gets removed by * DumpFaster(). * */ if (ProbeCount >= 3) { /* OK, it probably isn't in there... * so let's put it here... */ HashEl = (*Slotp); if (HashEl->WordLength < WordInfo->Length || HashEl->WordLength > WordInfo->Length + 3 ) { /* The string is stuck at the end of the struct, * so if the new string is longer, we have to allocate * a new struct. 
*/ (void) efree(HashEl); HashEl = *Slotp = (t_HashEl *) emalloc( "wordtable::AddWord.HashEl", sizeof(t_HashEl) + WordInfo->Length ); HashEl->PlacesUsed = 0; } (void) strncpy(&HashEl->Word[0], WordInfo->Word, WordInfo->Length); HashEl->Word[WordInfo->Length] = '\0'; /* Note: this is OK because there is 1 extra byte * in the struct, since Word is 1 byte long in the * declaration... so we have room for the \0. * But I am not sure that it is needed. */ HashEl->Places = (t_WordPlace *) emalloc( "SetElEmpty.Places", sizeof(t_WordPlace) * NPLACES ); HashEl->WordLength = (unsigned char) WordInfo->Length; /* If we put a shorter word here than was there before, * we are more likely to call free & malloc again next * time, as we've now forgotten that it was longer. * But we used to do free & malloc _every_ time, so * this is still a win. */ if (FirstTimeRound) { /* No point looking for the WID, it won't be there; * code in DumpCache() will assign a new one. * LQT_WordToWID() will always return zero on a new index. */ HashEl->WID = 0; } else { HashEl->WID = LQT_WordToWID(db, HashEl->Word, WordInfo->Length); } ++SlotsUsed; break; } } if ((Slot += 17) >= HashSize) { Slot &= (HashSize - 1); } /** Note: hashsize is a power of 2, so we can step through it ** in any prime number, but doing so in little chunks is best ** because we don't hit so many pages, but at the same time ** step over clusters. I hope. 
**/ /** If the hash table is full, no point looking further: **/ if (ProbeCount + 1 >= MAXPROBES) { #ifdef ASCIITRACE LQT_Trace(LQTRACE_DEBUG, "(%d probes didn't find a slot for %s)", ProbeCount, WordInfo->Word ); #endif CurrentFID = WordInfo->FID; DumpCache(db, DUMP_CACHE|DUMP_FAST); /* try again: */ AddWord(db, WordInfo); return; } Slotp = &SymbolTable[Slot]; } OK: /* check to see if there is already an occurrence of this word * for this file: */ /* If we get here, all we need to do is add the WordPlace */ /* Rather than keep around an integer saying how many places we've * allocated, we remember how many we have used (had to do that anyway), * and whenever we're about to use a place that wouldn't have been * allocated, we allocate more. * There is a tradeoff here. The word distribution is exponential, with * most words occuring fewer than 10 times, and a very few words that * occur tens of thousands of times; there are also quite a few in the * low hundreds, usually. So the following code starts off with low * numbers of places, and then starts adding more & more. * The goal is to minimise wasted memory but at the same time to * avoid extra calls to malloc. 
*/ { long newValue = 0; switch (HashEl->PlacesUsed) { case NPLACES: newValue = NPLACES * 2; break; case NPLACES * 2: newValue = NPLACES * 3; break; case NPLACES * 3: newValue = NPLACES * 4; break; case NPLACES * 4: newValue = NPLACES * 6; break; case NPLACES * 6: newValue = NPLACES * 10; break; case NPLACES * 10: newValue = NPLACES * 14; break; case NPLACES * 14: newValue = NPLACES * 18; break; case NPLACES * 18: newValue = NPLACES * 24; break; case NPLACES * 24: newValue = NPLACES * 30; break; case NPLACES * 30: newValue = NPLACES * 40; break; case NPLACES * 40: newValue = NPLACES * 150; break; case NPLACES * 150: newValue = NPLACES * 150 * 2; break; default: if ( HashEl->PlacesUsed > NPLACES * 150 && ((HashEl->PlacesUsed % (NPLACES * 150)) == 0) ) { newValue = HashEl->PlacesUsed + (NPLACES * 150); } } if (newValue) { HashEl->Places = (t_WordPlace *) erealloc( (char *) HashEl->Places, sizeof(t_WordPlace) * newValue ); } } HashEl->Places[HashEl->PlacesUsed++] = WordInfo->WordPlace; WordsInCache++; #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) { LQT_Trace(LQTRACE_WORDINFO, "Slot %d Word %s len %d places %d", Slot, SymbolTable[Slot]->Word, WordInfo->Length, SymbolTable[Slot]->PlacesUsed ); } #endif return; } PRIVATE void DumpOneEntry(db, theHashElp, CallFree) t_LQTEXT_Database *db; t_HashEl **theHashElp; int CallFree; { if ((*theHashElp)->PlacesUsed > 0) { /* We are going to make a new index entry for the word. * There are two cases -- depending on whether the word * is already indexed or not. * In the former case we must merge the new information. * In the latter case we don't have to read the old info, * but we must make a new entry in the WID Index. 
*/ WordsInCache -= (*theHashElp)->PlacesUsed; if ((*theHashElp)->PlacesUsed >= LargestNumberOfPlacesInOneSlot) { /* well, it's certainly wrong now, so guess: */ LargestNumberOfPlacesInOneSlot = 1; } if ((*theHashElp)->WID == 0) { NewEntry(db, *theHashElp); } else { UpdateEntry(db, *theHashElp); } --SlotsUsed; (*theHashElp)->PlacesUsed = 0; /* Always free the memory used by the places, since * it can be quite large. */ if ((*theHashElp)->Places) { efree((char *) (*theHashElp)->Places); (*theHashElp)->Places = 0; } /* But don't bother with the * small hash table entry if we're about to exit. */ if (CallFree & DUMP_NOFREE) { return; } /* Actually, we'll free the table entry anyway to save RAM: */ efree(*theHashElp); *theHashElp = 0; /* but leave the word there */ } else { /* It was an empty unused slot, with just a word there. * We probably need the slot, so we should free it. */ if ((*theHashElp)->Places) { efree((char *) (*theHashElp)->Places); (*theHashElp)->Places = 0; } if (CallFree & DUMP_NOFREE) { return; } efree(*theHashElp); *theHashElp = 0; } } /* The problem with DumpOldest is that it has no easy way of detecting * collisions. * * Suppose that we are about to write out the contents of slot N, * containing matches for some word A. * If there was another word B that hashes to the same slot N. * Since slot N was used, the matches for B are in another slot. * Searches for those matches succeed because AddWord first looks * in slot N, then in N + 1 (say), and so on. * But if we delete the word in slot N, AddWord will stop at * slot N when looking for B, and will notice that it's empty, and * assume there was no collision. * Oops. * * DumpFaster is supposed to sort this out, but I need to test * this in conjunction with DumpOldest. 
/* Evict "stale" cache entries: words whose most recent occurrence is
 * several files old, or rare words not seen in the current file, are
 * written out and removed.  Also recomputes LargestNumberOfPlacesInOneSlot
 * over the survivors.
 * NOTE(review): LastCalledFor is compared against CurrentFID but never
 * assigned afterwards, so the early-return only fires while both are 0 --
 * looks like an unfinished optimisation; confirm before relying on it.
 */
PRIVATE void DumpOldest(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    static t_FID LastCalledFor = 0;
    register int i;
    register t_HashEl *HashEl;
    int Progress = 0;	/* trace-only: 20%-step progress reporting */

    LQT_Trace(LQTRACE_DEBUG, "oldest dump: %d/%d words, %d/%d slots",
	WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );

    if (LastCalledFor == CurrentFID) {
	if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	    LQT_Trace(LQTRACE_DEBUG, "-- but still in same file");
	}
	return;
    }

    LargestNumberOfPlacesInOneSlot = 0;

    for (i = 0; i != LastEl; i++) {
	long separation;

	if (!SymbolTable[i]) {
	    continue;
	}
	if (!SymbolTable[i]->PlacesUsed) {
	    continue;	/* word-only placeholder; nothing to write */
	}
	HashEl = SymbolTable[i];

	/* The theory is that words that haven't been found in the most
	 * recently added couple of files might never be found again --
	 * so we might as well get rid of them.
	 *
	 * If the word only occurs once, and not in the current file, we
	 * can also dump it, for the same reason.
	 */
	separation = CurrentFID - HashEl->Places[HashEl->PlacesUsed - 1].FID;
	if (separation > 4 || HashEl->PlacesUsed <= separation) {
	    DumpOneEntry(db, &SymbolTable[i], CallFree);
	} else if (HashEl->PlacesUsed > LargestNumberOfPlacesInOneSlot) {
	    LargestNumberOfPlacesInOneSlot = HashEl->PlacesUsed;
	}

	if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	    if (i >= Progress * (HashSize / 5)) {
		LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
		    "progress: %d%% %d words/%d slots",
		    Progress * 20, WordsInCache, SlotsUsed
		);
		++Progress;
	    }
	}
    }
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"cache now %d/%d, slots %d/%d]",
	WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );
    FirstTimeRound = 0;
}

/* Evict all cache entries with MORE than MaxAllowedHits occurrences
 * (the big ones), relocating smaller displaced entries back to their
 * home slot where possible so that later probes still find them.
 */
PRIVATE void FasterDump(db, CallFree, MaxAllowedHits)
    t_LQTEXT_Database *db;
    int CallFree;
    int MaxAllowedHits;
{
    register int i;
    register t_HashEl *HashEl;
    int Progress = 0;
    int ReportFrequency = HashSize / 5;
    int Slot = 0;

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"faster dump nocc > %d, from %d/%d words, %d/%d slots",
	MaxAllowedHits, WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );

    /** We go through the hash table an element at a time.
     ** We clear all slots that contain more than MaxAllowedHits
     ** entries.
     ** Other slots are tested to see if they are in the right
     ** place or not.  If not, we see if we can put them in the
     ** right slot (because it's empty or because we're going to
     ** dump that slot, so we can make it empty).
     ** If the entry is in the wrong place and we can't easily
     ** correct that, we write it out.
     **/
    LargestNumberOfPlacesInOneSlot = 0;

    for (i = 0; i != LastEl; i++) {
	int ForceDump;

	if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	    if (i >= Progress * ReportFrequency) {
		LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
		    "progress: %d%% %d words/%d slots",
		    Progress * 20, WordsInCache, SlotsUsed
		);
		++Progress;
	    }
	}

	if (!SymbolTable[i]) {
	    continue;
	}
	HashEl = SymbolTable[i];
	ForceDump = 0;

	if (HashEl->PlacesUsed <= MaxAllowedHits) {
	    /* We're not going to dump this entry.
	     * If there was a collision (two or more entries have the same
	     * hash code), we mustn't simply delete the entry at the hashed
	     * slot, because then we would no longer find this one.
	     */
	    Slot = Hash(HashEl->Word, (int) HashEl->WordLength);
	    if (Slot != i) {
		/* It's in the wrong place... see if we can put it back: */
		if (SymbolTable[Slot] == 0) {
		    /* In this case we can simply put it in the right place*/
		    SymbolTable[Slot] = HashEl;
		    SymbolTable[i] = 0;
		    continue;
		} else if (SymbolTable[Slot]->PlacesUsed > MaxAllowedHits) {
		    /* In this case the entry at the other location is
		     * going to be dumped, so we can swap them even though
		     * that puts the other entry temporarily in the wrong
		     * place.  We're about to delete it anyway.
		     */
		    register t_HashEl *tmp = SymbolTable[Slot];

		    SymbolTable[Slot] = HashEl;
		    HashEl = SymbolTable[i] = tmp;
		    /* Liam: why can we do this?
		     * what if there was a secondary collision with
		     * the word in the other hash slot??
		     */
		} else {
		    /* In this case, it's in the wrong place, and the
		     * entry in the other slot isn't going to be deleted.
		     * But it might be moved.  This (we hope) is rare
		     * enough that we handle it by giving up and
		     * dumping it anyway:
		     */
		    ForceDump = 1;
		}
		if (!SymbolTable[i]) {
		    continue; /* dealt with slot i. */
		}
	    }
	}

	if (HashEl->PlacesUsed > MaxAllowedHits || ForceDump) {
#ifdef ASCIITRACE
	    /* ASSERT: HashEl == &SymbolTable[i];
	     * If not, we don't know which symbol table slot to set to zero
	     * after doing the dump.
	     * NOTE(review): the format string has five conversions but six
	     * arguments follow (SymbolTable[i] has no matching %x) -- the
	     * trailing Slot is never printed; confirm intended format.
	     */
	    if (HashEl != SymbolTable[i]) {
		Error(E_BUG,
		    "%s: %d: HashEl 0x%x != SymbolTable[%d], Slot %d",
		    __FILE__, __LINE__, HashEl, i, SymbolTable[i], Slot
		);
	    }
#endif
	    DumpOneEntry(db, &SymbolTable[i], CallFree);
	} else if (HashEl &&
	    HashEl->PlacesUsed > LargestNumberOfPlacesInOneSlot
	) {
	    LargestNumberOfPlacesInOneSlot = HashEl->PlacesUsed;
	}
    }
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"cache now %d/%d, slots %d/%d]",
	WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );
    FirstTimeRound = 0;
}

/* Evict all cache entries with FEWER than (or exactly) DumpThresh
 * occurrences -- the mirror image of FasterDump -- again relocating
 * displaced survivors to their home slot where possible.
 */
PRIVATE void FastDump(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    register int i;
    register t_HashEl *HashEl;
    int Progress = 0;

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"fast dump nocc < %d, from %d/%d words, %d/%d slots",
	DumpThresh, WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );

    /** We go through the hash table an element at a time.
     ** We clear all slots that contain fewer than DumpThresh
     ** entries.
     ** Other slots are tested to see if they are in the right
     ** place or not.  If not, we see if we can put them in the
     ** right slot (because it's empty or because we're going to
     ** dump that slot, so we can make it empty).
     ** If the entry is in the wrong place and we can't easily
     ** correct that, we write it out.
     **/
    for (i = 0; i != LastEl; i++) {
	int ForceDump;

	if (!SymbolTable[i]) continue;

	HashEl = SymbolTable[i];
	ForceDump = 0;

	if (HashEl->PlacesUsed > DumpThresh) {
	    /* We're not going to dump this entry.
	     * If there was a collision (two or more entries have the same
	     * hash code), we mustn't simply delete the entry at the hashed
	     * slot, because then we would no longer find this one.
	     */
	    int Slot = Hash(HashEl->Word, (int) HashEl->WordLength);

	    if (Slot != i) {
		/* It's in the wrong place... see if we can put it back: */
		if (SymbolTable[Slot] == 0) {
		    /* In this case we can simply put it in the right place*/
		    SymbolTable[Slot] = HashEl;
		    SymbolTable[i] = 0;
		} else if (SymbolTable[Slot]->PlacesUsed <= DumpThresh) {
		    /* In this case the entry at the other location is
		     * going to be dumped, so we can swap them even though
		     * that puts the other entry temporarily in the wrong
		     * place.  We're about to delete it anyway.
		     */
		    register t_HashEl *tmp = SymbolTable[Slot];

		    SymbolTable[Slot] = HashEl;
		    HashEl = SymbolTable[i] = tmp;
		} else {
		    /* In this case, it's in the wrong place, and the
		     * entry in the other slot isn't going to be deleted.
		     * But it might be moved.  This (we hope) is rare
		     * enough that we handle it by giving up and
		     * dumping it anyway:
		     */
		    ForceDump = 1;
		}
		if (!SymbolTable[i]) {
		    continue; /* dealt with slot i. */
		}
	    }
	}
	if (HashEl->PlacesUsed <= DumpThresh || ForceDump) {
	    DumpOneEntry(db, &SymbolTable[i], CallFree);
	}

	if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
	    if (i >= Progress * (HashSize / 5)) {
		LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
		    "progress: %d%% %d words/%d slots",
		    Progress * 20, WordsInCache, SlotsUsed
		);
		++Progress;
	    }
	}
    }
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"cache now %d/%d, slots %d/%d",
	WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );
    FirstTimeRound = 0;
}

/* Flush the word cache to disk.
 * A partial (DUMP_FAST, non-DUMP_SYNC) flush runs a small state machine
 * alternating FastDump (evict rare words, with a rising threshold) and
 * FasterDump (evict common words, with a falling cap) until the
 * MUST_DUMP_MORE predicate is satisfied.  A full flush writes everything
 * out, flushes all the lower-level caches, and re-acquires write access.
 */
LIBRARY void DumpCache(db, CallFree)
    t_LQTEXT_Database *db;
    int CallFree;
{
    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
	"Cache write from %d/%d words, %d/%d slots",
	WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
    );

    if (DumpThresh == -1) {
	/* fast dumping disabled: force a full synchronous flush */
	CallFree |= DUMP_SYNC;
    } else if ((CallFree & DUMP_SYNC) == 0 || (CallFree & DUMP_FAST) != 0) {
	int SaveThresh = DumpThresh;	/* restored before returning */
	int Multiplier = 2;
	int MaxAllowed = 0;
	int State = 1;
	int needFaster = 0;

	/* ensure that we always do something useful:*/
	if (WordsInCache) {
	    DumpOldest(db, CallFree);
	}
	if (WordsInCache) {
	    MaxAllowed = LargestNumberOfPlacesInOneSlot / 4;
	    FasterDump(db, CallFree, MaxAllowed);
	}

	/* now try and get below the dump threshhold: */
	while (MUST_DUMP_MORE(
	    WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
	)) {
	    switch (State) {
	    case 0: case 5:
		MaxAllowed /= 4; /* should really use sqrt() */
		FasterDump(db, CallFree, MaxAllowed);
		needFaster = 0;
		State = 1;
		break;
	    case 1:
		if (MUST_FREE_MORE_SLOTS(
		    WordsInCache, MaxWordsInCache, SlotsUsed, HashSize
		)) {
		    FastDump(db, CallFree);
		    needFaster = 1;
		    DumpThresh *= Multiplier;
		    ++Multiplier;
		} else {
		    MaxAllowed /= 4; /* should really use sqrt() */
		    FasterDump(db, CallFree, MaxAllowed);
		    needFaster = 0;
		}
		State = 2;
		break;
	    case 2: case 4:
		MaxAllowed /= 4; /* should really use sqrt() */
		FasterDump(db, CallFree, MaxAllowed);
		needFaster = 0;
		State++;
		break;
	    case 3:
		FastDump(db, CallFree);
		needFaster = 1;
		DumpThresh *= Multiplier;
		++Multiplier;
		State++;
		/* no break: State 4 runs on the next loop iteration */
	    }
	}
	if (needFaster && WordsInCache) {
	    MaxAllowed /= 2; /* should really use sqrt() */
	    FasterDump(db, CallFree, MaxAllowed);
	}
	DumpThresh = SaveThresh;
	LargestNumberOfPlacesInOneSlot = 1;
	return;
    }

    /* Full flush from here on. */
    if (WordsInCache && LargestNumberOfPlacesInOneSlot > 4) {
	/* Dump the larger entries first, in the hopes that they will get
	 * contiguous blocks appended to their existing entries:
	 */
	FasterDump(db, CallFree, (int) (LargestNumberOfPlacesInOneSlot / 4));
	LargestNumberOfPlacesInOneSlot = 0;
    }

    /* Now dump all remaining entries: */
    if (WordsInCache) {
	FasterDump(db, CallFree, 0);
    }

#ifdef ASCIITRACE
    if (WordsInCache || SlotsUsed) {
	Error(E_BUG, "Cache, %d words left, %d slots [%s:%d]",
	    WordsInCache, SlotsUsed, __FILE__, __LINE__
	);
    }
#endif
    WordsInCache = 0;

    /* allow other processes a chance to do some reading: */
    /* TODO: use LQT_Sync */
    (void) LQT_WriteCurrentMaxWID(db);
    (void) LQT_SyncAndCloseAllKeyValueDatabases(db);
    (void) LQTpFlushWIDCache(db);
    (void) LQTp_FlushLastBlockCache(db);
    (void) LQT_FlushBlockCache(db);

    /* Closing the database loses us our write access...
     * so ask for it back again:
     */
    LQT_ObtainWriteAccess(db);

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "Cache now empty");

    /* The first time round we don't bother looking up words to see
     * if they already have a WID, since if the database is new, no
     * WIDs have been assigned yet.  This is a big win if you add a lot
     * of files in the first go, and a lose otherwise, I suspect.
     * Anyway, after this time, we'll look WIDS up:
     */
    FirstTimeRound = 0;
}

/* Write a word that is not yet in the index: allocate a WID, build a
 * pblock (the on-disk occurrence-list image), and insert it into the
 * WID index.  The pblock buffer is cached in a static between calls,
 * unless it grew beyond MAX_KEEP bytes.
 */
PRIVATE void NewEntry(db, HashEl)
    t_LQTEXT_Database *db;
    t_HashEl *HashEl;
{
    static t_pblock *pblock = 0;
    static unsigned int pblockSize = 0;
#define MAX_KEEP 2000 /* don't keep it around if it's bigger than this */
    register int i, k;
    t_WordInfo *WordInfo;

    /* TODO: add MightNeedToSort check */
    /* TODO: allow pblock to be reclaimed when memory is low */

    /** make a WIDIndex entry and mark it as invalid (NOTDONE) */
    /* In order to do this, we must make a "pblock", a structure that
     * reflects the physical database.  This is fairly low-level stuff
     * for efficiency's sake...
     *
     * For even more speed, we keep the last pblock we used around, if it
     * is not too large.
     */
    {
	unsigned int newSize = sizeof(t_pblock) +
	    HashEl->PlacesUsed * sizeof(t_WordPlace);

	if (newSize > pblockSize || !pblockSize) {
	    /* allocate a pblock structure.  These are rather devious things,
	     * a structure with an array tacked onto the end.
	     */
	    if (pblockSize && pblock != (t_pblock *) 0) {
		efree((char *) pblock);
	    }
	    pblockSize = newSize;
	    pblock = (t_pblock *) emalloc(
		"wordtable::NewEntry.pblock", pblockSize
	    );
	}
    }
    pblock->ChainStart = 0L; /* address on disk -- not there yet, so 0! */
    pblock->NumberOfWordPlaces = HashEl->PlacesUsed;

    /* fill in the WordPlaces */
    for (i = k = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[k] = HashEl->Places[i]; /* struct copy */
	++k;
    }

    /* if k is 0, they are all deleted, but that's unaccepatable.
     * help!
     */
    if (k == 0) {
#ifdef ASCIITRACE
	Error(E_WARN|E_INTERNAL, "no places to add for %s", HashEl->Word);
#endif
	if (pblockSize > MAX_KEEP) {
	    efree((char *) pblock);
	    pblock = 0;
	    pblockSize = 0;
	}
	return;
    }

    if (HashEl->WID == 0) {
	/* first time this word has ever been seen: allocate its WID */
	HashEl->WID = LQT_WriteWordAndWID(
	    db, HashEl->Word, HashEl->WordLength,
	    LQTp_GetMaxOrAllocateWID(db, 0)
	);
    }

    /* Now fill in enough of WordInfo to let us use the low-level routines: */
    WordInfo = LQT_MakeWordInfo(
	db, HashEl->WID, HashEl->WordLength, (unsigned char *) HashEl->Word
    );
    pblock->WID = HashEl->WID;
    WordInfo->Offset = 0L;
    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock(db)) {
	(void) LQT_MakeWordInfoBlock(db, WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere != pblock->NumberOfWordPlaces) {
	/* In this case, it didn't all fit into the WID index block: */
	(void) LQT_Writepblock(db, WordInfo, pblock);
    }

    if (LQT_PutWordInfoIntoIndex(db, WordInfo, pblock->ChainStart) < 0) {
	Error(E_SYS|E_FATAL,
	    "NewEntry: Couldn't insert \"%s\" in database at 0x%lx",
	    WordInfo->Word, pblock->ChainStart);
    }

    /** reclaim storage */
    LQT_DestroyWordInfo(db, WordInfo);

    if (pblockSize > MAX_KEEP) {
	efree((char *) pblock);
	pblock = 0;
	pblockSize = 0;
    }
}

/* Merge new occurrences of an already-indexed word into the database:
 * either append to the word's existing on-disk chain (when it has one),
 * or read back the old places, merge, sort if needed, and rewrite.
 */
PRIVATE void UpdateEntry(db, HashEl)
    t_LQTEXT_Database *db;
    t_HashEl *HashEl;
{
    register int i;
    t_pblock *pblock;
    t_WordInfo *WordInfo;
    int MightNeedToSort = 0;

#ifdef ASCIITRACE
    if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) {
	LQT_Trace(LQTRACE_WORDINFO,
	    "UpdateEntry(%s/WID %ld, wordlen %d)",
	    HashEl->Word, HashEl->WID, HashEl->WordLength
	);
    }
#endif

    /** get the old entry */
    if (!HashEl->WID || !(WordInfo = LQT_WIDToWordInfo(db, HashEl->WID))) {
	/* fall back to creating it from scratch */
	Error(E_BUG, "Word %s WID %ld went away!", HashEl->Word, HashEl->WID);
	NewEntry(db, HashEl);
	return;
    }

#ifdef ASCIITRACE
    if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) {
	LQT_fprintWordInfo(db, stderr, WordInfo, "UpdateEntry");
    }
#endif

    if (WordInfo->Offset) {
	/* the word has an overflow chain on disk: just append to it */
	(void) LQT_AddWordPlaces(
	    db, HashEl->Places, HashEl->WID, WordInfo->Offset,
	    (unsigned long) HashEl->PlacesUsed
	);
	(void) LQT_DestroyWordInfo(db, WordInfo);
	return;
    }

    if (WordInfo->WordPlacesInHere == WordInfo->NumberOfWordPlaces) {
	/* everything fits in the WID index block; no separate pblock */
	pblock = (t_pblock *) 0;
    } else {
	pblock = LQT_Getpblock(db, WordInfo);
    }

    if (pblock) {
	/* grow the existing pblock to hold the new places as well */
	pblock = (t_pblock *) erealloc((char *) pblock,
	    sizeof(t_pblock) +
	    (pblock->NumberOfWordPlaces + HashEl->PlacesUsed) *
	    sizeof(t_WordPlace));
    } else {
	pblock = (t_pblock *) emalloc(
	    "wordtable:NewEntry.pblock",
	    sizeof(t_pblock) +
	    (WordInfo->WordPlacesInHere + HashEl->PlacesUsed) *
	    sizeof(t_WordPlace)
	);
	pblock->NumberOfWordPlaces = 0;

	if (WordInfo->WordPlacesInHere < WordInfo->NumberOfWordPlaces) {
	    if (WordInfo->WordPlaceStart) {
		WordInfo->WordPlaces = LQT_GetWordPlaces(
		    db, WordInfo->WID, WordInfo->WordPlaceStart,
		    WIDBLOCKSIZE -
			(WordInfo->WordPlaceStart - WordInfo->DataBlock),
		    0L, &WordInfo->NumberOfWordPlaces
		);
	    }
	}

	/* Assert: the wordplaces in WordInfo are sorted */
	for (i = 0; i < WordInfo->NumberOfWordPlaces; i++) {
	    pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
		WordInfo->WordPlaces[i]; /* structure copy */
	}
    }

    /* delete the old entry from disk */
    if (WordInfo->Offset) {
	/* Remove the old information from disk.
	 * This isn't as bad as it sounds, as it will be at the start
	 * of the freelist, so when we write it out again it will be
	 * in the buffer cache...  But it would still be faster to append.
	 */
	LQT_DeleteWordPlaces(db, WordInfo->Offset, WordInfo->WID);
    }

    pblock->WID = HashEl->WID;
    WordInfo->Offset = pblock->ChainStart = 0L; /* it's invalid now... */

    /* Merge the WordPlaces */
    /* Assert: we need only compare the last old entry and the
     * first new one to see if we might need a sort.  Note that
     * there must _be_ entries in pblock, as otherwise we'd have called
     * NewEntry() and not UpdateEntry().
     */
    if (pblock->WordPlaces[pblock->NumberOfWordPlaces - 1].FID >=
	HashEl->Places[0].FID) {
	MightNeedToSort = 1;
    }

    for (i = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
	    HashEl->Places[i]; /* copy the struct: */
	/* TODO: call qcmp to check for sorting (actually only need to
	 * check the FIDs of the new entries)
	 */
    }

    if (MightNeedToSort) {
	LQT_SortWordPlaces(db, pblock->NumberOfWordPlaces, pblock->WordPlaces);
    }

    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock(db)) {
	(void) LQT_MakeWordInfoBlock(db, WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere == pblock->NumberOfWordPlaces) {
	/* In this case it all fits into the WID index */
	pblock->ChainStart = 0L;
    } else {
	(void) LQT_Writepblock(db, WordInfo, pblock);
    }

    if (LQT_PutWordInfoIntoIndex(db, WordInfo, pblock->ChainStart) < 0) {
	Error(E_FATAL|E_SYS,
	    "UpdateEntry: Couldn't update \"%s\" in database at 0x%lx",
	    WordInfo->Word, pblock->ChainStart
	);
    }

    /** reclaim storage */
    if (pblock) {
	(void) efree((char *)pblock);
    }
    (void) LQT_DestroyWordInfo(db, WordInfo);
}