/* wordinfo.c -- Copyright 1989-2001 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* wordinfo.c -- handle the database of words for lq-text. * * lq-text keeps a master list of all of the words that have ever been * seen. Currently, this is in dbm format (sdbm or ndbm or db or...). * The master list gives us a long (t_WID actually) for each word. * This is then used as an index into a WordIndex file. * * $Id: wordinfo.c,v 2.50 2001/05/30 20:40:52 liam Exp $ * */ #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include #ifdef HAVE_STDLIB_H #include #endif #ifdef HAVE_UNISTD_H #include #endif #ifdef HAVE_STRING_H #include #else #include #endif #include #include "fileinfo.h" #include "smalldb.h" #include "wordindex.h" #include "wordinfo.h" #include "numbers.h" #include "emalloc.h" #include "wordrules.h" /* max word length */ #include "pblock.h" #include "lqutil.h" #include "liblqtext.h" #include "lqtrace.h" #ifndef WIDS_IN_ONE_CACHEBLOCK # define WIDS_IN_ONE_CACHEBLOCK 256 #endif #ifndef CACHEDWIDBLOCKS # define CACHEDWIDBLOCKS 1024 /* TODO: * The WID block cache size should be dymaically variable. * * We should register a memory handler to free the cache * on low memory. * * I need to measure the effectiveness of the cache policy; * maybe the linear access searching should not invalidate * the cache. 
*/ #endif typedef struct { t_WID firstWID; unsigned char theData[WIDS_IN_ONE_CACHEBLOCK * WIDBLOCKSIZE]; char IsDirty; } t_WIDIndexCacheEntry; /** declarations: **/ /** Unix system calls that need to be declared: **/ /** Unix Library Calls that need to be declared: **/ /** lqtext Library calls that need to be declared: **/ /** Functions within this file that need to be declared: **/ PRIVATE t_WIDIndexCacheEntry *GetWIDCacheEntry( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_WID WID #endif ); /** **/ #define ui(x) ((unsigned int) x) #define UL(x) ((unsigned long) x) static int Widfd = (-1); static long WidPos = 0L; t_WIDIndexCacheEntry **WIDCache = 0; static short NextFreeCacheEntry = -1; /* to implement fifo; LRU would be better, though, I suspect */ PRIVATE void OpenWordIndexFile(db) t_LQTEXT_Database *db; { int Flags, Modes; LQT_GetFileModes(db, &Flags, &Modes); if (!WIDCache) { register int i; WIDCache = (t_WIDIndexCacheEntry **) emalloc( "WID Cache", sizeof(t_WIDIndexCacheEntry *) * CACHEDWIDBLOCKS ); for (i = 0; i < CACHEDWIDBLOCKS; i++) { WIDCache[i] = (t_WIDIndexCacheEntry *) 0; } } Widfd = LQU_Eopen( E_FATAL|E_SYS, db->WidIndexFile, "WID (Word Identifier) Index file", Flags, Modes ); WidPos = 0L; (void) LQT_GetMaxWID(db); } PRIVATE void LQT_WriteWordInfoIndexBlock( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_WID WID, unsigned char *DataBlock #endif ); LIBRARY void LQT_PrintBlock( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_WID WID, unsigned char **q, unsigned char **Block, int *BlockLength, long *NextOffset #endif ); PRIVATE void LQTpLoadCacheEntry(db, WID, cacheEntry) t_LQTEXT_Database *db; t_WID WID; t_WIDIndexCacheEntry *cacheEntry; { long theStart = WID / WIDS_IN_ONE_CACHEBLOCK; int i; theStart *= WIDS_IN_ONE_CACHEBLOCK; cacheEntry->firstWID = theStart; cacheEntry->IsDirty = 0; theStart *= WIDBLOCKSIZE; /* convert from items to bytes */ if (!WID) { Error(E_BUG, "%s: %d: LQTpLoadCacheEntry(0L,...) 
illegal", __FILE__, __LINE__ ); } if (Widfd < 0) { OpenWordIndexFile(db); } if (WidPos != theStart) { WidPos = theStart; (void) LQU_Elseek( E_FATAL, db->WidIndexFile, "lq-text word index/LQTpLoadCacheEntry", Widfd, WidPos, SEEK_SET /* = 0 */ ); } i = read(Widfd, cacheEntry->theData, WIDBLOCKSIZE * WIDS_IN_ONE_CACHEBLOCK); if (i < 0) { Error(E_SYS|E_BUG|E_FATAL, "Tried to read %d bytes from %d=\"%s\", but got %d", WIDBLOCKSIZE * WIDS_IN_ONE_CACHEBLOCK, Widfd, db->WidIndexFile, i ); WidPos = -1L; /*NOTREACHED*/ return; } else if (i != WIDBLOCKSIZE * WIDS_IN_ONE_CACHEBLOCK) { (void) bzero( &cacheEntry->theData[i], (WIDBLOCKSIZE * WIDS_IN_ONE_CACHEBLOCK) - i ); } WidPos += i; return; } PRIVATE void LQT_WriteWordInfoIndexBlock(theDb, theWID, DataBlock) t_LQTEXT_Database *theDb; t_WID theWID; unsigned char *DataBlock; { t_WIDIndexCacheEntry *cacheEntry = GetWIDCacheEntry(theDb, theWID); unsigned char *p = &cacheEntry->theData[ WIDBLOCKSIZE * (theWID - cacheEntry->firstWID) ]; (void) bcopy(DataBlock, p, WIDBLOCKSIZE); cacheEntry->IsDirty = 1; } PRIVATE void LQTpWriteCacheEntry(db, cacheEntry) t_LQTEXT_Database *db; t_WIDIndexCacheEntry *cacheEntry; { long theStart = cacheEntry->firstWID * WIDBLOCKSIZE; int i; if (!cacheEntry->IsDirty) { return; } if (Widfd < 0) { Error(E_BUG|E_INTERNAL|E_WARN, "Widfd was < 0 in LQTpWriteCacheEntry for %ld", cacheEntry->firstWID ); OpenWordIndexFile(db); } if (WidPos != theStart) { WidPos = theStart; (void) LQU_Elseek( E_FATAL, db->WidIndexFile, "lq-text word index/LQTpWriteCacheEntry", Widfd, WidPos, SEEK_SET /* = 0 */ ); } i = write(Widfd, cacheEntry->theData, WIDBLOCKSIZE*WIDS_IN_ONE_CACHEBLOCK); if (i != WIDBLOCKSIZE * WIDS_IN_ONE_CACHEBLOCK) { Error(E_SYS|E_BUG|E_FATAL, "Tried to write %d bytes from %d=\"%s\", but wrote %d", WIDBLOCKSIZE * WIDS_IN_ONE_CACHEBLOCK, Widfd, db->WidIndexFile, i ); WidPos = -1L; /*NOTREACHED*/ return; } WidPos += i; cacheEntry->IsDirty = 0; /* but leave the rest of the data untouched, so that it's still 
there * if we need it again. */ return; } /* * LQTpFlushWIDCache * Database/Files * *

Writes any pending entries in the WID file cache out to disk. * This must be done before closing the database or exiting the running * program if any changes have been made.

*

When a database is opened, LQTpFlushWIDCache is registered as an * action to be performed on an LQT_CloseDatabase or LQT_SyncDatabase, * so it should not be necessary to call this function directly.

*

The ignored argument is for compatibility with LQT_AddActionOnClose, * as is the return value. * * LQT_SyncDatabase * LQT_CloseDatabase * LQT_AddActionOnClose * */ LIBRARY int LQTpFlushWIDCache(db) t_LQTEXT_Database *db; { register int c; if (!WIDCache) { return 0; } for (c = 0; c < CACHEDWIDBLOCKS; c++) { if (WIDCache[c]) { LQTpWriteCacheEntry(db, WIDCache[c]); } } return 0; } PRIVATE t_WIDIndexCacheEntry * GetWIDCacheEntry(db, WID) t_LQTEXT_Database *db; t_WID WID; { register int i; int useThisOne = (-1); if (Widfd < 0) { OpenWordIndexFile(db); /* This also creates the empty cache. */ useThisOne = 0; } else { for (i = 0; i < CACHEDWIDBLOCKS; i++) { if (WIDCache[i]) { if (WIDCache[i]->firstWID <= WID && WID < WIDCache[i]->firstWID + WIDS_IN_ONE_CACHEBLOCK) { return WIDCache[i]; } } else { /* In this case the cache isn't full yet */ if (useThisOne < 0) { useThisOne = i; } } } } if (useThisOne < 0) { for (i = 0; i < CACHEDWIDBLOCKS; i++) { if (useThisOne < 0 && !(WIDCache[i]->IsDirty)) { useThisOne = i; } } } if (useThisOne < 0) { ++NextFreeCacheEntry; useThisOne = (NextFreeCacheEntry %= CACHEDWIDBLOCKS); } if (!WIDCache[useThisOne]) { WIDCache[useThisOne] = (t_WIDIndexCacheEntry *) emalloc( "Wid Cache Entry", sizeof(t_WIDIndexCacheEntry) ); WIDCache[useThisOne]->IsDirty = 0; } else if (WIDCache[useThisOne]->IsDirty) { LQTpWriteCacheEntry(db, WIDCache[useThisOne]); } LQTpLoadCacheEntry(db, WID, WIDCache[useThisOne]); return WIDCache[useThisOne]; } PRIVATE unsigned char * ReadWIDIndexBlock(db, theWID) t_LQTEXT_Database *db; t_WID theWID; { t_WIDIndexCacheEntry *cacheEntry; cacheEntry = GetWIDCacheEntry(db, theWID); return &cacheEntry->theData[WIDBLOCKSIZE * (theWID - cacheEntry->firstWID)]; } PRIVATE void MarkWIDCacheEntryAsDirty(cacheEntry) t_WIDIndexCacheEntry *cacheEntry; { cacheEntry->IsDirty = 1; } static int CompareWordInBlockWithPrefix(WID, Block, Lengthp, Prefix, PrefixLength) t_WID WID; unsigned char *Block; unsigned long *Lengthp; unsigned char *Prefix; int 
PrefixLength; { unsigned char *q = Block; /* read the word length */ if (LQT_sReadNumber( &q, Lengthp, Block, WIDBLOCKSIZE) < 0 || *Lengthp == 0 ) { Error(E_WARN, "CompareWordInBlockWithPrefix: db corrupt, WID %lu has wordlen %ld", WID, *Lengthp ); return LQT_WIDMATCH_FAILED; } /* so now we can compare the words */ return strncmp( q, Prefix, (*Lengthp > PrefixLength) ? PrefixLength : *Lengthp ); } /* * LQT_FindFirstWIDMatchingPrefix * Database/Retrieval, Database/Words * *

Returns the lowest WID whose word matches the given Prefix.

*

The Prefix need not be nul-terminated; the given PrefixLength * argument is used to find the end of the Prefix.

* * The WID on success, and zero on failure. * * Warns if a database format error is detected. * * LQT_WordToWID *
*/ API t_WID LQT_FindFirstWIDMatchingPrefix(db, Prefix, PrefixLength) t_LQTEXT_Database *db; char *Prefix; int PrefixLength; { /* Note; it would be most efficient to look inside the * cache first for the right entry! */ t_WID LeastPossible = 1; t_WID BiggestPossible = LQT_GetMaxWID(db); int triedRangeOfOne = 0; if (!db->WordsInWordIndex) { Error(E_FATAL, "LQT_FindFirstWIDMatchingPrefix: no index: wordlist is off in README" ); } while (BiggestPossible >= LeastPossible) { unsigned char *Buffer; t_WID CurrentGuess; unsigned long L; int i; if (BiggestPossible == LeastPossible) { if (triedRangeOfOne++ > 0) { /* don't get stuck in a loop */ break; } } CurrentGuess = (BiggestPossible + LeastPossible) / 2; if (CurrentGuess > BiggestPossible) { break; } Buffer = ReadWIDIndexBlock(db, CurrentGuess); if (!Buffer) { /*CANTHAPPEN, as they say...*/ Error(E_WARN|E_INTERNAL, "LQT_FindFirstWIDMatchingPrefix: ReadWIDIndexBlock %lu -> 0", CurrentGuess ); return (t_WID) 0; } i = CompareWordInBlockWithPrefix(CurrentGuess, Buffer, &L, Prefix, PrefixLength); if (i == LQT_WIDMATCH_FAILED) { return 0; } if (i < 0) { /* Guess < Prefix, so we need to move upwards */ LeastPossible = CurrentGuess + 1; } else if (i > 0) { BiggestPossible = CurrentGuess; } else { /* the prefix matched... * We now have to ensure that we have found * the least possible match, and not just the middle * of a sequence. */ if (L < PrefixLength) { /* In this case, we haven't actually found a match, * because the word in the index was shorter than * what we're looking for. 
*/ LeastPossible = CurrentGuess + 1; } else if (L == PrefixLength) { return CurrentGuess; } else { /* L > PrefixLength */ if (LeastPossible == CurrentGuess) { return CurrentGuess; } BiggestPossible = CurrentGuess; } } } return (t_WID) 0; } PRIVATE int CompareWordInBlockWithPattern( db, WID, Buffer, Pattern, PatternLength, PrefixLength, Matcher, Argument ) t_LQTEXT_Database *db; t_WID WID; unsigned char *Buffer; unsigned char *Pattern; int PatternLength; int PrefixLength; int (* Matcher)( # ifdef HAVE_PROTO t_LQTEXT_Database *mydb, unsigned char *myString, int myStringLength, unsigned char *myPattern, int myPatternLength, int myPrefixLength, unsigned char *myArgument # endif ); unsigned char *Argument; { unsigned char *q = Buffer; unsigned long L; /* read the word length */ if (LQT_sReadNumber(&q, &L, Buffer, WIDBLOCKSIZE) < 0 || L == 0) { Error(E_WARN, "CompareWordInBlockWithPattern: db corrupt, WID %lu has wordlen %ld", WID, L ); return LQT_WIDMATCH_FAILED; } /* so now we can compare the words */ { int i; i = (* Matcher) ( db, q, (int) L, Pattern, PatternLength, PrefixLength, Argument ); return i; } } /* * LQT_FindFirstWIDMatchingPattern * Database/Retrieval, Database/Words * *

Returns the lowest WID whose word matches the given Pattern.

*

The Pattern need not be NUL-terminated; the given PatternLength * argument is used to find the end of the Pattern.

*

The given PrefixLength argument must specify the number of * leading characters, if any, in the given Pattern that form a * constant prefix. * If there are no such characters, matching is likely to be several * orders of magnitude slower, as LQT_FindFirstWIDMatchingPattern * will have to try every word in the database vocabulary, one at * a time, until it finds one that matches. *

The given Matcher argument must be a pointer to a function * that will try to match the string to the given pattern, * and that will return zero only on a match. The constant * LQT_WIDMATCH_FAILED is available to be returned * by the given Matcher function, indicating that * LQT_FindFirstWIDMatchingPattern should fail and return zero * immediately. This might be used if the given Matcher function is * called with a string lexically greater than the largest that * could ever match it, or after reporting an error. *

The given Argument is passed on to the Matcher function, for the * convenience of the caller.

* * The WID on success, and zero on failure. * * Warns if a database format error is detected. * * LQT_WordToWID *
*/ API t_WID LQT_FindFirstWIDMatchingPattern( db, Pattern, PatternLength, PrefixLength, Matcher, Argument ) t_LQTEXT_Database *db; unsigned char *Pattern; int PatternLength; int PrefixLength; int (* Matcher)( # ifdef HAVE_PROTO t_LQTEXT_Database *thedb, unsigned char *theString, int theStringLength, unsigned char *thePattern, int thePatternLength, int thePrefixLength, unsigned char *theArgument # endif ); unsigned char *Argument; { /* Note; it would be most efficient to look inside the * cache first for the right entry! */ t_WID CurrentGuess; t_WID BiggestPossible; if (!db->WordsInWordIndex) { /* TODO: need a macro to hide this structure, it's supposed * to be opaque */ /* API Compatibility: can we return zero here? I think so. */ return 0; Error(E_WARN, /* TODO: put actual config file name here */ "LQT_FindFirstWIDMatchingPrefix called, but the vocabulary is not stored in the database (wordlist is set to off)" ); } BiggestPossible = LQT_GetMaxWID(db); if (PrefixLength) { CurrentGuess = LQT_FindFirstWIDMatchingPrefix( db, (char *) Pattern, PrefixLength ); if (!CurrentGuess) { return (t_WID) 0; } } else { CurrentGuess = 1; } for (; CurrentGuess < BiggestPossible; CurrentGuess++) { int i; unsigned char *Buffer; unsigned long L; Buffer = ReadWIDIndexBlock(db, CurrentGuess); if (!Buffer) { /*CANTHAPPEN, as they say...*/ Error(E_WARN|E_INTERNAL, "LQT_FindFirstWIDMatchingPrefix: ReadWIDIndexBlock %lu -> 0", CurrentGuess ); return (t_WID) 0; } if (PrefixLength) { i = CompareWordInBlockWithPrefix( CurrentGuess, Buffer, &L, Pattern, PrefixLength ); if (i > 0) { /* gone too far, no match */ return (t_WID) 0; } else if (i < 0) { continue; /* not there yet */ } } i = CompareWordInBlockWithPattern( db, CurrentGuess, Buffer, Pattern, PatternLength, PrefixLength, Matcher, Argument ); if (i == 0) { return CurrentGuess; } else if (i == LQT_WIDMATCH_FAILED) { return (t_WID) 0; } } /* for */ return (t_WID) 0; } /* * LQT_FindNextWIDMatchingPattern * Database/Retrieval, Database/Words 
* *

Returns the lowest WID whose word matches the given pattern, * and that is greater than the given WID argument. * The pattern is a string, which must be an all-lower-case prefix. * The given wildcard character must be either * or ?, to indicate * zero or more following characters or exactly one following character, * respectively.

*

The Prefix need not be nul-terminated; the given PrefixLength * argument is used to find the end of the prefix.

* * The WID on success, and zero on failure. * * Warns if a database format error is detected. * * LQT_FindFirstWIDMatchingPattern *
*/ API t_WID LQT_FindNextWIDMatchingPattern( db, WID, Pattern, PatternLength, PrefixLength, Matcher, Argument ) t_LQTEXT_Database *db; t_WID WID; unsigned char *Pattern; int PatternLength; int PrefixLength; int (* Matcher)( # ifdef HAVE_PROTO t_LQTEXT_Database *thedb, unsigned char *theString, int theStringLength, unsigned char *thePattern, int thePatternLength, int thePrefixLength, unsigned char *theArgument # endif ); unsigned char *Argument; { unsigned char *Buffer; t_WID CurrentGuess = WID; t_WID BiggestPossible = LQT_GetMaxWID(db); unsigned long L; int i; for (;;) { ++CurrentGuess; if (CurrentGuess > BiggestPossible) { return (t_WID) 0; } Buffer = ReadWIDIndexBlock(db, CurrentGuess); if (!Buffer) { /*CANTHAPPEN, as they say...*/ Error(E_WARN|E_INTERNAL, "LQT_FindNextWIDMatchingPattern: ReadWIDIndexBlock %lu -> 0", CurrentGuess ); return (t_WID) 0; } Buffer = ReadWIDIndexBlock(db, CurrentGuess); if (PrefixLength) { i = CompareWordInBlockWithPrefix( CurrentGuess, Buffer, &L, Pattern, PrefixLength ); if (i > 0) { return (t_WID) 0; } } i = CompareWordInBlockWithPattern( db, CurrentGuess, Buffer, Pattern, PatternLength, PrefixLength, Matcher, Argument ); if (i == 0) { return CurrentGuess; } else if (i == LQT_WIDMATCH_FAILED) { return (t_WID) 0; } } /*NOTREACHED*/ } /* * LQT_FindNextWIDMatchingWildCard * Database/Retrieval, Database/Words * *

Returns the lowest WID whose word matches the given pattern, * and that is greater than the given WID argument. * The pattern is a string, which must be an all-lower-case prefix. * The given wildcard character must be either * or ?, to indicate * zero or more following characters or exactly one following character, * respectively.

*

The Prefix need not be nul-terminated; the given PrefixLength * argument is used to find the end of the prefix.

* * The WID on success, and zero on failure. * * Warns if a database format error is detected. * * LQT_FindFirstWIDMatchingWildCard *
*/
/*
 * This entry point is a stub: it reports a fatal error whenever it is
 * called, and none of its arguments are examined.
 */
API t_WID
LQT_FindNextWIDMatchingWildCard(db, WID, Prefix, PrefixLength)
    t_LQTEXT_Database *db;
    t_WID WID;
    char *Prefix;
    int PrefixLength;
{
    Error(E_FATAL,
        "LQT_FindNextWIDMatchingWildCard broken."
    );

    /*NOTREACHED*/
    return 0;
}

/*
 * LQT_WIDToWordInfo
 * Database/Retrieval, Database/Words
 *
 * Returns the in-memory WordInfo structure for a given WID.
 *
 *
  • t_WordInfo * on success; *
  • NULL on failure, or if th given WID argument was zero. * * * Warns if a database format error is detected. * * LQT_WordToWID * */ API t_WordInfo * LQT_WIDToWordInfo(db, WID) t_LQTEXT_Database *db; t_WID WID; { unsigned char *q; unsigned char *Buffer; t_WordInfo *WP; if (!WID) { /* You could argue that this should be an error, but it turns out * to be too easy to forget to check a WID before calling * LQT_WIDToWordInfo> */ return (t_WordInfo *) 0; } Buffer = q = ReadWIDIndexBlock(db, WID); if (!q) { /*CANTHAPPEN, as they say...*/ Error(E_WARN|E_INTERNAL, "LQT_WIDToWordInfo(db, 0x%x, %ld): ReadWIDIndexBlock returned 0", db, WID ); return (t_WordInfo *) 0; } if (db->WordsInWordIndex) { unsigned long L; if (LQT_sReadNumber(&q, &L, Buffer, WIDBLOCKSIZE) < 0 || L == 0) { int tmp = WIDBLOCKSIZE; LQT_PrintBlock( db, WID, &q, &Buffer, &tmp, (long *) 0 ); Error(E_WARN, "LQT_WIDToWordInfo: Database corrupt, WID %lu has wordlen %ld", WID, L ); return (t_WordInfo *) 0; } WP = LQT_MakeWordInfo(db, WID, (int) L, q); q += L; } else { unsigned char buf[25]; (void) sprintf(buf, "W%d", WID); WP = LQT_MakeWordInfo(db, WID, (int) strlen(buf), buf); } LQT_sReadNumber(&q, &WP->Offset, Buffer, WIDBLOCKSIZE); WP->Offset *= BLOCKSIZE; /* q[0] is the least significant byte. What happened to PUT4/GET4? */ if (WP->Offset != 0L) { unsigned long L; L = ui(q[3] & 255); L <<= 8; L |= ui(q[2] & 255); L <<= 8; L |= ui(q[1] & 255); L <<= 8; L |= ui(q[0] & 255); WP->NumberOfWordPlaces = L; q += 4; } else { LQT_sReadNumber(&q, &WP->NumberOfWordPlaces, Buffer, WIDBLOCKSIZE); } /* Now, maybe read some WordPlace tuplets: */ if (q - Buffer < WIDBLOCKSIZE) { if (WP->Offset == 0L) { /* In this case, all of the matches, if any, * fit in the index entry, so we might as well deal with * them now. 
*/ WP->WordPlaces = LQT_GetWordPlaces( db, WP->WID, q, WIDBLOCKSIZE - (q - Buffer), 0L, &WP->NumberOfWordPlaces ); WP->DataBlock = (unsigned char *) 0; WP->WordPlacesInHere = WP->NumberOfWordPlaces; } else { /* Save the data block so that we can use it later */ WP->DataBlock = (unsigned char *) emalloc( "WIDblk", WIDBLOCKSIZE); (void) bcopy( (char *) Buffer, (char *) WP->DataBlock, WIDBLOCKSIZE ); WP->WordPlaceStart = &(WP->DataBlock[q - Buffer]); WP->WordPlaces = (t_WordPlace *) 0; WP->WordPlacesInHere = 0L; } } else { Error(E_BUG, "block too small for %ld (%s)", WP->WID, WP->Word); } /* done! */ return WP; } /* * LQT_MakeWordInfoBlockHeader * Database/Update, Database/Words * * Writes a database header block (a WIDindex entry) into the * given WordInfo. This is split into a separate routine so that * the library can write a word block header tentatively, using a * different format for the header if the header and the data all * fit into the index block. LQT_MakeWordInfoBlockHeader determines * the format to use by whether WordInfo->Offset is non-zero. * The difference is whether a fixed four bytes are used for the * total number of word places for this word, or whether a variable * number of bytes, using LQT_sWriteNumber, are written. In the * latter case, update in place is not possible, and this format is * therefore only used when WordInfo->Offset is zero, and any update * would in any case have to read and rewrite the word index block. 
* */ LIBRARY void LQT_MakeWordInfoBlockHeader(db, WordInfo, pblock) t_LQTEXT_Database *db; t_WordInfo *WordInfo; t_pblock *pblock; { unsigned char PairBuffer[WIDBLOCKSIZE]; unsigned char *q = PairBuffer; #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) { LQT_Trace(LQTRACE_WORDINFO, "LQT_MakeWordInfoBlockHeader for %s, Offset %lu==%lu", WordInfo->Word, pblock->ChainStart, WordInfo->Offset ); } #endif if (db->WordsInWordIndex) { (void) LQT_sWriteNumber(&q, UL(WordInfo->Length), q, WIDBLOCKSIZE); (void) strncpy(q, WordInfo->Word, WordInfo->Length); q += WordInfo->Length; } if (pblock && pblock->ChainStart) { register unsigned long L; LQT_sWriteNumber( &q, UL(pblock->ChainStart / BLOCKSIZE), q, WIDBLOCKSIZE - (q - PairBuffer) ); L = WordInfo->NumberOfWordPlaces; q[0] = (L & ui(255)); /* least significant */ L >>= 8; q[1] = (L & ui(255)); L >>= 8; q[2] = (L & ui(255)); L >>= 8; q[3] = (L & ui(255)); /* most significant */ q += 4; } else { LQT_sWriteNumber(&q, 0L, q, WIDBLOCKSIZE - (q - PairBuffer)); /* offset */ LQT_sWriteNumber( &q, WordInfo->NumberOfWordPlaces, q, WIDBLOCKSIZE - (q - PairBuffer) ); } if (WordInfo->DataBlock) { efree((char *) WordInfo->DataBlock); } WordInfo->DataBlock = (unsigned char *) emalloc( "WID Data Block", WIDBLOCKSIZE ); (void) bcopy( PairBuffer, WordInfo->DataBlock, WIDBLOCKSIZE ); WordInfo->WordPlaceStart = &(WordInfo->DataBlock[q - PairBuffer]); } /* * LQT_UpdateWIDMatchCount * Database/Update, Database/Words * * Revises the count of the number of occurrences of the given word * held in the WIDindex file. It is the caller's responsibility to * ensure that this number is the same as the number of matches that * are stored with LQT_WriteWordPlaces before the next call to * LQT_GetWordPlaces. In particular, reducing the number of occurrences * with this call will not cause word places to be deleted; a fatal * (E_BUG) error will generally be produced on trying to read back a word * with an inconsistent Match Count. 
* * It's a fatal error (E_BUG) if the WID isn't in the index. * */ API void LQT_UpdateWIDMatchCount(db, WID, AddedThese) t_LQTEXT_Database *db; t_WID WID; unsigned long AddedThese; { unsigned long Current; unsigned long Offset; unsigned char *q; unsigned char *BlockStart; unsigned long Total; q = ReadWIDIndexBlock(db, WID); if (!q) { Error(E_BUG|E_FATAL, "%s: %d: LQT_UpdateWIDMatchCount: couldn't read WID block for %lu", __FILE__, __LINE__, WID ); } BlockStart = q; if (db->WordsInWordIndex) { unsigned long L; if (LQT_sReadNumber(&q, &L, BlockStart, WIDBLOCKSIZE) < 0 || L == 0) { Error(E_WARN, "LQT_WIDToWordInfo: db corrupt, WID %lu has wordlength %ld", WID, L ); return; } q += L; /* skip over the word */ } /* read the start of the chain */ LQT_sReadNumber(&q, &Offset, BlockStart, WIDBLOCKSIZE); if (!Offset) { Error(E_BUG|E_FATAL, "%s: %d: LQT_UpdateWIDMatchCount(WID %lu, add %lu pairs) but offset is 0", __FILE__, __LINE__, WID, AddedThese ); } Offset *= BLOCKSIZE; { unsigned long L; L = ui(q[3] & 255); L <<= 8; L |= ui(q[2] & 255); L <<= 8; L |= ui(q[1] & 255); L <<= 8; L |= ui(q[0] & 255); Current = L; } Current += AddedThese; Total = Current; q[0] = (Current & ui(255)); /* least significant */ Current >>= 8; q[1] = (Current & ui(255)); Current >>= 8; q[2] = (Current & ui(255)); Current >>= 8; q[3] = (Current & ui(255)); #ifdef ASCIITRACE { unsigned long L; L = q[3]; L <<= 8; L |= q[2]; L <<= 8; L |= q[1]; L <<= 8; L |= q[0]; if (L != Total) { Error(E_ABORT|E_BUG|E_FATAL, "LQT_UpdateWIDMatchCount(db, %ld, %ld) L %ld != C %ld", WID, AddedThese, L, Total ); } } #endif LQT_WriteWordInfoIndexBlock(db, WID, BlockStart); } /* * LQT_MakeWordInfoBlock * Database/Update, Database/Words * * Tries to put the given pblock into the given WordInfo's index * block, a buffer reserved for this purpose. * * *
  • the number of places successfully added *
  • 0 if no word places were given in pblock * * * LQT_PutWordInfoIntoIndex, LQT_MakeWordInfoBlockHeader * * Warns if WordInfo already has a non-zero Offset. * */ API unsigned long LQT_MakeWordInfoBlock(db, WordInfo, pblock) t_LQTEXT_Database *db; t_WordInfo *WordInfo; t_pblock *pblock; { /* See how many pairs from the given pblock fit into WordInfo... * and leave them there for later use. */ #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_WORDINFO)) { LQT_Trace(LQTRACE_WORDINFO, "LQT_MakeWordInfoBlock for %s/%d at %ld", WordInfo->Word, WordInfo->WID, WordInfo->Offset ); } #endif LQT_MakeWordInfoBlockHeader(db, WordInfo, pblock); if (pblock == (t_pblock *) 0) { /* No WordPlaces to put in! */ WordInfo->WordPlacesInHere = 0; return 0; } if (pblock->ChainStart != 0L) { Error(E_WARN, "liblqtext/WordInfo.c::LQT_MakeWordInfoBlock() pblock->ChainStart %ld != 0", pblock->ChainStart ); } return WordInfo->WordPlacesInHere = LQT_WriteWordPlaces( db, pblock->WordPlaces, WordInfo->WID, (unsigned long) 0L, /* 0 means 1st block not in data file */ WordInfo->DataBlock, (unsigned char *) WordInfo->WordPlaceStart, WIDBLOCKSIZE, 0L, /* start */ 0L, /* blocksize */ pblock->NumberOfWordPlaces ); } /* * LQT_WordToWID * Database/Retrieval, Database/Words * * Returns the WID for a given Word. * It is not necessary that the word be NUL terminated. * The Length argument is the number of bytes in the Word, * not including any trailing NUL byte * * *
  • the WID on success *
  • 0 on failure * * * LQT_WIDToWordInfo * * Fatal error if the database can't be opened. * */ API t_WID LQT_WordToWID(db, Word, Length) t_LQTEXT_Database *db; char *Word; unsigned int Length; { datum key, data; unsigned char *q; t_WID WID; DBM *theWordMap; if (Length > db->MaxWordLength) { Length = db->MaxWordLength; /* NOTE: no trailing \0 required. */ } /* contact database server */ theWordMap = LQT_OpenKeyValueDatabase(db, db->WordIndex); if (theWordMap == (DBM *) 0) { Error(E_FATAL|E_SYS, "LQT_WordToWID: Couldn't open Word Index (dbm) database \"%s\"", db->WordIndex ); } key.dptr = Word; key.dsize = Length; data = dbm_fetch(theWordMap, key); if (data.dptr == (char *) 0 || data.dsize == 0) { LQT_CloseKeyValueDatabase(theWordMap); return (t_WID) 0; } q = (unsigned char *) data.dptr; LQT_sReadNumber(&q, &WID, (unsigned char *) data.dptr, data.dsize); if (q - (unsigned char *) data.dptr != data.dsize) { Error(E_BUG, "WordToWid \"%*s\" failed... got %lu", Length, Word, WID); } if (WID > db->LQTp__LastNextWIDVal) { (void) LQT_GetMaxWID(db); } if (WID > db->LQTp__LastNextWIDVal) { Error(E_BUG, "LQT_WordToWID(%*s) value is %ld, but max WID is %ld!", Length, Word, WID, db->LQTp__LastNextWIDVal ); } LQT_CloseKeyValueDatabase(theWordMap); return WID; } /* * LQT_WIDToWord * Database/Retrieval, Database/Words * * Returns the word corresponding to a given WID. * * *
  • the word on success *
  • zero on failure, or if the wordlist database parameter was set to
 *   off when the word was last written to the database
 *
 *
 * LQT_WIDToWord may be inefficient or unavailable if the wordlist
 * parameter in the database config file is set to off.
 * See the lqwordlist program for alternate ways of obtaining access
 * to the index vocabulary.
 *
 */
/*
 * Implementation notes:
 *  - the word is obtained from LQT_WIDToWordInfo and copied into a
 *    buffer obtained from emalloc; the intermediate WordInfo is
 *    destroyed before returning;
 *  - the copy is explicitly NUL terminated, because strncpy adds no
 *    terminator when the source is at least as long as the count.
 */
API char *
LQT_WIDToWord(db, WID)
    t_LQTEXT_Database *db;
    t_WID WID;
{
    t_WordInfo *W;
    char *Word;

    if (WID == (t_WID) 0) {
        return (char *) 0;
    }

    if ((W = LQT_WIDToWordInfo(db, WID)) == (t_WordInfo *) 0) {
        return (char *) 0;
    }

    Word = emalloc(W->Word, W->Length + 1);
    (void) strncpy(Word, W->Word, (int) W->Length);
    Word[W->Length] = '\0'; /* strncpy does not add a null */

    LQT_DestroyWordInfo(db, W);
    return Word;
}

/*
 * LQT_WriteWordAndWID
 * Database/Update, Database/Words
 *
 * Saves the Word --> WID mapping in the wordlist database.
 *
 * Returns the given WID.
 *
 * Fatal error if the database can't be opened, or if the
 * word couldn't be stored.
 *
 * The reverse map, WID --> Word, is performed using LQT_WIDToWord,
 * and uses the copy of the word stored in the widindex block header.
* * LQT_WIDToWord * LQT_WordToWID * LQT_PutWordInfoIntoIndex * */ API t_WID LQT_WriteWordAndWID(db, Word, Length, WID) t_LQTEXT_Database *db; char *Word; int Length; t_WID WID; { unsigned char NumBuf[sizeof(t_WID) * 8/7 + 1]; unsigned char *q = NumBuf; datum key, data; DBM *theWordMap; key.dptr = Word; key.dsize = Length; LQT_sWriteNumber(&q, WID, q, sizeof NumBuf); data.dptr = (char *) NumBuf; data.dsize = q - NumBuf; /* contact database server */ theWordMap = LQT_OpenKeyValueDatabase(db, db->WordIndex); if (theWordMap == (DBM *) 0) { Error(E_FATAL|E_SYS, "LQT_WriteWordAndWID: Couldn't open dbm Word Index \"%s\"", db->WordIndex ); } if (dbm_store(theWordMap, key, data, DBM_REPLACE) < 0) { Error(E_FATAL|E_SYS, "WID %ld: dbm_store of %*s failed", WID, Length, Word ); } #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_READAFTERWRITE)) { t_WID W; if ((W = LQT_WordToWID(db, Word, Length)) != WID) { Error(E_BUG, "LQT_WriteWordAndWID: stored %ld, but retrieved %ld", WID, W ); } } #endif LQT_CloseKeyValueDatabase(theWordMap); return WID; } /* * LQT_PutWordInfoIntoIndex * Database/Update, Database/Words * *

    Each WordInfo structure contains a pointer to a single data block, * which is used to store the widindex header. * This speeds up indexing, since the header is needed at both the * start of writing out WordPlaces and at the end. * LQT_PutWordInfoIntoIndex arranges that index block be written * out to the widindex index file, using LQT_WriteWordInfoIndexBlock.

    *

    A WID must have been allocated for this word with LQT_WriteWordAndWID * for this word already, on this or some other program run.

    *

    This routine is generally called after LQT_Writepblock.

    * * zero * * Warns if the WordInfo has a datablock but no offset. * If ASCIITRACE was defined when the library was compiled, and if * the LQTRACE_READAFTERWRITE trace flag is set, LQT_PutWordInfoIntoIndex * checks that theWordinfo->WID corresponds to theWordInfo->Word, * using LQT_WordToWID, and produces a fatal (E_BUG) error if not. *
    */
/*
 * Implementation notes:
 *  - a WordInfo with no data block gets one made for it (header only,
 *    no word places) by LQT_MakeWordInfoBlock; if the caller passed a
 *    non-zero Offset in that situation, a warning is given first;
 *  - the data block is then handed to LQT_WriteWordInfoIndexBlock.
 */
API int
LQT_PutWordInfoIntoIndex(db, theWordInfo, Offset)
    t_LQTEXT_Database *db;
    t_WordInfo *theWordInfo;
    unsigned long Offset;
{
    if (!theWordInfo->DataBlock) {
        if (Offset != 0) {
            Error(E_WARN|E_INTERNAL,
                "%s: %d: WordInfo corrupt for \"%*.*s\"",
                __FILE__, __LINE__,
                theWordInfo->Length,
                theWordInfo->Length,
                theWordInfo->Word
            );
        }

        (void) LQT_MakeWordInfoBlock(db, theWordInfo, (t_pblock *) 0);
    }

    LQT_WriteWordInfoIndexBlock(
        db,
        theWordInfo->WID,
        theWordInfo->DataBlock
    );

#ifdef ASCIITRACE
    if (LQT_TraceFlagsSet(LQTRACE_READAFTERWRITE)) {
        /* look the word up again and make sure it maps to the same WID */
        t_WID storedWID = LQT_WordToWID(
            db, theWordInfo->Word, theWordInfo->Length
        );

        if (storedWID != theWordInfo->WID) {
            Error(E_BUG,
                "Word \"%*.*s\": WID changed from %ld to %ld!",
                theWordInfo->Length,
                theWordInfo->Length,
                theWordInfo->Word,
                theWordInfo->WID,
                storedWID
            );
        }
    }
#endif

    return 0;
}

/*
 * LQT_DeleteWordFromIndex
 * Database/Update, Database/Words
 *
 * Deletes the given word and associated data from the database.
 * The WID index entry for the LQT_WIDToWord function entry is retained,
 * as is the widindex file record, with a match count of zero.
 * If the word should appear in some subsequently indexed file, this
 * space is reclaimed.
 *
 *
  • zero on success *
  • -1 on error * * * See LQC_UnIndexFile in the lqunindex client for an example of * using this function. * */ API int LQT_DeleteWordFromIndex(db, Word) t_LQTEXT_Database *db; char *Word; { t_WID WID; t_WordInfo *WordInfo; t_pblock *tmp; if ((WID = LQT_WordToWID(db, Word, strlen(Word))) == (t_WID) 0) { return -1; /* not there */ } /* get info from the list */ if ((WordInfo = LQT_WIDToWordInfo(db, WID)) == (t_WordInfo *) 0) { return -1; } if ((tmp = LQT_Getpblock(db, WordInfo)) != (t_pblock *) NULL) { LQT_Deletepblock(db, tmp); (void) efree((char *)tmp); } /* delete the offset from the database, but retain the WID: */ WordInfo->Offset = 0L; WordInfo->NumberOfWordPlaces = 0L; WordInfo->WordPlacesInHere = 0; LQT_PutWordInfoIntoIndex(db, WordInfo, 0L); LQT_DestroyWordInfo(db, WordInfo); return 0; } static t_WordInfo ZeroWordinfo = { 0, }; /* * LQT_MakeWordInfo * Memory, Database/Words * * Constructs a new t_WordInfo structure containing a malloc'd and * NUL terminated copy of the given word. The word as passed into * LQT_MakeWordInfo need not be NUL terminated; the Length parameter * is the number of bytes in the Word string, not counting the * trailing NUL, if present. * * LQT_ReadWordFromStringPointer, LQT_DestroyWordInfo, LQT_WordToWID * * Fatal error if there isn't enough memory * */ API t_WordInfo * LQT_MakeWordInfo(db, WID, Length, Word) t_LQTEXT_Database *db; t_WID WID; int Length; unsigned char *Word; { register t_WordInfo *WP; WP = (t_WordInfo *) emalloc("LQT_MakeWordInfo", sizeof(t_WordInfo)); *WP = ZeroWordinfo; /* structure copy */ WP->WID = WID; WP->Word = emalloc("LQT_MakeWordInfo.Word", Length + 1); (void) strncpy(WP->Word, Word, Length); WP->Word[WP->Length = Length] = '\0'; /* strncpy does not add a null */ return WP; } /* * LQT_DestroyWordInfo * Memory, Database/Words * * Deletes the given structure from memory, reclaiming storage. * This routine does not affect the database. 
* * LQT_DestroyFileInfo * LQT_DeleteWordFromIndex * LQT_MakeWordInfo * */ API void LQT_DestroyWordInfo(db, WP) t_LQTEXT_Database *db; t_WordInfo *WP; { if (!WP) return; if (WP->Word) efree(WP->Word); if (WP->WordPlaces) efree((char *) WP-> WordPlaces); if (WP->DataBlock) efree((char *) WP->DataBlock); efree((char *) WP); } /* * LQT_fprintWordInfo * Database/Words * * Prints an ASCII representation of the given WordInfo pointer to * the given stdio stream. The Caller argument is printed before * each line of output, and is usually the name of the function * calling LQT_fprintWordInfo. * */ API void LQT_fprintWordInfo(db, stream, W, Caller) t_LQTEXT_Database *db; FILE *stream; t_WordInfo *W; char *Caller; { fprintf(stream, "%s: WordInfo 0x%x: {\n", Caller, W); (void) fflush(stderr); if (W) { fprintf(stream, "\tWID: %ld (%s, len %u)\n", W->WID, W->Word, (unsigned int) W->Length); fprintf(stream, "\tNumberOfWordPlaces: %lu In here: %d\n", W->NumberOfWordPlaces, W->WordPlacesInHere); fprintf(stream, "\tFID: %ld; Offset: %lu\n", W->FID, W->Offset); if (W->DataBlock) fprintf(stream, "\tDataBlock: 0x%x\n", W->DataBlock); if (W->WordPlaceStart) { fprintf(stream, "\tWordPlaceStart: 0x%x\n", W->WordPlaceStart); } if (W->WordPlaces) { fprintf(stream, "\tWordPlaces: 0x%x\n", W->WordPlaces); } if (W->WordPlace.FID && W->WordPlace.FID != W->FID) { fprintf(stream, "\tWordPlace->FID: %ld != FID\n", W->WordPlace.FID); } fprintf(stream, "\tWordPlace: (Block: %lu; Word %lu", W->WordPlace.BlockInFile, W->WordPlace.WordInBlock); if (W->WordPlace.Flags || W->WordPlace.StuffBefore) { fprintf(stream, "; Flags %u", W->WordPlace.Flags); fprintf(stream, "; StuffBefore: %u", (unsigned int) W->WordPlace.StuffBefore); } fprintf(stream, ")\n"); fprintf(stream, "} %s: WordInfo 0x%x\n", Caller, W); } fflush(stream); }