/* rpblock.c -- Copyright 1989, 1994, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ #ifndef LINE static char *RcsId = "@(#) $Id: rpblock.c,v 1.24 2001/05/31 03:50:13 liam Exp $"; #endif #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include /* stderr, also for fileinfo.h */ #include #ifdef HAVE_FCNTL_H # ifdef HAVE_SYSV_FCNTL_H # include # endif # include #endif #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #include "fileinfo.h" /* for wordinfo.h */ #include "wordinfo.h" #include "pblock.h" #include "numbers.h" #include "emalloc.h" #include "wordrules.h" #include "getbyte.h" #include "liblqtext.h" #include "lqtrace.h" /** Unix system calls that need to be declared: **/ /** C library functions that need to be declared: **/ /** lqtext library functions that need to be declared: **/ /** Functions within this file that need to be declared: **/ /** **/ static int (* CheckFunction)( #ifdef HAVE_PROTO t_LQTEXT_Database *, t_WID, t_WordPlace * #endif ) = 0; /* * LQT_GetpblockWhere * Database/Retrieval, Database/Update, Database/Physical * * Look up a word in the database... * and return a list of all the WordPlaces where it's found. * The AcceptFunc is called for each place as it is read off the * disk, with the given db, the WID and the new WordPlace as arguments. * If the * AcceptFunc returns a positive value, the WordPlace is accepted; * otherwise, it is not included in the returned t_pblock. Note that * it is possible to end up with a pblock with no WordPlaces at all * if the AcceptFunc never returns a positive value. * An AcceptFunc of NULL is considered to return 1 in every case. * * a freshly malloc'd t_pblock containing all of the WordPlaces from * the disk that the AcceptFunc accepted, and with NumberOfWordPlaces * set to the number of such places. 
* * Normally you would use LQT_MakeMatches instead of this function. * This function is used internally, and also by lq-text clients that * update the database efficiently. * * Database format errors are nearly always fatal. * * LQT_MakeMatches * */ API t_pblock * LQT_GetpblockWhere(db, WordInfo, AcceptFunc) t_LQTEXT_Database *db; t_WordInfo *WordInfo; int (* AcceptFunc)( #ifdef HAVE_PROTO t_LQTEXT_Database *, t_WID, t_WordPlace * #endif ); { t_pblock *Result; int (* OldWhere)( #ifdef HAVE_PROTO t_LQTEXT_Database *, t_WID, t_WordPlace * #endif ) = CheckFunction; CheckFunction = AcceptFunc; Result = LQT_Getpblock(db, WordInfo); CheckFunction = OldWhere; return Result; } #include "blkheader.h" LIBRARY void LQT_PrintBlock(db, theWID, currentPos, dataStart, blockLength, nextOffset) t_LQTEXT_Database *db; t_WID theWID; unsigned char **currentPos; unsigned char **dataStart; int *blockLength; long *nextOffset; { register unsigned char *p; (void) fflush(stdout); (void) fflush(stderr); fprintf(stderr, "\n%s: @@@@ Block Dump @@@@\n", progname); if (nextOffset && !*nextOffset) { t_BlockHeader *BH; BH = (t_BlockHeader *) *dataStart; fprintf(stderr, "%s: pos %d <= %d, Header: next=%ld, len=%d, WID %ld", progname, *currentPos - *dataStart, *blockLength, BH->NextOffset, BH->NumberOfBlocks, theWID ); #ifdef WIDINBLOCK if (theWID == BH->WID) { fprintf(stderr, " OK"); } else { fprintf(stderr, " BAD, in-block value %ld", BH->WID); } #endif fprintf(stderr, "\n"); } for (p = (*dataStart); p < *currentPos; p++) { fprintf(stderr, "%3o ", *p); } fprintf(stderr, "\n"); (void) fflush(stderr); } /* * LQT_Getpblock * Database/Retrieval, Database/Physical * * Returns a freshly malloc'd t_pblock containing all of the WordPlaces * for a given WordInfo; one for each occurrence of that word in the * database. * * *
  • the number of words added on success; *
  • -1 if the file couldn't be opened. * * * Warns if the file can't be opened. * * LQT_GetpblockWhere * */ API t_pblock * LQT_Getpblock(db, WordInfo) t_LQTEXT_Database *db; t_WordInfo *WordInfo; { t_pblock *pblock = 0; unsigned long HowManyToGet = 0L; t_WordPlace *WordPlaces; if (!WordInfo->NumberOfWordPlaces) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_GETPLACES)) { LQT_fprintWordInfo(db, stderr, WordInfo, "LQT_Getpblock[A]"); } #endif Error(E_BUG, "LQT_Getpblock: attempt to fetch \"%s\" with no matches!", WordInfo->Word ? WordInfo->Word : "(null)" ); } HowManyToGet = WordInfo->NumberOfWordPlaces; /* a pblock already contains the first WordPlace, so we only need * to allocate HowManyToGet - 1. We allow one extra to help us detect * the case where the database is corrupt and we overshot. */ pblock = (t_pblock *) emalloc( "pblock for LQT_Getpblock", sizeof(t_pblock) + (unsigned) HowManyToGet * sizeof(t_WordPlace) ); WordPlaces = pblock->WordPlaces; pblock->WID = WordInfo->WID; pblock->ChainStart = WordInfo->Offset; pblock->NumberOfWordPlaces = WordInfo->NumberOfWordPlaces; /* First, the pairs in the WordInfo might suffice: */ if (WordInfo->WordPlacesInHere >= HowManyToGet) { unsigned long CurrentPlace; unsigned long Destination = 0; for (CurrentPlace = 0L; CurrentPlace < WordInfo->WordPlacesInHere; CurrentPlace++) { if (CheckFunction == (int (*)( #ifdef HAVE_PROTO t_LQTEXT_Database *, t_WID, t_WordPlace * #endif )) 0 || ( CheckFunction( db, pblock->WID, &(WordInfo->WordPlaces[CurrentPlace]) ) > 0) ) { WordPlaces[Destination++] = WordInfo->WordPlaces[CurrentPlace]; } } /* If they all fitted in the WordInfo block, well, that was a big win! */ if (CurrentPlace >= HowManyToGet) { /* pblock->ChainStart = 0L; */ WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces = Destination; return pblock; } } /* So we need to read the entire list of WordPlaces from the database. 
* Although we may have already done the first few, I'm going to do them * all again because that ensures that the last few bytes in the * WordInfo data block can get used! */ WordPlaces = LQT_GetWordPlaces( db, WordInfo->WID, WordInfo->WordPlaceStart, (unsigned int) (WIDBLOCKSIZE - (WordInfo->WordPlaceStart - WordInfo->DataBlock)), WordInfo->Offset, &HowManyToGet ); if (WordPlaces == (t_WordPlace *) 0) { #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_GETPLACES)) { LQT_fprintWordInfo(db, stderr, WordInfo, "LQT_Getpblock[B]"); } #endif Error(E_BUG, "no wordplaces for WID %ld, wanted %ld", WordInfo->WID, HowManyToGet ); } /* copy the result... */ (void) bcopy( (char *) WordPlaces, (char *) pblock->WordPlaces, (int) (sizeof(t_WordPlace) * HowManyToGet) ); WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces = HowManyToGet; (void) efree((char *) WordPlaces); return pblock; } /* * LQT_GetWordPlacesWhere * Database/Retrieval, Database/Physical * *

    Used to read the matches from disk for the given WID.

    *

    A WordPlace describes a single occurrence of a word.
    Hence, if you call this function with the WID of `the', you'll
    get back an array large enough to hold every occurrence of `the'
    in the entire database.  The AcceptFunc argument is a function that
    is called before each match is inserted into the array; it
    can return either zero or one.  If it returns zero, the match is
    not inserted into the array; this can save memory, and also allows
    you to process the matches as they are read from disk, instead of
    waiting for them all before doing anything with them.

    *

    The given Block argument is a pointer to an in-memory buffer holding
    the first few bytes of data; usually this comes from the `widindex'
    fixed record length file.

    * *

    This function is very low-level; normally, you should use
    LQT_MakeMatches or LQT_MakeMatchesWhere instead.

    *
    * See also: LQT_GetWordPlaces, LQT_GetpblockWhere,
    * LQT_StringToPhrase, LQT_MakeMatchesWhere
    */ API t_WordPlace * LQT_GetWordPlacesWhere( db, WID, Block, BlockLength, NextOffset, NumberExpected, AcceptFunc ) t_LQTEXT_Database *db; t_WID WID; unsigned char *Block; unsigned int BlockLength; unsigned long NextOffset; unsigned long *NumberExpected; int (* AcceptFunc)( #ifdef HAVE_PROTO t_LQTEXT_Database *, t_WID, t_WordPlace * #endif ); { t_WordPlace *Result; int (* OldWhere)( #ifdef HAVE_PROTO t_LQTEXT_Database *, t_WID, t_WordPlace * #endif ) = CheckFunction; CheckFunction = AcceptFunc; Result = LQT_GetWordPlaces( db, WID, Block, BlockLength, NextOffset, NumberExpected ); CheckFunction = OldWhere; return Result; } /* * LQT_GetWordPlaces * Database/Retrieval, Database/Physical * *

    Reads all the places for a given word into memory, and returns
    a freshly malloc'd array of t_WordPlaces.
    It is the caller's responsibility to free the resulting array.

    *

    The arguments are as for LQT_GetWordPlacesWhere.

    *
    * See also: LQT_GetWordPlacesWhere, LQT_MakeMatchesWhere
    */
/* Implementation notes for LQT_GetWordPlaces.
 *
 * Arguments:
 *   db, WID          passed through to the byte readers and to the
 *                    accept function; WID also appears in diagnostics.
 *   Block            first buffer of encoded data; must not be null.
 *   BlockLength      passed by address, together with Block and
 *   NextOffset       NextOffset, to LQTp_GetByte/LQTp_GetLong, which
 *                    fetch the encoded bytes.
 *   NumberExpected   in: how many places to decode;
 *                    out: how many were kept (it is decremented once
 *                    for every place the accept function turns down).
 *
 * Returns an array from emalloc() holding the kept places.  If the
 * accept function rejected some but not all places the array is shrunk
 * to fit.  If it rejected every place the array is returned unshrunk
 * with *NumberExpected == 0: asking the allocator for zero bytes has
 * implementation-defined results, and callers such as LQT_Getpblock
 * treat a null return as a database error rather than as "no matches".
 *
 * Encoding, as read by the code below:
 *   group  := [0] fidlong [count] place...
 *     A zero long separates runs appended at different times; it
 *     resets the running FID and flags, and the real fidlong follows.
 *     FID = (fidlong >> 1) + previous FID.  If the low bit of fidlong
 *     is set, a repeat count follows; otherwise the group has one
 *     place.  The running block number restarts at 0 for each group.
 *   place  := position [flags] [stuff]
 *     position is either one byte with the top bit clear (bit 0100
 *     advances the block by one, the next five bits are the word in
 *     block, the low bit says whether a flags byte follows), or a byte
 *     with the top bit set carrying six bits of block delta (plus,
 *     when bit 0100 is set, a long holding the higher delta bits)
 *     followed by a long whose low bit is the "flags follow" bit and
 *     whose remaining bits are the word in block.
 *     Without a flags byte the previous flags are reused.  If the
 *     flags include WPF_HASSTUFFBEFORE one more byte is read and
 *     unpacked by LQTpDisentangleFlagsAndStuff; otherwise StuffBefore
 *     is 2 when WPF_LASTHADPUNCT is set and 1 when it is not.
 *
 * Inconsistent data is reported with LQT_PrintBlock and Error().
 */
API t_WordPlace *
LQT_GetWordPlaces(db, WID, Block, BlockLength, NextOffset, NumberExpected)
    t_LQTEXT_Database *db;
    t_WID WID;
    unsigned char *Block;
    unsigned int BlockLength;
    unsigned long NextOffset;
    unsigned long *NumberExpected;
{
    register long CurrentPlace = 0;
    unsigned char *q = Block;
    unsigned long L;
    t_WordPlace *Places = (t_WordPlace *) 0;
    t_FID LastFID = (t_FID) 0;
    unsigned LastBlock = 0L;
    unsigned char LastFlags = 0;
    unsigned long OriginalCount = *NumberExpected;

#ifdef ASCIITRACE
    LQT_Trace(LQTRACE_GETPLACES,
        "LQT_GetWordPlaces WID %ld Blk 0x%lx len %d next %ld No. %ld",
        WID,
        (unsigned long) Block,
        BlockLength,
        NextOffset,
        *NumberExpected
    );
#endif

    if (Block == (unsigned char *) 0) {
        Error(E_BUG, "LQT_GetWordPlaces WID %lu, zero block", WID);
    }

    /*NOSTRICT*/
    Places = (t_WordPlace *) emalloc(
        "WordPlaces for LQT_GetWordPlaces",
        sizeof(t_WordPlace) * (*NumberExpected)
    );

    while (CurrentPlace < *NumberExpected) {
        unsigned long NumberOfRepeats;
        unsigned char Uchar;
        t_FID FID;

        /** First get the FID.  The bottom bit of the number stored
         ** actually determines whether there are multiple Places
         ** stored here for the same FID.
         **/
        L = LQTp_GetLong(db, WID, &q, &Block, &BlockLength, &NextOffset);
        if (L == 0L) {
            /* a null byte is used to separate sequences that were
             * appended on separate runs.
             */
            L = LQTp_GetLong(db, WID, &q, &Block, &BlockLength, &NextOffset);
            LastFID = 0;
            LastFlags = 0;
        }

        FID = (L >> 1) + LastFID; /* Shift to remove flag bit */
        /* a one in the last place means multiple matches for the same FID. */

        if (FID == 0) {
            LQT_PrintBlock(db, WID, &q, &Block, &BlockLength, &NextOffset);
            Error(E_BUG,
                "LQT_GetWordPlaces WID %ld, FID %ld (out of %ld <= %ld) is Zero!",
                WID,
                CurrentPlace,
                *NumberExpected,
                OriginalCount
            );
        }

        LastFID = FID;

        NumberOfRepeats = (L & 01L) ?
            LQTp_GetLong(db, WID, &q, &Block, &BlockLength, &NextOffset) : 1L;

        /* Quick Sanity check */
        /* This is probably cheap enough that we can do it all the time */
        switch (NumberOfRepeats) {
        case 0L:
            LQT_PrintBlock(db, WID, &q, &Block, &BlockLength, &NextOffset);
            Error(E_BUG,
                "LQT_GetWordPlaces WID %ld: no entries! for FID %lu",
                WID,
                FID
            );
            /* FALLTHROUGH -- as in the original, if Error() returns */
        case 1L:
            if (L & 01L) {
                /* an explicit count of one wastes space: warn only */
                LQT_PrintBlock(db, WID, &q, &Block, &BlockLength, &NextOffset);
                Error(E_WARN,
                    "%ld, FID %lu repeated 1 times!",
                    WID,
                    FID
                );
            }
            break;
        default:
            break;
        }

        /* block numbers are stored as deltas within each FID group */
        LastBlock = 0L;

        if (CurrentPlace + NumberOfRepeats > *NumberExpected) {
            LQT_PrintBlock(db, WID, &q, &Block, &BlockLength, &NextOffset);
            Error(E_BUG,
                "LQT_GetWordPlaces: FID %lu WID %ld has %lu matches != %lu/%lu",
                FID,
                WID,
                CurrentPlace + NumberOfRepeats + 1,
                *NumberExpected,
                OriginalCount
            );
        }

        for (; NumberOfRepeats != 0; --NumberOfRepeats) {
            Places[CurrentPlace].FID = FID;

#ifdef DEBUGPLACES
            Uchar = LQTp_GetByte(db, WID, &q, &Block, &BlockLength, &NextOffset);
            if (Uchar != (unsigned char) '{') {
                LQT_PrintBlock(db, WID, &q, &Block, &BlockLength, &NextOffset);
                Error(E_BUG|E_FATAL|E_ABORT,
                    "%s: %d: Expected %u, got %u instead",
                    __FILE__, __LINE__,
                    (unsigned char) '{', Uchar
                );
            }
#endif

            /* a b c d e f g h
             * a=0, bc contain delta block, defg contain WIB, h is flag bit
             * a=1, b=1 cdefgh contains the start of delta block
             * a=1, b=0 cdefgh contains delta block
             * Need to leave bottom bit of Uchar as flag bit.
             *
             * 0 B B W W W W F
             * 1 1 B B B B B B, B continues
             * 1 0 B B B B B B
             *
             * NOTE(review): for the a=0 form the code below uses only
             * bit b (0100) as a one-block increment and takes WIB from
             * the five bits cdefg, which differs from the diagram.
             */
            Uchar = LQTp_GetByte(db, WID, &q, &Block, &BlockLength, &NextOffset);

            if ((Uchar & (unsigned char) 0200) == 0) {
                if (Uchar & 0100) {
                    ++LastBlock;
                }
                Places[CurrentPlace].WordInBlock = ((Uchar & 077) >> 1);
                L = Uchar; /* for the test below */
            } else {
                if (Uchar & 0100) {
                    L = (Uchar & 0077);
                    LastBlock += L;
                    L = LQTp_GetLong(db,WID,&q,&Block,&BlockLength,&NextOffset);
                    L <<= 6;
                    LastBlock += L;
                } else {
                    L = (Uchar & 0077);
                    LastBlock += L;
                }

                /* word in block: */
                L = LQTp_GetLong(db,WID,&q,&Block,&BlockLength,&NextOffset);
                Places[CurrentPlace].WordInBlock = (L >> 1);
            }
            Places[CurrentPlace].BlockInFile = LastBlock;

#ifdef ASCIITRACE
            /* Sanity check: within one FID, places must be stored in
             * strictly increasing (block, word) order.
             */
            if (CurrentPlace > 0 &&
                Places[CurrentPlace].FID == Places[CurrentPlace - 1].FID
            ) {
                if (Places[CurrentPlace - 1].BlockInFile ==
                    Places[CurrentPlace].BlockInFile
                ) {
                    if (Places[CurrentPlace - 1].WordInBlock >=
                        Places[CurrentPlace].WordInBlock
                    ) {
                        LQT_PrintBlock(db,WID,&q,&Block,&BlockLength,&NextOffset);
                        Error(E_BUG,
                            "LQT_GetWordPlaces: WID %ld match %ld FID %ld WIB %ld >= %ld! [byte %ld]",
                            WID,
                            CurrentPlace,
                            FID,
                            (long) Places[CurrentPlace - 1].WordInBlock,
                            (long) Places[CurrentPlace].WordInBlock,
                            (long) (q - (unsigned char *) Block)
                        );
                    }
                } else if (Places[CurrentPlace - 1].BlockInFile >
                    Places[CurrentPlace].BlockInFile
                ) {
                    LQT_PrintBlock(db, WID, &q, &Block, &BlockLength, &NextOffset);
                    Error(E_BUG,
                        "LQT_GetWordPlaces: match %ld for WID %ld FID %ld BIF decreases!",
                        CurrentPlace,
                        WID,
                        FID
                    );
                }
            } /* end of sanity test */
#endif /* ASCIITRACE */

            if (L & 01) { /* use if, not ?:, for profiler */
                LastFlags = Places[CurrentPlace].Flags =
                    LQTp_GetByte(db,WID,&q, &Block, &BlockLength, &NextOffset);
            } else {
                Places[CurrentPlace].Flags = LastFlags;
            }

            /* If there are flags, there still might not be a separate
             * entry for the number of preceding skipped bytes.
             */
            if (Places[CurrentPlace].Flags & WPF_HASSTUFFBEFORE) {
                unsigned char ch =
                    LQTp_GetByte(db,WID, &q, &Block, &BlockLength, &NextOffset);

                LQTpDisentangleFlagsAndStuff(
                    &Places[CurrentPlace],
                    ch
                );
                LastFlags = Places[CurrentPlace].Flags;
            } else {
                if (Places[CurrentPlace].Flags & WPF_LASTHADPUNCT) {
                    Places[CurrentPlace].StuffBefore = 2;
                } else {
                    Places[CurrentPlace].StuffBefore = 1;
                }
            }

#ifdef DEBUGPLACES
            Uchar = LQTp_GetByte(db,WID, &q, &Block, &BlockLength, &NextOffset);
            if (Uchar != (unsigned char) '}') {
                LQT_PrintBlock(db,WID, &q, &Block, &BlockLength, &NextOffset);
                Error(E_BUG|E_FATAL|E_ABORT,
                    "%s: %d: Expected %u, got %u instead",
                    __FILE__, __LINE__,
                    (unsigned char) '}', Uchar
                );
            }
#endif

            /* Keep the place only if there is no filter or the filter
             * says yes; a rejected slot is simply reused by the next
             * place, and the caller's count is reduced by one.
             */
            if (CheckFunction == 0 ||
                CheckFunction(db, WID, &Places[CurrentPlace]) > 0
            ) {
                ++CurrentPlace;
            } else {
                --*NumberExpected;
            }
        }
    }

    /* Give back the unused tail if the filter rejected some places --
     * but never ask for zero bytes (see the notes above the function).
     */
    if (CheckFunction != 0 &&
        *NumberExpected != OriginalCount &&
        *NumberExpected != 0
    ) {
        Places = (t_WordPlace *) erealloc(
            (char *) Places,
            sizeof(t_WordPlace) * (*NumberExpected)
        );
    }

    return Places;
}