/* wpblock.c -- Copyright 1989, 1983, 1994 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* $Id: wpblock.c,v 1.18 1996/08/20 18:36:36 lee Exp $ * */ #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include /* stderr, also for fileinfo.h */ #include #include "fileinfo.h" /* for wordinfo.h */ #include "wordinfo.h" #include "wordrules.h" #include "pblock.h" #include "putbyte.h" #include "emalloc.h" #include "liblqtext.h" #include "lqtrace.h" /** Unix system calls that need to be declared: **/ /** C library functions that need to be declared: **/ /** lqtext library functions that need to be declared: **/ /** Functions within this file that need to be declared: **/ /** **/ /* * LQT_Writepblock * Database/Update, Database/Physical * * Write out an entire (presumably new) data entry, and * return a disk pointer to the start of the chain. * * the byte offset of the first block in the newly created chain * * Fatal (E_BUG) error on format or consistency check, etc. * */ API unsigned long LQT_Writepblock(db, WordInfo, pblock) t_LQTEXT_Database *db; t_WordInfo *WordInfo; t_pblock *pblock; { unsigned int BlockLength; unsigned long placesWritten = 0; if (WordInfo->DataBlock) { (void) efree((char *) WordInfo->DataBlock); WordInfo->DataBlock = (unsigned char *) 0; } #if 0 if (pblock->NumberOfWordPlaces < WIDBLOCKSIZE * 10) { WordInfo->Offset = pblock->ChainStart = 0L; LQT_MakeWordInfoBlockHeader(db, WordInfo, pblock); placesWritten = LQT_WriteWordPlaces( db,, pblock->WordPlaces, WordInfo->WID, (unsigned long) 0L, /* first block is in widindex, not in "data" */ WordInfo->DataBlock, (unsigned char *) WordInfo->WordPlaceStart, (unsigned) WIDBLOCKSIZE, 0L, 0, pblock->NumberOfWordPlaces ); } #endif if (placesWritten != pblock->NumberOfWordPlaces) { unsigned long BytesWanted = pblock->NumberOfWordPlaces * 3; /* a guess */ WordInfo->Offset = pblock->ChainStart = LQT_FindFreeBlock(db, WordInfo->WID, &BlockLength, BytesWanted); LQT_MakeWordInfoBlockHeader(db, WordInfo, pblock); placesWritten = LQT_WriteWordPlaces( db, pblock->WordPlaces, WordInfo->WID, (unsigned long) 0L, /* first block is in widindex, not in "data" */ WordInfo->DataBlock, (unsigned char *) WordInfo->WordPlaceStart, (unsigned) WIDBLOCKSIZE, WordInfo->Offset, BlockLength, pblock->NumberOfWordPlaces ); } if (placesWritten != pblock->NumberOfWordPlaces) { Error(E_BUG, "LQT_Writepblock: WID %d (%*s): wrote %lu places != %lu", WordInfo->WID, WordInfo->Length, WordInfo->Word, placesWritten, pblock->NumberOfWordPlaces ); } #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_PUTPLACES|LQTRACE_WORDINFO)) { t_pblock *p2; register long place; p2 = LQT_Getpblock(db, WordInfo); if (p2->NumberOfWordPlaces != pblock->NumberOfWordPlaces) { Error(E_FATAL|E_BUG, "%s: %d: places differ %ld vs. %ld", __FILE__, __LINE__, p2->NumberOfWordPlaces, pblock->NumberOfWordPlaces ); } for (place = 0; place < p2->NumberOfWordPlaces; place++) { #define PCMP(p1,p2,place) \ ((p2->WordPlaces[place].FID == p2->WordPlaces[place].FID) && \ (p2->WordPlaces[place].BlockInFile == p2->WordPlaces[place].BlockInFile) && \ (p2->WordPlaces[place].WordInBlock == p2->WordPlaces[place].WordInBlock) && \ (p2->WordPlaces[place].Flags == p2->WordPlaces[place].Flags) && \ (p2->WordPlaces[place].StuffBefore == p2->WordPlaces[place].StuffBefore)) if (!PCMP(pblock, p2, place)) { Error(E_BUG|E_FATAL, "%s: %d: !PCMP(%ld: %ld,%ld,%ld,0%o,%d/%ld,%ld,%ld,0%o,%d)", __FILE__, __LINE__, place, pblock->WordPlaces[place].FID, pblock->WordPlaces[place].BlockInFile, pblock->WordPlaces[place].WordInBlock, pblock->WordPlaces[place].Flags, pblock->WordPlaces[place].StuffBefore, p2->WordPlaces[place].FID, p2->WordPlaces[place].BlockInFile, p2->WordPlaces[place].WordInBlock, p2->WordPlaces[place].Flags, p2->WordPlaces[place].StuffBefore ); } } (void) efree((char *) p2); } #endif return WordInfo->Offset; } /* * WordPlaces are now stored as sequences, as follows: Per F: * FID*2 -- 1, 2, 3 (usually, for FID) or 4 bytes 1-5 * (very, very occasionaly a variable-size number may be 5 bytes long.) * . the bottom bit in the stored number determines whether there * is more than one FID to follow * Per M: * Number of following places (only if prev. bit was 1) -- 1 byte 0-1 * For each following entry:- * . for each of the following places: * Block In File (long, 1-5 bytes, usually 1) 1-5 * Word In Block -- always 1 byte 0-1 * the bottom bit of this says if there are flags * (if Block in file and word in block both fit into one byte, they * are combined, saving 1 byte; sometimes this adds a byte.) * Flags -- always 1 byte, if present 0-1 * (flags stored only if different from previous entry) * Stuff Before -- 1 byte 0-1 * (if there are no flags, there's no Stuff Before byte, and * we use the default value of 1, or 2 if punctuation was skipped) * Stuff After 0-1 * * Hence: each sub-place takes from 1 to 10 bytes; * each Place sequence takes from 3 * to (4 + 1) + 255 * (2..7) bytes. * In most (I guess > 7/10) cases, flags will be 0, and * StuffBefore will be the default of 1. * * In practice, though, we store the difference since the last block-in-file, * and the difference since the last FID, so that the numbers are usually * on the small side. * * I am hoping, of course, that the extra information stored is * worth while! * It might be possible to coalesce WordInBlock and BlockInFile using * delta modulation -- i.e., storing the increment from the previous. In * this case, a flag bit could mean that those two values each occupy a * nibble in a single byte. This is in fact the case, and is implemented. * It really is worth while keeping the format as simple as possible, * as this speeds retrieval. */ /* * LQT_WriteWordPlaces * Database/Update, Database/Physical * *

Writes the given WordPlaces to disk.

The given LastStart argument should be zero if the given * Block pointer refers to data that is not to be stored in the * overflow file (`data'). This will be the case when the first few * matches are to be written into the widindex entry. If the LastStart * argument is non-zero, it is the block number that will be passed as * an argument to LQT_WriteBlock to save the block when it is full.

The given NextOffset can either be zero or it can be the block * offset in the data overflow file of a block that has been allocated * using LQT_FindFreeBlock; in the latter case, the NextLength argument * is also passed on to LQT_WriteWordPlaces.

* * *

the number of words added on success; *

-1 if the file couldn't be opened. * * *

This routine is fairly low-level, * and is made available in the API for efficiency. * You should not attempt to use it without looking at examples in * the lq-text clients that update the database, and also reading * the source of the function itself.

* * Warns if the file can't be opened. * */ API unsigned long LQT_WriteWordPlaces( db, WordPlaces, WID, LastStart, Block, DataStart, BlockLength, NextOffset, NextSize, NumberToWrite ) t_LQTEXT_Database *db; t_WordPlace *WordPlaces; t_WID WID; unsigned long LastStart; unsigned char *Block; unsigned char *DataStart; unsigned int BlockLength; unsigned long NextOffset; unsigned long NextSize; unsigned long NumberToWrite; { unsigned char *q = DataStart; unsigned long L; int CurrentPlace = 0; t_FID LastFID = 0; unsigned long LastBlock = 0L; unsigned char LastFlags = 0; #ifdef ALWAYSSORT_NEVERDEFINED /* i.e. this code unused -- not needed */ /* Sort the pblock to simplify subsequent accesses, * and also to allow more space-efficient encoding, recording the change * (increment) since the previous FID or Block in the list, instead of * the actual number. The WriteNumber package works much better if * most numbers are small. */ if (NumberToWrite > 1) { LQT_SortWordPlaces(db, NumberToWrite, WordPlaces); } #endif /* invalidate the lastblock cache: */ { unsigned long tmp = 0L; LQT_SetLastBlockInChain(db, WID, &tmp, &Block[0], Block); } while (CurrentPlace < 0 || CurrentPlace < NumberToWrite) { unsigned long NumberOfRepeats; unsigned char U; int LastPlace; t_FID FID; FID = WordPlaces[CurrentPlace].FID; if (FID == 0) { Error(E_BUG, "LQT_WriteWordPlaces WID %ld, FID %ld is Zero!", WID, CurrentPlace ); } /* Determine the number of Places in the same file; * We use a long here. Previous versions used a single byte, * and if there were more than 255 occurrences of a word in the * same file, used multiple sequences. Multiple sequences can now * also occur through updates to the index, and the two cases need * to be distinguished, because in the update case the running * totals (LastFID) are reset to 0. */ NumberOfRepeats = 0; LastPlace = CurrentPlace; while (LastPlace < NumberToWrite) { if (WordPlaces[LastPlace].FID != FID) { break; } ++NumberOfRepeats; ++LastPlace; } L = (FID - LastFID) << 1; if (NumberOfRepeats > 1) L |= 01L; if (L == 0L) { Error(E_WARN|E_INTERNAL, "%s: %d: assertion failed: L is 0, can't tell if update", __FILE__, __LINE__ ); LastFID = 0L; L = (FID << 1); if (NumberOfRepeats > 1) L |= 01L; /* fake the beginning of an update */ if (LQTp_PutByte(db, 0, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return CurrentPlace; } } LastFID = FID; if (LQTp_PutLong(db, L, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return CurrentPlace; } if (L & 01L) { if (LQTp_PutLong(db, NumberOfRepeats, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return CurrentPlace; } } LastBlock = 0; for (; NumberOfRepeats != 0; --NumberOfRepeats) { if (CurrentPlace > NumberToWrite) { Error(E_BUG, "Word %ld: Entry for file %lu has more matches than expected", WID, FID); } #ifdef ASCIITRACE if (WordPlaces[CurrentPlace].BlockInFile < LastBlock) { Error(E_BUG, "LQT_WriteWordPlaces Sort WID %ld failed, backwards blocks", WID ); } else if (CurrentPlace && (WordPlaces[CurrentPlace].FID == WordPlaces[CurrentPlace - 1].FID) && (WordPlaces[CurrentPlace].BlockInFile == LastBlock) && (WordPlaces[CurrentPlace].WordInBlock <= WordPlaces[CurrentPlace - 1].WordInBlock)) { Error(E_BUG, "LQT_WriteWordPlaces Sort WID %ld failed, FID %ld: Blk %d: WIB %d <= %d", WID, FID, LastBlock, WordPlaces[CurrentPlace].WordInBlock, WordPlaces[CurrentPlace - 1].WordInBlock ); } #endif /* ASCIITRACE */ #ifdef DEBUGPLACES U = '{'; if (LQTp_PutByte(db, U, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return CurrentPlace; } #endif /* write the block number */ L = WordPlaces[CurrentPlace].BlockInFile - LastBlock; LastBlock += L; /* a b c d e f g h * * a=0, bc contain delta block, defg contain WIB, h is flag bit * a=1, b=1 cdefgh contains the start of delta block * a=1, b=0 cdefgh contains delta block * Need to leave bottom bit of Uchar as flag bit. * * 0 B B W W W W F * 1 1 B B B B B B, B continues * 1 0 B B B B B B */ /* Omit StuffBefore if * we didn't skip punctation, and StuffBefore is 1 * we *did* skip punctation, and StuffBefore is 2 * It looks as if the conditions can be combined... * don't be fooled! * This could most efficiently be done with ?:, but that's * a part of many C compilers that's not too reliable. */ if ((db->WordFlags & WPF_HASSTUFFBEFORE) == WPF_HASSTUFFBEFORE) { if (WordPlaces[CurrentPlace].Flags & WPF_LASTHADPUNCT) { if (WordPlaces[CurrentPlace].StuffBefore != 2) { WordPlaces[CurrentPlace].Flags |= WPF_HASSTUFFBEFORE; } } else { if (WordPlaces[CurrentPlace].StuffBefore != 1) { WordPlaces[CurrentPlace].Flags |= WPF_HASSTUFFBEFORE; } } } if (L <= 1 && WordPlaces[CurrentPlace].WordInBlock <= 31) { /* they both fit in one byte */ U = (WordPlaces[CurrentPlace].WordInBlock << 1); if (WordPlaces[CurrentPlace].Flags != LastFlags) { U |= 01; } if (L) { U |= 0100; } if (LQTp_PutByte(db, U, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return CurrentPlace; } } else { /* We can still put at least part of L in the first byte, * but WordInBlock is in the next byte now. */ if (L <= 077) { /* L fits in one byte still */ U = ( (L & 077) | 0200); if (LQTp_PutByte(db, U, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return CurrentPlace; } } else { /* multiple bytes for L */ U = ( (L & 077) | 0300); if (LQTp_PutByte(db, U, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return CurrentPlace; } /* now the rest of the Delta Block In File: */ L >>= 6; if (LQTp_PutLong(db, L, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return CurrentPlace; } } /* now the word in block */ L = (WordPlaces[CurrentPlace].WordInBlock << 1); if (WordPlaces[CurrentPlace].Flags != LastFlags) { L |= 01; } if (LQTp_PutLong(db, L, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return CurrentPlace; } } if (WordPlaces[CurrentPlace].Flags != LastFlags) { LastFlags = WordPlaces[CurrentPlace].Flags; if (LQTp_PutByte(db, LastFlags, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return CurrentPlace; } } /* Even if there are flags, there still might not be a separate * entry for the number of preceding skipped bytes. */ if (WordPlaces[CurrentPlace].Flags & WPF_HASSTUFFBEFORE) { if (LQTp_PutByte(db, LQTpCombineFlagsAndStuff(&WordPlaces[CurrentPlace]), WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize ) < 0) { return CurrentPlace; } } #ifdef DEBUGPLACES U = '}'; if (LQTp_PutByte(db, db, U, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return CurrentPlace; } #endif ++CurrentPlace; } if (CurrentPlace > LastPlace) { Error(E_BUG, "LQT_WriteWordPlaces: CurrentPlace %ld > LastPlace %ld", CurrentPlace, LastPlace); } } /* If necessary, terminate the stream. We look for a non-255 byte * as the last byte in the previous sequence, on reading, to spot the * last byte in the thingy. * Actually, the values are chosen so that this never happens in practice. */ if (q > Block && q[-1] == 0xFF && LastStart != 0L) { unsigned int U = 0; Error(E_WARN|E_INTERNAL, "NUL byte needed to terminate stream: WID %ld", WID ); if (LQTp_PutByte(db, U, WID, &q, &Block, &BlockLength, &LastStart, &NextOffset, &NextSize) < 0) { return NumberToWrite - 1; /* i.e. failure */ } } if (LastStart) { /* NextOffset had better not be non-zero, but LQT_FlushBlock will * take care of it (we have wasted a block in that case!). * LastStart is zero if we fitted it all inside the WordInfo * block. */ LQT_FlushBlock( db, Block, q - Block, &NextOffset, &LastStart, WID ); } return NumberToWrite; } /* * LQT_AddWordPlaces * Database/Update, Database/Physical * * Adds the given Word Places to the database for the given WID. * This routine is fairly low-level, * and is made available in the API for efficiency. * You should not attempt to use it without looking at examples in * the lq-text clients that update the database, and also reading * the source of the function itself.

* * The number of places written. * */ API unsigned long LQT_AddWordPlaces(db, WordPlaces, WID, Offset, NumberToWrite) t_LQTEXT_Database *db; t_WordPlace *WordPlaces; t_WID WID; unsigned long Offset; unsigned long NumberToWrite; { unsigned char *Block; unsigned char *p; unsigned int BlockLength; unsigned long WrittenPlaces; Block = LQT_LastBlockInChain(db, WID, &Offset, &p, &BlockLength); if (!Block) { /* failed for some reason, make a new entry */ Error(E_BUG|E_WARN, "LQT_AddWordPlaces: LQT_LastBlockInChain failed"); return 0L; } /* put a null byte to reset the File Counter when reading back the * word places. */ if (p - Block >= BlockLength) { Error(E_BUG|E_FATAL, "%s: %d: LQT_AddWordPlaces WID %ld lastblock %ld, p - Block is %d", __FILE__, __LINE__, WID, Offset, p - Block ); } if (*p != 0xFF) { Error(E_BUG|E_FATAL, "%s: %d: LQT_AddWordPlaces WID %ld lastblock %ld, *p is %d (0%o), not 255 (0377), p - Block is %ld", __FILE__, __LINE__, WID, Offset, *p, *p, p - Block ); } *p = '\0'; ++p; /* We need at least 4 bytes to store a wordplace, so let's * make sure it's possible... the bytes are * TotalPlaces, FID, number in file (opt), BlockInFile, WordInBlock * 1 1 1 1 * In practice we generally need more... * However, because we compress the data, we often need only 2 bytes * for each place after that. * At any rate, we'll be very conservative here... * It's a good idea not to allocate a block we don't use, partly for * efficiency and partly because it will produce a warning! */ if (WIDBLOCKSIZE >= NumberToWrite * 2) { /* First try to fit them all in the same block: */ WrittenPlaces = LQT_WriteWordPlaces( db, WordPlaces, WID, Offset, /* block to flush */ Block, p, BlockLength, 0L, /* no next offset */ 0L, /* no next blocklength */ NumberToWrite ); } else { WrittenPlaces = 0L; } if (WrittenPlaces != NumberToWrite) { unsigned int NextLength; unsigned long NextBlock = LQT_FindFreeBlock( db, WID, &NextLength, (unsigned long) NumberToWrite * 5 /* a guess */ ); /* We needed to use extra blocks, they didn't all fit in * the one that was already partly in use. */ if (p[-1] != 0) { /* TODO understand why this is a zero and not a 255 */ /* I think it's because we append a nul byte to the end * of every sequence but I am not sure. */ Error(E_BUG|E_FATAL, "%s: %d: LQT_AddWordPlaces WID %ld lastblock %ld, p[-1] is %d (0%0), not 255 (0377)", __FILE__, __LINE__, WID, Offset, p[-1], p[-1] ); } WrittenPlaces = LQT_WriteWordPlaces( db, WordPlaces, WID, Offset, Block, p, BlockLength, NextBlock, NextLength, NumberToWrite ); if (WrittenPlaces != NumberToWrite) { Error(E_BUG, "%S; %d: didn't write places for %s, wrote %ld, not %ld", __FILE__, __LINE__, WrittenPlaces, NumberToWrite ); } } /* Now we need to update the index entry */ LQT_UpdateWIDMatchCount(db, WID, NumberToWrite); return NumberToWrite; }