/* wpblock.c -- Copyright 1989, 1983, 1994 Liam R. E. Quin.
* All Rights Reserved.
* This code is NOT in the public domain.
* See the file COPYRIGHT for full details.
*/
/* $Id: wpblock.c,v 1.18 1996/08/20 18:36:36 lee Exp $
*
*/
#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"
#include /* stderr, also for fileinfo.h */
#include
#include "fileinfo.h" /* for wordinfo.h */
#include "wordinfo.h"
#include "wordrules.h"
#include "pblock.h"
#include "putbyte.h"
#include "emalloc.h"
#include "liblqtext.h"
#include "lqtrace.h"
/** Unix system calls that need to be declared: **/
/** C library functions that need to be declared: **/
/** lqtext library functions that need to be declared: **/
/** Functions within this file that need to be declared: **/
/** **/
/*
* LQT_Writepblock
* Database/Update, Database/Physical
*
* Write out an entire (presumably new) data entry, and
* return a disk pointer to the start of the chain.
*
* the byte offset of the first block in the newly created chain
*
* Fatal (E_BUG) error on format or consistency check, etc.
*
*/
API unsigned long
LQT_Writepblock(db, WordInfo, pblock)
t_LQTEXT_Database *db;
t_WordInfo *WordInfo;
t_pblock *pblock;
{
unsigned int BlockLength;
unsigned long placesWritten = 0;
if (WordInfo->DataBlock) {
(void) efree((char *) WordInfo->DataBlock);
WordInfo->DataBlock = (unsigned char *) 0;
}
#if 0
if (pblock->NumberOfWordPlaces < WIDBLOCKSIZE * 10) {
WordInfo->Offset = pblock->ChainStart = 0L;
LQT_MakeWordInfoBlockHeader(db, WordInfo, pblock);
placesWritten = LQT_WriteWordPlaces(
db,,
pblock->WordPlaces,
WordInfo->WID,
(unsigned long) 0L, /* first block is in widindex, not in "data" */
WordInfo->DataBlock,
(unsigned char *) WordInfo->WordPlaceStart,
(unsigned) WIDBLOCKSIZE,
0L,
0,
pblock->NumberOfWordPlaces
);
}
#endif
if (placesWritten != pblock->NumberOfWordPlaces) {
unsigned long BytesWanted =
pblock->NumberOfWordPlaces * 3; /* a guess */
WordInfo->Offset = pblock->ChainStart =
LQT_FindFreeBlock(db, WordInfo->WID, &BlockLength, BytesWanted);
LQT_MakeWordInfoBlockHeader(db, WordInfo, pblock);
placesWritten = LQT_WriteWordPlaces(
db,
pblock->WordPlaces,
WordInfo->WID,
(unsigned long) 0L, /* first block is in widindex, not in "data" */
WordInfo->DataBlock,
(unsigned char *) WordInfo->WordPlaceStart,
(unsigned) WIDBLOCKSIZE,
WordInfo->Offset,
BlockLength,
pblock->NumberOfWordPlaces
);
}
if (placesWritten != pblock->NumberOfWordPlaces) {
Error(E_BUG,
"LQT_Writepblock: WID %d (%*s): wrote %lu places != %lu",
WordInfo->WID,
WordInfo->Length,
WordInfo->Word,
placesWritten,
pblock->NumberOfWordPlaces
);
}
#ifdef ASCIITRACE
if (LQT_TraceFlagsSet(LQTRACE_PUTPLACES|LQTRACE_WORDINFO)) {
t_pblock *p2;
register long place;
p2 = LQT_Getpblock(db, WordInfo);
if (p2->NumberOfWordPlaces != pblock->NumberOfWordPlaces) {
Error(E_FATAL|E_BUG,
"%s: %d: places differ %ld vs. %ld",
__FILE__, __LINE__,
p2->NumberOfWordPlaces,
pblock->NumberOfWordPlaces
);
}
for (place = 0; place < p2->NumberOfWordPlaces; place++) {
#define PCMP(p1,p2,place) \
((p2->WordPlaces[place].FID == p2->WordPlaces[place].FID) && \
(p2->WordPlaces[place].BlockInFile == p2->WordPlaces[place].BlockInFile) && \
(p2->WordPlaces[place].WordInBlock == p2->WordPlaces[place].WordInBlock) && \
(p2->WordPlaces[place].Flags == p2->WordPlaces[place].Flags) && \
(p2->WordPlaces[place].StuffBefore == p2->WordPlaces[place].StuffBefore))
if (!PCMP(pblock, p2, place)) {
Error(E_BUG|E_FATAL,
"%s: %d: !PCMP(%ld: %ld,%ld,%ld,0%o,%d/%ld,%ld,%ld,0%o,%d)",
__FILE__, __LINE__,
place,
pblock->WordPlaces[place].FID,
pblock->WordPlaces[place].BlockInFile,
pblock->WordPlaces[place].WordInBlock,
pblock->WordPlaces[place].Flags,
pblock->WordPlaces[place].StuffBefore,
p2->WordPlaces[place].FID,
p2->WordPlaces[place].BlockInFile,
p2->WordPlaces[place].WordInBlock,
p2->WordPlaces[place].Flags,
p2->WordPlaces[place].StuffBefore
);
}
}
(void) efree((char *) p2);
}
#endif
return WordInfo->Offset;
}
/*
* WordPlaces are now stored as sequences, as follows: Per F:
* FID*2 -- 1, 2, 3 (usually, for FID) or 4 bytes 1-5
* (very, very occasionaly a variable-size number may be 5 bytes long.)
* . the bottom bit in the stored number determines whether there
* is more than one FID to follow
* Per M:
* Number of following places (only if prev. bit was 1) -- 1 byte 0-1
* For each following entry:-
* . for each of the following places:
* Block In File (long, 1-5 bytes, usually 1) 1-5
* Word In Block -- always 1 byte 0-1
* the bottom bit of this says if there are flags
* (if Block in file and word in block both fit into one byte, they
* are combined, saving 1 byte; sometimes this adds a byte.)
* Flags -- always 1 byte, if present 0-1
* (flags stored only if different from previous entry)
* Stuff Before -- 1 byte 0-1
* (if there are no flags, there's no Stuff Before byte, and
* we use the default value of 1, or 2 if punctuation was skipped)
* Stuff After 0-1
*
* Hence: each sub-place takes from 1 to 10 bytes;
* each Place sequence takes from 3
* to (4 + 1) + 255 * (2..7) bytes.
* In most (I guess > 7/10) cases, flags will be 0, and
* StuffBefore will be the default of 1.
*
* In practice, though, we store the difference since the last block-in-file,
* and the difference since the last FID, so that the numbers are usually
* on the small side.
*
* I am hoping, of course, that the extra information stored is
* worth while!
* It might be possible to coalesce WordInBlock and BlockInFile using
* delta modulation -- i.e., storing the increment from the previous. In
* this case, a flag bit could mean that those two values each occupy a
* nibble in a single byte. This is in fact the case, and is implemented.
* It really is worth while keeping the format as simple as possible,
* as this speeds retrieval.
*/
/*
* LQT_WriteWordPlaces
* Database/Update, Database/Physical
*
* Writes the given WordPlaces to disk.
* The given LastStart argument should be zero if the given
* Block pointer refers to data that is not to be stored in the
* overflow file (`data'). This will be the case when the first few
* matches are to be written into the widindex entry. If the LastStart
* argument is non-zero, it is the block number that will be passed as
* an argument to LQT_WriteBlock to save the block when it is full.
* The given NextOffset can either be zero or it can be the block
* offset in the data overflow file of a block that has been allocated
* using LQT_FindFreeBlock; in the latter case, the NextLength argument
* is also passed on to LQT_WriteWordPlaces.
*
*
* the number of words added on success;
* -1 if the file couldn't be opened.
*
*
* This routine is fairly low-level,
* and is made available in the API for efficiency.
* You should not attempt to use it without looking at examples in
* the lq-text clients that update the database, and also reading
* the source of the function itself.
*
* Warns if the file can't be opened.
*
*/
API unsigned long
LQT_WriteWordPlaces(
db,
WordPlaces,
WID,
LastStart,
Block, DataStart, BlockLength,
NextOffset, NextSize,
NumberToWrite
)
t_LQTEXT_Database *db;
t_WordPlace *WordPlaces;
t_WID WID;
unsigned long LastStart;
unsigned char *Block;
unsigned char *DataStart;
unsigned int BlockLength;
unsigned long NextOffset;
unsigned long NextSize;
unsigned long NumberToWrite;
{
unsigned char *q = DataStart;
unsigned long L;
int CurrentPlace = 0;
t_FID LastFID = 0;
unsigned long LastBlock = 0L;
unsigned char LastFlags = 0;
#ifdef ALWAYSSORT_NEVERDEFINED /* i.e. this code unused -- not needed */
/* Sort the pblock to simplify subsequent accesses,
* and also to allow more space-efficient encoding, recording the change
* (increment) since the previous FID or Block in the list, instead of
* the actual number. The WriteNumber package works much better if
* most numbers are small.
*/
if (NumberToWrite > 1) {
LQT_SortWordPlaces(db, NumberToWrite, WordPlaces);
}
#endif
/* invalidate the lastblock cache: */
{
unsigned long tmp = 0L;
LQT_SetLastBlockInChain(db, WID, &tmp, &Block[0], Block);
}
while (CurrentPlace < 0 || CurrentPlace < NumberToWrite) {
unsigned long NumberOfRepeats;
unsigned char U;
int LastPlace;
t_FID FID;
FID = WordPlaces[CurrentPlace].FID;
if (FID == 0) {
Error(E_BUG, "LQT_WriteWordPlaces WID %ld, FID %ld is Zero!",
WID, CurrentPlace
);
}
/* Determine the number of Places in the same file;
* We use a long here. Previous versions used a single byte,
* and if there were more than 255 occurrences of a word in the
* same file, used multiple sequences. Multiple sequences can now
* also occur through updates to the index, and the two cases need
* to be distinguished, because in the update case the running
* totals (LastFID) are reset to 0.
*/
NumberOfRepeats = 0;
LastPlace = CurrentPlace;
while (LastPlace < NumberToWrite) {
if (WordPlaces[LastPlace].FID != FID) {
break;
}
++NumberOfRepeats;
++LastPlace;
}
L = (FID - LastFID) << 1;
if (NumberOfRepeats > 1) L |= 01L;
if (L == 0L) {
Error(E_WARN|E_INTERNAL,
"%s: %d: assertion failed: L is 0, can't tell if update",
__FILE__, __LINE__
);
LastFID = 0L;
L = (FID << 1);
if (NumberOfRepeats > 1) L |= 01L;
/* fake the beginning of an update */
if (LQTp_PutByte(db, 0, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return CurrentPlace;
}
}
LastFID = FID;
if (LQTp_PutLong(db, L, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return CurrentPlace;
}
if (L & 01L) {
if (LQTp_PutLong(db, NumberOfRepeats, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return CurrentPlace;
}
}
LastBlock = 0;
for (; NumberOfRepeats != 0; --NumberOfRepeats) {
if (CurrentPlace > NumberToWrite) {
Error(E_BUG,
"Word %ld: Entry for file %lu has more matches than expected",
WID, FID);
}
#ifdef ASCIITRACE
if (WordPlaces[CurrentPlace].BlockInFile < LastBlock) {
Error(E_BUG,
"LQT_WriteWordPlaces Sort WID %ld failed, backwards blocks",
WID
);
} else if (CurrentPlace &&
(WordPlaces[CurrentPlace].FID ==
WordPlaces[CurrentPlace - 1].FID) &&
(WordPlaces[CurrentPlace].BlockInFile == LastBlock) &&
(WordPlaces[CurrentPlace].WordInBlock <=
WordPlaces[CurrentPlace - 1].WordInBlock)) {
Error(E_BUG,
"LQT_WriteWordPlaces Sort WID %ld failed, FID %ld: Blk %d: WIB %d <= %d",
WID, FID, LastBlock, WordPlaces[CurrentPlace].WordInBlock,
WordPlaces[CurrentPlace - 1].WordInBlock
);
}
#endif /* ASCIITRACE */
#ifdef DEBUGPLACES
U = '{';
if (LQTp_PutByte(db, U, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return CurrentPlace;
}
#endif
/* write the block number */
L = WordPlaces[CurrentPlace].BlockInFile - LastBlock;
LastBlock += L;
/* a b c d e f g h
*
* a=0, bc contain delta block, defg contain WIB, h is flag bit
* a=1, b=1 cdefgh contains the start of delta block
* a=1, b=0 cdefgh contains delta block
* Need to leave bottom bit of Uchar as flag bit.
*
* 0 B B W W W W F
* 1 1 B B B B B B, B continues
* 1 0 B B B B B B
*/
/* Omit StuffBefore if
* we didn't skip punctation, and StuffBefore is 1
* we *did* skip punctation, and StuffBefore is 2
* It looks as if the conditions can be combined...
* don't be fooled!
* This could most efficiently be done with ?:, but that's
* a part of many C compilers that's not too reliable.
*/
if ((db->WordFlags & WPF_HASSTUFFBEFORE) == WPF_HASSTUFFBEFORE) {
if (WordPlaces[CurrentPlace].Flags & WPF_LASTHADPUNCT) {
if (WordPlaces[CurrentPlace].StuffBefore != 2) {
WordPlaces[CurrentPlace].Flags |= WPF_HASSTUFFBEFORE;
}
} else {
if (WordPlaces[CurrentPlace].StuffBefore != 1) {
WordPlaces[CurrentPlace].Flags |= WPF_HASSTUFFBEFORE;
}
}
}
if (L <= 1 && WordPlaces[CurrentPlace].WordInBlock <= 31) {
/* they both fit in one byte */
U = (WordPlaces[CurrentPlace].WordInBlock << 1);
if (WordPlaces[CurrentPlace].Flags != LastFlags) {
U |= 01;
}
if (L) {
U |= 0100;
}
if (LQTp_PutByte(db, U, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return CurrentPlace;
}
} else {
/* We can still put at least part of L in the first byte,
* but WordInBlock is in the next byte now.
*/
if (L <= 077) { /* L fits in one byte still */
U = ( (L & 077) | 0200);
if (LQTp_PutByte(db, U, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return CurrentPlace;
}
} else { /* multiple bytes for L */
U = ( (L & 077) | 0300);
if (LQTp_PutByte(db, U, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return CurrentPlace;
}
/* now the rest of the Delta Block In File: */
L >>= 6;
if (LQTp_PutLong(db, L, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return CurrentPlace;
}
}
/* now the word in block */
L = (WordPlaces[CurrentPlace].WordInBlock << 1);
if (WordPlaces[CurrentPlace].Flags != LastFlags) {
L |= 01;
}
if (LQTp_PutLong(db, L, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return CurrentPlace;
}
}
if (WordPlaces[CurrentPlace].Flags != LastFlags) {
LastFlags = WordPlaces[CurrentPlace].Flags;
if (LQTp_PutByte(db, LastFlags, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return CurrentPlace;
}
}
/* Even if there are flags, there still might not be a separate
* entry for the number of preceding skipped bytes.
*/
if (WordPlaces[CurrentPlace].Flags & WPF_HASSTUFFBEFORE) {
if (LQTp_PutByte(db,
LQTpCombineFlagsAndStuff(&WordPlaces[CurrentPlace]),
WID,
&q,
&Block,
&BlockLength, &LastStart,
&NextOffset, &NextSize
) < 0) {
return CurrentPlace;
}
}
#ifdef DEBUGPLACES
U = '}';
if (LQTp_PutByte(db, db, U, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return CurrentPlace;
}
#endif
++CurrentPlace;
}
if (CurrentPlace > LastPlace) {
Error(E_BUG, "LQT_WriteWordPlaces: CurrentPlace %ld > LastPlace %ld",
CurrentPlace, LastPlace);
}
}
/* If necessary, terminate the stream. We look for a non-255 byte
* as the last byte in the previous sequence, on reading, to spot the
* last byte in the thingy.
* Actually, the values are chosen so that this never happens in practice.
*/
if (q > Block && q[-1] == 0xFF && LastStart != 0L) {
unsigned int U = 0;
Error(E_WARN|E_INTERNAL,
"NUL byte needed to terminate stream: WID %ld",
WID
);
if (LQTp_PutByte(db, U, WID, &q, &Block, &BlockLength,
&LastStart, &NextOffset, &NextSize) < 0) {
return NumberToWrite - 1; /* i.e. failure */
}
}
if (LastStart) {
/* NextOffset had better not be non-zero, but LQT_FlushBlock will
* take care of it (we have wasted a block in that case!).
* LastStart is zero if we fitted it all inside the WordInfo
* block.
*/
LQT_FlushBlock(
db,
Block,
q - Block,
&NextOffset,
&LastStart,
WID
);
}
return NumberToWrite;
}
/*
* LQT_AddWordPlaces
* Database/Update, Database/Physical
*
* Adds the given Word Places to the database for the given WID.
* This routine is fairly low-level,
* and is made available in the API for efficiency.
* You should not attempt to use it without looking at examples in
* the lq-text clients that update the database, and also reading
* the source of the function itself.
*
* The number of places written.
*
*/
API unsigned long
LQT_AddWordPlaces(db, WordPlaces, WID, Offset, NumberToWrite)
t_LQTEXT_Database *db;
t_WordPlace *WordPlaces;
t_WID WID;
unsigned long Offset;
unsigned long NumberToWrite;
{
unsigned char *Block;
unsigned char *p;
unsigned int BlockLength;
unsigned long WrittenPlaces;
Block = LQT_LastBlockInChain(db, WID, &Offset, &p, &BlockLength);
if (!Block) {
/* failed for some reason, make a new entry */
Error(E_BUG|E_WARN, "LQT_AddWordPlaces: LQT_LastBlockInChain failed");
return 0L;
}
/* put a null byte to reset the File Counter when reading back the
* word places.
*/
if (p - Block >= BlockLength) {
Error(E_BUG|E_FATAL,
"%s: %d: LQT_AddWordPlaces WID %ld lastblock %ld, p - Block is %d",
__FILE__, __LINE__, WID, Offset, p - Block
);
}
if (*p != 0xFF) {
Error(E_BUG|E_FATAL,
"%s: %d: LQT_AddWordPlaces WID %ld lastblock %ld, *p is %d (0%o), not 255 (0377), p - Block is %ld",
__FILE__, __LINE__, WID, Offset, *p, *p, p - Block
);
}
*p = '\0';
++p;
/* We need at least 4 bytes to store a wordplace, so let's
* make sure it's possible... the bytes are
* TotalPlaces, FID, number in file (opt), BlockInFile, WordInBlock
* 1 1 1 1
* In practice we generally need more...
* However, because we compress the data, we often need only 2 bytes
* for each place after that.
* At any rate, we'll be very conservative here...
* It's a good idea not to allocate a block we don't use, partly for
* efficiency and partly because it will produce a warning!
*/
if (WIDBLOCKSIZE >= NumberToWrite * 2) {
/* First try to fit them all in the same block: */
WrittenPlaces = LQT_WriteWordPlaces(
db,
WordPlaces,
WID,
Offset, /* block to flush */
Block,
p,
BlockLength,
0L, /* no next offset */
0L, /* no next blocklength */
NumberToWrite
);
} else {
WrittenPlaces = 0L;
}
if (WrittenPlaces != NumberToWrite) {
unsigned int NextLength;
unsigned long NextBlock = LQT_FindFreeBlock(
db,
WID,
&NextLength,
(unsigned long) NumberToWrite * 5 /* a guess */
);
/* We needed to use extra blocks, they didn't all fit in
* the one that was already partly in use.
*/
if (p[-1] != 0) {
/* TODO understand why this is a zero and not a 255 */
/* I think it's because we append a nul byte to the end
* of every sequence but I am not sure.
*/
Error(E_BUG|E_FATAL,
"%s: %d: LQT_AddWordPlaces WID %ld lastblock %ld, p[-1] is %d (0%0), not 255 (0377)",
__FILE__, __LINE__, WID, Offset, p[-1], p[-1]
);
}
WrittenPlaces = LQT_WriteWordPlaces(
db,
WordPlaces,
WID,
Offset,
Block,
p,
BlockLength,
NextBlock,
NextLength,
NumberToWrite
);
if (WrittenPlaces != NumberToWrite) {
Error(E_BUG,
"%S; %d: didn't write places for %s, wrote %ld, not %ld",
__FILE__, __LINE__, WrittenPlaces, NumberToWrite
);
}
}
/* Now we need to update the index entry */
LQT_UpdateWIDMatchCount(db, WID, NumberToWrite);
return NumberToWrite;
}