/* sortwids.c -- Copyright 1996 Liam R. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. * * sortwids -- allocate wids in sorted order * * $Id: sortwids.c,v 1.6 2001/05/31 03:50:13 liam Exp $ */ /* Porting this should be straight forward, but as written it * requires the ability to mmap() a file into memory. * Writing it to use read() and malloc() would not be a good * idea, because you'd run out of memory on some systems -- * it's sorting the widindex file, which is often quite large. * As a result, if you don't have mmap(), you'll need a cache. */ #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #ifndef ENOENT # include #endif #include #include #include #include #undef PRIVATE #include #define PRIVATE static #ifdef HAVE_FCNTL_H # ifdef HAVE_SYSV_FCNTL_H # include # endif # include #endif #ifdef HAVE_UNISTD_H # include /* for SEEK_SET */ #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #ifdef HAVE_STRING_H # include #else # include #endif #include "fileinfo.h" #include "wordinfo.h" #include "smalldb.h" #include "pblock.h" #include "wordrules.h" #include "numbers.h" #include "emalloc.h" #include "lqutil.h" #include "liblqtext.h" #include "lqtrace.h" /*** Declarations: ***/ /** System calls and library routines: **/ /** Unix Library Functions: **/ char *progname = 0; /* Used for error messages */ static char *Revision = "$Revision: 1.6 $"; /** end of declarations... **/ PRIVATE int PutWordIntoIndex( #ifdef HAVE_PROTO t_LQTEXT_Database *, t_WordInfo * #endif ); PRIVATE int DecodeOneWidBlock( #ifdef HAVE_PROTO char *Buffer, unsigned char *Lengthp, char **Wordp, unsigned long *Countp #endif ); PRIVATE DBM *thedb = 0; PRIVATE void MakeNewWordMap(db) t_LQTEXT_Database *db; { long WordsSeen = 0; FILE *IndexFile; char Entry[WIDBLOCKSIZE]; char Word[1024]; t_WID LowestZeroCountWID = 0; LQT_ObtainWriteAccess(db); /* Remove the last block file; this file contains a pointer * to the end of each chain in "data" for each WID; it's generated * automatically by lqaddfile, and used as an optmisation. * But we've just sorted the WID table, so we no longer know * which WID each entry in the last block file belongs to. * The simplest thing would be to remove it. * So we do. */ if (unlink(db->LastBlockFile) < 0) { extern int errno; if (errno != ENOENT) { Error(E_SYS|E_FATAL, "Couldn't unlink \"%s\" (last blocks file)" ); } } IndexFile = LQU_fEopen(E_FATAL, db->WidIndexFile, "fixed size record word index", "r" ); /* for each word in the index */ while (fread(Entry, sizeof Entry, 1, IndexFile) > 0) { t_WordInfo WordInfo; if (!WordsSeen) { WordsSeen++; continue; /* block 0 unused */ } if (DecodeOneWidBlock( Entry, &(WordInfo.Length), &(WordInfo.Word), &(WordInfo.NumberOfWordPlaces) ) > 0) { if (!LowestZeroCountWID) { LowestZeroCountWID = WordsSeen; } continue; } if (WordInfo.NumberOfWordPlaces == 0) { /* The word has been deleted, so it has zero places now. * We should probably relclaim the space. */ if (!LowestZeroCountWID) { LowestZeroCountWID = WordsSeen; } continue; } /* We've found a valid WID... */ LowestZeroCountWID = 0; /* make a WordInfo structure */ WordInfo.Length = (int) Entry[0]; WordInfo.Word = Word; WordInfo.WID = WordsSeen; strncpy(Word, &Entry[1], WordInfo.Length); /* [1] coz of length byte */ Word[WordInfo.Length] = '\0'; /* put the word into the index */ if (PutWordIntoIndex(db, &WordInfo) < 0) { Error(E_BUG, "failed to put \"%s\" into index\n", Word); } WordsSeen++; } if (LowestZeroCountWID != 0) { /* the highest actual WID is one less than this. * NOTE: if all the words are deleted, there's nothing left! * that seems a little odd to me -- I didn't think of that when * I designed lq-text really. I hope it's OK. * Really, if you're deleting everything, you should rebuild * the index from scratch! * Oh well. Lets' try to reset the maximum WID. */ --LowestZeroCountWID; LQT_ResetMaximumWID(db, LowestZeroCountWID); } } /* returns 0 on success, * > 0 if wordlength is 0 (all other returns are meaningless then) * * length of word -> Lengthp * *Wordp set to point to start of word (NOT nul-terminated!) * *Countp set to the number of occurrences, total. */ PRIVATE int DecodeOneWidBlock(Buffer, Lengthp, Wordp, Countp) char *Buffer; unsigned char *Lengthp; char **Wordp; unsigned long *Countp; { char *q = Buffer; unsigned long Offset; long L; /* word length */ (void) LQT_sReadNumber(&q, &L, Buffer, WIDBLOCKSIZE); if (L == 0) { /* sort empty blocks to the end */ return 1000; } *Lengthp = (unsigned char) L; *Wordp = q; q += (*Lengthp); /* offset of matches */ LQT_sReadNumber( (unsigned char **) &q, &Offset, (unsigned char *) Buffer, WIDBLOCKSIZE ); /* number of matches */ /* q[0] is the least significant byte. What happened to PUT4/GET4? */ if (Offset == 0L) { LQT_sReadNumber( (unsigned char **) &q, Countp, (unsigned char *) Buffer, WIDBLOCKSIZE ); } else { L = (unsigned int) (q[3] & 255); L <<= 8; L |= (unsigned int) (q[2] & 255); L <<= 8; L |= (unsigned int) (q[1] & 255); L <<= 8; L |= (unsigned int) (q[0] & 255); *Countp = L; } return 0; } PRIVATE int CompareWIDBlocks(pp1, pp2) char *pp1, *pp2; { unsigned char l1, l2; char *s1, *s2; /* Compare two words. * Return < 0 if pp1 is smaller or shorter (i.e. a prefix of pp2), * 0 if they are the same * > 0 if pp1 is greater or longer * * Although we compare the strings, there are two complications: * (1) some blocks are unused, and the wordlength is then set * to zero. We want to sort those to the very end. * (2) some blocks refer to words that used to occur, but now * have zero occurrences. We also sort these to the very end. * * Since all the words are supposed to be distinct, * a zero return should only happen if the blocks both point * to words that don't actually occur in the database, or both * have zero length (i.e. unused blocks). * * In practice, DecodeOneWidBlock() returns non-zero if * the block has a zero wordlength, and we return straight away. * That means that this is not a stable sort comparison: * two blocks with zero length randomly get assigned to be * one greater than the other depending on the order in which * they're passed (pp1 will always sort after pp2). * This doesn't matter, as we don't care about the order of those * blocks, although qsort() may make some extra comparisons as * a result. * * After we've sorted everything, we reclaim some of the space. */ { unsigned long Count1, Count2; if (DecodeOneWidBlock(pp1, &l1, &s1, &Count1) > 0) { /* move block 1 to the end, it's faulty */ return 1000; } if (DecodeOneWidBlock(pp2, &l2, &s2, &Count2) > 0) { /* sort block 2 to the end */ return -1000; } if (Count1 == 0) { if (Count2 == 0) { /* treat them as equal */ return 0; } else { return 100000; } } if (Count2 == 0) { return -100000; } } /* Now we have two words with non-zero length, and they do * both have matches in the database. */ { register int i, len; len = (l1 < l2) ? l1 : l2; i = strncmp(s1, s2, len); if (i == 0) { if (l1 == l2) { Error(E_WARN|E_INTERNAL, "Duplicated word in index: %*.*s", l1, l1, s1 ); return 0; } return (l1 < l2) ? -1 : 1; } else { return i; } } } int main(argc, argv) int argc; char *argv[]; { /* stuff for getopt(3): */ extern int getopt(); /* extern int optind */ /* extern char *optarg; */ int ch; /* For getopt(3) */ int ErrorFlag = 0; /* For getopt(3) */ t_LQTEXT_Database *db; /* The database we'll read from */ t_lqdbOptions *Options; /* The user's configuration & preferences */ int Widfd; caddr_t theData; struct stat statbuf; progname = argv[0]; /* I see this as a library program, so I am leaving the full * path. lqaddfile(1L) and lqphrase(1L) set progname to be * the filename of the command, rather than the full pathname. */ Options = LQT_InitFromArgv(argc, argv); /* Deal with any arguments that are understood by all lqtext * programs. */ while ((ch = getopt(argc, argv, "VvxZz:")) != EOF) { switch (ch) { case 'V': fprintf(stderr, "%s version %s\n", progname, Revision); break; case 'x': ErrorFlag++; break; case '?': ErrorFlag++; break; case 'z': case 'Z': break; /* done by LQT_InitFromArgv(); */ } } if (ErrorFlag) { fprintf(stderr, "%s: sort words by WID\n", progname); LQT_PrintDefaultUsage(Options); /* LQT_PrintDefaultUsage() prints the list of the standard options. */ exit(1); } db = LQT_OpenDatabase(Options, O_RDONLY, 0644); /* We are going to lie to the database and change its files without * telling it. In particular, we're going to write all over the * word table, and then replace the dbm file that maps words to WIDS. * So AVOID ALL LQT_ functions....! BE CAREFUL...! * We'll basically just use the db for filenames. * This program should go away. */ Widfd = LQU_Eopen( E_FATAL|E_SYS, db->WidIndexFile, "WID (Word Identifier) Index file", O_RDWR, 0 /* we are not creating it */ ); /* map the file into memory */ { if (fstat(Widfd, &statbuf) < 0) { Error(E_SYS|E_FATAL, "Couldn't fstat %s", db->WidIndexFile ); } #ifndef MAP_FILE # define MAP_FILE 0 #endif theData = mmap( 0, statbuf.st_size, PROT_READ|PROT_WRITE, MAP_FILE|MAP_SHARED, Widfd, 0 ); if (theData == (caddr_t) -1) { Error(E_FATAL|E_SYS, "%s: mmap failed", db->WidIndexFile ); } } { unsigned long nblocks = (statbuf.st_size / WIDBLOCKSIZE); /* try to get rid of trailing empty blocks that * were put there by the lqtext cache for efficiency */ register char *p; for (p = &theData[(nblocks - 1) * WIDBLOCKSIZE]; p >theData; p -= WIDBLOCKSIZE ) { if (*p == 0) { --nblocks; } else { break; } } /* sort the WID file */ qsort( &theData[WIDBLOCKSIZE], /* first block is always empty */ nblocks - 1, WIDBLOCKSIZE, CompareWIDBlocks ); } /* write out the results by unmapping it */ if (munmap(theData, statbuf.st_size) < 0) { Error(E_FATAL|E_SYS, "problem unmapping %lu bytes of memory for wid file", statbuf.st_size ); } (void) close(Widfd); /* make a new wid->word map */ /* remove the old word index, if there was one */ /* note: this is tricky, because we don't know the suffix... */ { char tmp[8192]; (void) sprintf(tmp, "cd \"%s\" && /bin/rm -f %s*", db->DatabaseDirectory, db->WordIndex ); (void) system(tmp); } MakeNewWordMap(db); /* close the new wid->word map */ LQT_CloseKeyValueDatabase(thedb); LQT_SyncAndCloseAllKeyValueDatabases(db); LQT_CloseDatabase(db); exit(0); /*NOTREACHED*/ return(0); /* for lint and gcc -Wall*/ } PRIVATE unsigned char * GetEntry(Name, f, n) char *Name; int f; /* Unix file descriptor */ t_WID n; { static unsigned char Buf[WIDBLOCKSIZE * 1024]; static unsigned long StartPos = 0L; static unsigned long EndPos = 0L; unsigned long Where = n * WIDBLOCKSIZE; if (Where + WIDBLOCKSIZE > EndPos + 1 || Where < StartPos) { int i; (void) LQU_Elseek(E_FATAL, Name, "Word Index File", f, Where, SEEK_SET); StartPos = Where; if ((i = read(f, (char *) Buf, sizeof Buf)) < 0) { Error(E_FATAL|E_SYS, "WIDINDEX read(fd=%d, buf, n=%d) failed", f, sizeof Buf ); } EndPos = Where + i; } return &Buf[Where - StartPos]; } PRIVATE int PutWordIntoIndex(db, WordInfo) t_LQTEXT_Database *db; t_WordInfo *WordInfo; { unsigned char NumBuf[sizeof(t_WID) + 1]; unsigned char *q = NumBuf; datum key, data; /** First, write the WID itself, so we can go from Word to WID */ key.dptr = WordInfo->Word; key.dsize = WordInfo->Length; (void) LQT_sWriteNumber(&q, WordInfo->WID, NumBuf, sizeof NumBuf); data.dptr = (char *) NumBuf; data.dsize = q - NumBuf; /* contact database server */ if (!thedb) { thedb = LQT_OpenKeyValueDatabase(db, db->WordIndex); if (thedb == (DBM *) 0) { Error(E_FATAL|E_SYS, "Couldn't create new index \"%s\"", db->WordIndex ); } } return dbm_store(thedb, key, data, DBM_REPLACE); }