/* sortwids.c -- Copyright 1996 Liam R. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 *
 * sortwids -- allocate wids in sorted order
 *
 * $Id: sortwids.c,v 1.6 2001/05/31 03:50:13 liam Exp $
 */

/* Porting this should be straight forward, but as written it
 * requires the ability to mmap() a file into memory.
 * Writing it to use read() and malloc() would not be a good
 * idea, because you'd run out of memory on some systems --
 * it's sorting the widindex file, which is often quite large.
 * As a result, if you don't have mmap(), you'll need a cache.
 */

#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"
#ifndef ENOENT
# include <sys/errno.h>
#endif

#include <stdio.h>
#include <sys/types.h>
#include <ctype.h>
#include <sys/stat.h>

#undef PRIVATE
#include <sys/mman.h>
#define PRIVATE static

#ifdef HAVE_FCNTL_H
# ifdef HAVE_SYSV_FCNTL_H
#  include <sys/stat.h>
# endif
# include <fcntl.h>
#endif

#ifdef HAVE_UNISTD_H
# include <unistd.h> /* for SEEK_SET */
#endif

#ifdef HAVE_STDLIB_H
# include <stdlib.h>
#else
# include <malloc.h>
#endif


#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif


#include "fileinfo.h"
#include "wordinfo.h"
#include "smalldb.h"
#include "pblock.h"
#include "wordrules.h"
#include "numbers.h"
#include "emalloc.h"
#include "lqutil.h"
#include "liblqtext.h"
#include "lqtrace.h"

/*** Declarations: ***/
/** System calls and library routines: **/
/** Unix Library Functions: **/

char *progname = 0;
    /* Used for error messages */

static char *Revision = "$Revision: 1.6 $";

/** end of declarations... **/

PRIVATE int PutWordIntoIndex(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *,
    t_WordInfo *
#endif
);

PRIVATE int DecodeOneWidBlock(
#ifdef HAVE_PROTO
    char *Buffer,
    unsigned char *Lengthp,
    char **Wordp,
    unsigned long *Countp
#endif
);


PRIVATE DBM *thedb = 0;

PRIVATE void
MakeNewWordMap(db)
    t_LQTEXT_Database *db;
{
    long WordsSeen = 0;
    FILE *IndexFile;
    char Entry[WIDBLOCKSIZE];
    char Word[1024];
    t_WID LowestZeroCountWID = 0;

    LQT_ObtainWriteAccess(db);

    /* Remove the last block file; this file contains a pointer
     * to the end of each chain in "data" for each WID; it's generated
     * automatically by lqaddfile, and used as an optmisation.
     * But we've just sorted the WID table, so we no longer know
     * which WID each entry in the last block file belongs to.
     * The simplest thing would be to remove it.
     * So we do.
     */
    if (unlink(db->LastBlockFile) < 0) {
	extern int errno;

	if (errno != ENOENT) {
	    Error(E_SYS|E_FATAL,
		"Couldn't unlink \"%s\" (last blocks file)"
	    );
	}
    }


    IndexFile = LQU_fEopen(E_FATAL,
	db->WidIndexFile,
	"fixed size record word index",
	"r"
    );

    /* for each word in the index */

    while (fread(Entry, sizeof Entry, 1, IndexFile) > 0) {
	t_WordInfo WordInfo;

	if (!WordsSeen) {
	    WordsSeen++;
	    continue; /* block 0 unused */
	}

	if (DecodeOneWidBlock(
	    Entry,
	    &(WordInfo.Length),
	    &(WordInfo.Word),
	    &(WordInfo.NumberOfWordPlaces)
	) > 0) {

	    if (!LowestZeroCountWID) {
		LowestZeroCountWID = WordsSeen;
	    }

	    continue;
	}

	if (WordInfo.NumberOfWordPlaces == 0) {
	    /* The word has been deleted, so it has zero places now.
	     * We should probably relclaim the space.
	     */
	    if (!LowestZeroCountWID) {
		LowestZeroCountWID = WordsSeen;
	    }

	    continue;
	}

	/* We've found a valid WID... */

	LowestZeroCountWID = 0;

	/* make a WordInfo structure */
	WordInfo.Length = (int) Entry[0];
	WordInfo.Word = Word;
	WordInfo.WID = WordsSeen;
	strncpy(Word, &Entry[1], WordInfo.Length); /* [1] coz of length byte */
	Word[WordInfo.Length] = '\0';

	/* put the word into the index */
	if (PutWordIntoIndex(db, &WordInfo) < 0) {
	    Error(E_BUG, "failed to put \"%s\" into index\n", Word);
	}

	WordsSeen++;
    }

    if (LowestZeroCountWID != 0) {
	/* the highest actual WID is one less than this.
	 * NOTE: if all the words are deleted, there's nothing left!
	 * that seems a little odd to me -- I didn't think of that when
	 * I designed lq-text really.  I hope it's OK.
	 * Really, if you're deleting everything, you should rebuild
	 * the index from scratch!
	 * Oh well.  Lets' try to reset the maximum WID.
	 */
	--LowestZeroCountWID;
	LQT_ResetMaximumWID(db, LowestZeroCountWID);
    }
}

/* returns 0 on success,
 * > 0 if wordlength is 0 (all other returns are meaningless then)
 * 
 * length of word -> Lengthp
 * *Wordp set to point to start of word (NOT nul-terminated!)
 * *Countp set to the number of occurrences, total.
 */

PRIVATE int
DecodeOneWidBlock(Buffer, Lengthp, Wordp, Countp)
    char *Buffer;
    unsigned char *Lengthp;
    char **Wordp;
    unsigned long *Countp;
{
    char *q = Buffer;
    unsigned long Offset;
    long L;

    /* word length */
    (void) LQT_sReadNumber(&q, &L, Buffer, WIDBLOCKSIZE);
    if (L == 0) { /* sort empty blocks to the end */
	return 1000;
    }
    *Lengthp = (unsigned char) L;

    *Wordp = q;
    q += (*Lengthp);

    /* offset of matches */
    LQT_sReadNumber(
	(unsigned char **) &q,
	&Offset,
	(unsigned char *) Buffer,
	WIDBLOCKSIZE
    );

    /* number of matches */
    /* q[0] is  the least significant byte.  What happened to PUT4/GET4? */

    if (Offset == 0L) {
	LQT_sReadNumber(
	    (unsigned char **) &q,
	    Countp,
	    (unsigned char *) Buffer,
	    WIDBLOCKSIZE
	);
    } else {
	L = (unsigned int) (q[3] & 255);
	L <<= 8;
	L |= (unsigned int) (q[2] & 255);
	L <<= 8;
	L |= (unsigned int) (q[1] & 255);
	L <<= 8;
	L |= (unsigned int) (q[0] & 255);
	*Countp = L;
    }
    return 0;
}

PRIVATE int
CompareWIDBlocks(pp1, pp2)
    char *pp1, *pp2;
{
    unsigned char l1, l2;
    char *s1, *s2;

    /* Compare two words.
     * Return < 0 if pp1 is smaller or shorter (i.e. a prefix of pp2),
     * 0 if they are the same
     * > 0 if pp1 is greater or longer
     *
     * Although we compare the strings, there are two complications:
     * (1) some blocks are unused, and the wordlength is then set
     * to zero.  We want to sort those to the very end.
     * (2) some blocks refer to words that used to occur, but now
     * have zero occurrences.  We also sort these to the very end.
     *
     * Since all the words are supposed to be distinct,
     * a zero return should only happen if the blocks both point
     * to words that don't actually occur in the database, or both
     * have zero length (i.e. unused blocks).
     *
     * In practice, DecodeOneWidBlock() returns non-zero if
     * the block has a zero wordlength, and we return straight away.
     * That means that this is not a stable sort comparison:
     * two blocks with zero length randomly get assigned to be
     * one greater than the other depending on the order in which
     * they're passed (pp1 will always sort after pp2).
     * This doesn't matter, as we don't care about the order of those
     * blocks, although qsort() may make some extra comparisons as
     * a result.
     *
     * After we've sorted everything, we reclaim some of the space.
     */

    {
	unsigned long Count1, Count2;

	if (DecodeOneWidBlock(pp1, &l1, &s1, &Count1) > 0) {
	    /* move block 1 to the end, it's faulty */
	    return 1000;
	}

	if (DecodeOneWidBlock(pp2, &l2, &s2, &Count2) > 0) {
	    /* sort block 2 to the end */
	    return -1000;
	}

	if (Count1 == 0) {
	    if (Count2 == 0) { /* treat them as equal */
		return 0;
	    } else {
		return 100000;
	    }
	}

	if (Count2 == 0) {
	    return -100000;
	}
    }


    /* Now we have two words with non-zero length, and they do
     * both have matches in the database.
     */

    {
	register int i, len;

	len = (l1 < l2) ? l1 : l2;

	i = strncmp(s1, s2, len);

	if (i == 0) {
	    if (l1 == l2) {
		Error(E_WARN|E_INTERNAL,
		    "Duplicated word in index: %*.*s",
		    l1, l1, s1
		);
		return 0;
	    }
	    return (l1  < l2) ? -1 : 1;
	} else {
	    return i;
	}
    }
}

int
main(argc, argv)
    int argc;
    char *argv[];
{
    /* stuff for getopt(3): */
    extern int getopt();
    /* extern int optind */
    /* extern char *optarg; */
    int ch;			  /* For getopt(3) */
    int ErrorFlag = 0;		  /* For getopt(3) */
    t_LQTEXT_Database *db;   /* The database we'll read from */
    t_lqdbOptions *Options;  /* The user's configuration & preferences */

    int Widfd;
    caddr_t theData;
    struct stat statbuf;

    progname = argv[0];
	/* I see this as a library program, so I am leaving the full
	 * path.  lqaddfile(1L) and lqphrase(1L) set progname to be
	 * the filename of the command, rather than the full pathname.
	 */

    Options = LQT_InitFromArgv(argc, argv);
	/* Deal with any arguments that are understood by all lqtext
	 * programs.
	 */

    while ((ch = getopt(argc, argv, "VvxZz:")) != EOF) {
	switch (ch) {
	case 'V':
	    fprintf(stderr, "%s version %s\n", progname, Revision);
	    break;
	case 'x':
	    ErrorFlag++;
	    break;
	case '?':
	    ErrorFlag++;
	    break;
	case 'z':
	case 'Z':
	    break; /* done by LQT_InitFromArgv(); */
	}
    }

    if (ErrorFlag) {
	fprintf(stderr, "%s: sort words by WID\n", progname);
	LQT_PrintDefaultUsage(Options);
	/* LQT_PrintDefaultUsage() prints the list of the standard options. */
	exit(1);
    }

    db = LQT_OpenDatabase(Options, O_RDONLY, 0644);
    /* We are going to lie to the database and change its files without
     * telling it.  In particular, we're going to write all over the
     * word table, and then replace the dbm file that maps words to WIDS.
     * So AVOID ALL LQT_ functions....!  BE CAREFUL...!
     * We'll basically just use the db for filenames.
     * This program should go away.
     */

    Widfd = LQU_Eopen(
	E_FATAL|E_SYS,
	db->WidIndexFile,
	"WID (Word Identifier) Index file",
	O_RDWR,
	0 /* we are not creating it */
    );

    /* map the file into memory */

    {

	if (fstat(Widfd, &statbuf) < 0) {
	    Error(E_SYS|E_FATAL,
		"Couldn't fstat %s", db->WidIndexFile
	    );
	}

#ifndef MAP_FILE
# define MAP_FILE 0
#endif

	theData = mmap(
	    0,
	    statbuf.st_size,
	    PROT_READ|PROT_WRITE,
	    MAP_FILE|MAP_SHARED,
	    Widfd,
	    0
	);

	if (theData == (caddr_t) -1) {
	    Error(E_FATAL|E_SYS,
		"%s: mmap failed",
		db->WidIndexFile
	    );
	}

    }

    {
	unsigned long nblocks = (statbuf.st_size / WIDBLOCKSIZE);
    
	/* try to get rid of trailing empty blocks that
	 * were put there by the lqtext cache for efficiency
	 */
	register char *p;

	for (p = &theData[(nblocks - 1) * WIDBLOCKSIZE];
	     p >theData;
	     p -= WIDBLOCKSIZE
	) {
	    if (*p == 0) {
		--nblocks;
	    } else {
		break;
	    }
	}

	/* sort the WID file */
	qsort(
	    &theData[WIDBLOCKSIZE], /* first block is always empty */
	    nblocks - 1,
	    WIDBLOCKSIZE,
	    CompareWIDBlocks
	);
    }


    /* write out the results by unmapping it */

    if (munmap(theData, statbuf.st_size) < 0) {
	Error(E_FATAL|E_SYS,
	    "problem unmapping %lu bytes of memory for wid file",
	    statbuf.st_size
	);
    }

    (void) close(Widfd);


    /* make a new wid->word map */

    /* remove the old word index, if there was one */
    /* note: this is tricky, because we don't know the suffix... */
    {
	char tmp[8192];

	(void) sprintf(tmp, "cd \"%s\" && /bin/rm -f %s*",
	    db->DatabaseDirectory,
	    db->WordIndex
	);
	(void) system(tmp);
    }

    MakeNewWordMap(db);

    /* close the new wid->word map */

    LQT_CloseKeyValueDatabase(thedb);
    LQT_SyncAndCloseAllKeyValueDatabases(db);
    LQT_CloseDatabase(db);

    exit(0);
    /*NOTREACHED*/
    return(0); /* for lint and gcc -Wall*/
}

PRIVATE unsigned char *
GetEntry(Name, f, n)
    char *Name;
    int f; /* Unix file descriptor */
    t_WID n;
{
    static unsigned char Buf[WIDBLOCKSIZE * 1024];
    static unsigned long StartPos = 0L;
    static unsigned long EndPos = 0L;
    unsigned long Where = n * WIDBLOCKSIZE;

    if (Where + WIDBLOCKSIZE > EndPos + 1 || Where < StartPos) {
	int i;

	(void) LQU_Elseek(E_FATAL, Name, "Word Index File", f, Where, SEEK_SET);

	StartPos = Where;
	if ((i = read(f, (char *) Buf, sizeof Buf)) < 0) {
	    Error(E_FATAL|E_SYS,
		"WIDINDEX read(fd=%d, buf, n=%d) failed",
		f,
		sizeof Buf
	    );
	}
	EndPos = Where + i;
    }
    return &Buf[Where - StartPos];
}

PRIVATE int
PutWordIntoIndex(db, WordInfo)
    t_LQTEXT_Database *db;
    t_WordInfo *WordInfo;
{
    unsigned char NumBuf[sizeof(t_WID) + 1];
    unsigned char *q = NumBuf;
    datum key, data;

    /** First, write the WID itself, so we can go from Word to WID */

    key.dptr = WordInfo->Word;
    key.dsize = WordInfo->Length;

    (void) LQT_sWriteNumber(&q, WordInfo->WID, NumBuf, sizeof NumBuf);

    data.dptr = (char *) NumBuf;
    data.dsize = q - NumBuf;

    /* contact database server */
    if (!thedb) {
	thedb = LQT_OpenKeyValueDatabase(db, db->WordIndex);
	if (thedb == (DBM *) 0) {
	    Error(E_FATAL|E_SYS,
		"Couldn't create new index \"%s\"",
		db->WordIndex
	    );
	}
    }

    return dbm_store(thedb, key, data, DBM_REPLACE);
}