/* wordrules.h -- Copyright 1989, 1993, 1994, 1996 Liam R. E. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 *
 * $Id: wordrules.h,v 1.21 2001/05/31 03:50:13 liam Exp $
 *
 */

#ifndef LQ_WORDRULES_H
# define LQ_WORDRULES_H 1

/* Rules for determining what an indexable word looks like;
 * These are implemented by the various filters, as well as by
 * the indexing software itself.  This means that the filters
 * don't need to keep track of word lengths, as addfile will do this,
 * but that they should not emit non-word stuff if they can help it,
 * turning it into the equivalent amount (in bytes) of white-space
 * instead.
 * They should also turn words they don't want indexed into "qxxx",
 * with the right number of x's (e.g. "bare" --> "qxxx").
 */

/* A "word" is a letter followed by any combination of
 * letters, digits or '_'.  An embedded (not trailing) ' is also allowed
 * (_ is allowed so that one can index programming languages; strictly
 * speaking, a lot of languages allow _ at the start too, but I don't
 * want to get confused by nroff output etc., which contains lines of
 * underscores)
 *
 * LQT_OnlyWithinWord() is true for those characters that must be
 * surrounded by `letters', e.g. ' can appear in a word, but not ''.
 *
 * This scheme currently excludes numbers...
 * 31, 31.4 and 31.9e4 will all be ignored.  So will 1987.
 * 
 * #define LQT_StartsWord(db, ch) isalpha(ch)
 * #define LQT_OnlyWithinWord(db, ch) (ch == '\'')
 * #define LQT_EndsWord(db, ch) (isalnum(ch)||(ch == '_'))
 */

/* Note: don't use the LQT_ISALPHA (etc.) macros here, because those
 * macros are initialised from these ones...
 * After initialisation, these macros shouldn't be used, because they are
 * slower than the LQT_ENDSWORD (etc) ones.
 * See h/chartype.h for a little more information.
 */
/* LQT_StartsWord(db, ch): non-zero if ch may begin an indexable word.
 * Letters and LQT_CHAR_TO_IGNORE always qualify; digits and
 * LQT_DIGIT_TO_IGNORE qualify only when (db)->IndexNumbers is set.
 * ch is evaluated more than once, so it must have no side effects.
 */
# define LQT_StartsWord(db, ch) \
    (isalpha(ch) || (ch) == LQT_CHAR_TO_IGNORE || \
     ((db)->IndexNumbers && \
      (isdigit(ch) || (ch) == LQT_DIGIT_TO_IGNORE)))

/* LQT_EndsWord(db, ch): non-zero if ch may appear as the last character
 * of a word: a letter, a digit, '_', or one of the two "ignore"
 * placeholders.  db is not consulted.  ch is evaluated more than once.
 */
# define LQT_EndsWord(db, ch) \
    ( \
	isalnum(ch) || \
	(ch) == '_' || \
	(ch) == LQT_CHAR_TO_IGNORE || \
	(ch) == LQT_DIGIT_TO_IGNORE \
    )

/* LQT_OnlyWithinWord(db, ch): non-zero for characters that are allowed
 * inside a word only when surrounded by word characters; currently just
 * the apostrophe.  db is not consulted.
 * The argument is parenthesized so that expressions such as
 * (c & 0x7f) compare as a whole rather than being split by precedence.
 */
# define LQT_OnlyWithinWord(db, ch) ((ch) == '\'')

/* If MinWordLength is not specified in a database config file,
 * this default is used.  Words shorter than this are not indexed.
 * A default of 2 means that "at" would be indexed, but not "a".
 * In text files, a default of 1 indexes everything, but in things like
 * usenet articles or e-mail, you might not want that.
 * If lq-text skips a word that is too short, it remembers that fact,
 * and the next word that's indexed gets a flag set to say a letter was
 * skipped, so that the query "N.Y. Times" will only match occurrences
 * of "Times" that follow one or more letters that were ignored -- e.g.
 * it would also match X.Y.Z. Times.
 */
/* Fallback minimum word length; may be pre-defined by the build to
 * override the value given here.
 */
#if !defined(LQC_DEFAULT_MinWordLength)
# define LQC_DEFAULT_MinWordLength 2
#endif /* !LQC_DEFAULT_MinWordLength */

/* Fallback maximum word length: words are truncated to this length.
 * May be pre-defined by the build to override the value given here.
 */
#if !defined(LQC_DEFAULT_MaxWordLength)
# define LQC_DEFAULT_MaxWordLength 18
#endif /* !LQC_DEFAULT_MaxWordLength */

/* Short-circuit trying to adduce the singular of a word.  If you are
 * not indexing predominantly English text, or if you change the minimum
 * wordlength to 3 or greater both here and usually in the database README,
 * use
 * #define LQT_WORDROOT(db, WordInfo) LQT_ReduceWordToRoot((db), (WordInfo))
 * instead of this:
 */
/* LQT_WORDROOT(db, WordInfo): yield the root form of a word.
 *
 * LQT_ReduceWordToRoot() is called only when the word is longer than
 * two characters, or when it starts with a digit and
 * (db)->ConvertNumbers is set; otherwise (WordInfo)->Word is returned
 * unchanged.
 *
 * Both arguments are evaluated more than once, so they must have no
 * side effects.  Every use of db is parenthesized so that arguments
 * such as &Database or *pp bind correctly to the -> operator.
 */
#define LQT_WORDROOT(db, WordInfo) \
    ( \
	((WordInfo)->Length > 2 || \
	    (LQT_ISDIGIT((db), (WordInfo)->Word[0]) && \
		(db)->ConvertNumbers)) ? \
	    LQT_ReduceWordToRoot((db), (WordInfo)) : (WordInfo)->Word \
    )

#define LQT_FileBlockSize(db) ((db)->FileBlockSize)

/* WordPlace Flags:
 * When a plural word is found, or a possessive word, it is reduced to
 * being singular, and flags are set appropriately.
 * Also, a flag is set to say if the word started with a Capital Letter.
 * This puts Window, windows, and Window's all together, but enables them
 * to be differentiated for searching if required.
 * These flags are implemented by WordInfo and addfile, not by the various
 * filters, but the filters must preserve capitalisation of the first letter
 * in each word, and pass through apostrophes within words (like this's).
 *
 * ON UPDATING THIS LIST, see also liblqtext/wflags.c and h/globals.h, as
 * well as WPF_ALL below.
 */

/* One bit per flag, written in octal; these eight occupy the low 8 bits. */
#define WPF_WASPLURAL		0001 /* The word...  ended in s */
#define WPF_UPPERCASE		0002 /* ...Started with a capital letter */
#define WPF_POSSESSIVE		0004 /* ...ended in 's */
#define WPF_NEXTHASPUNCT	0010 /* ...is followed by punctuation (NOTE(review): inferred from the name; confirm) */
#define WPF_NEXTISCOMMON	0020 /* ...is followed by a common word */
#define WPF_LASTHADLETTERS	0040 /* we skipped some letters to get here */
#define WPF_HASSTUFFBEFORE	0100 /* Other than 1 byte of garbage before */
#define WPF_LASTINBLOCK		0200 /* I'm the last word in this block */
/* ON UPDATING THIS LIST, see also liblqtext/wflags.c and h/globals.h, as
 * well as WPF_ALL below.
 */

/* Flags that relate to the previous word in the input; these may be
 * stored in the StuffBefore byte if it's there.
 * If not, there is nowhere to put these, and they are omitted.
 * Note that these flags don't fit directly in a single byte.
 */
/* Together these two form the mask 03000; the bit 0400 is not assigned
 * to any flag in this file.
 */
#define WPF_LASTHADPUNCT	01000 /* we skipped some punctuation */
#define WPF_LASTWASCOMMON	02000 /* the previous word was common */
/* ON UPDATING THIS LIST, see also liblqtext/wflags.c and h/globals.h, as
 * well as WPF_ALL below.
 */


/* Union of every WPF_ flag defined in this file, listed in bit order. */
#define WPF_ALL \
    ( \
	WPF_WASPLURAL | WPF_UPPERCASE | WPF_POSSESSIVE | \
	WPF_NEXTHASPUNCT | WPF_NEXTISCOMMON | WPF_LASTHADLETTERS | \
	WPF_HASSTUFFBEFORE | WPF_LASTINBLOCK | \
	WPF_LASTHADPUNCT | WPF_LASTWASCOMMON \
    )

/* Ways of getting at the flags.
 * Each accessor takes a pointer to a structure with a Flags member and
 * yields the masked bit itself (zero, or the WPF_ value -- not 0/1), so
 * the result should be used as a truth value.
 */
#define LQTp_WPF_TEST(w, Flag)		((w)->Flags & (Flag))

#define LQT_WPF_WASPLURAL(w)		LQTp_WPF_TEST((w), WPF_WASPLURAL)
#define LQT_WPF_UPPERCASE(w)		LQTp_WPF_TEST((w), WPF_UPPERCASE)
#define LQT_WPF_POSSESSIVE(w)		LQTp_WPF_TEST((w), WPF_POSSESSIVE)
#define LQT_WPF_NEXTHASPUNCT(w)		LQTp_WPF_TEST((w), WPF_NEXTHASPUNCT)
#define LQT_WPF_NEXTISCOMMON(w)		LQTp_WPF_TEST((w), WPF_NEXTISCOMMON)
#define LQT_WPF_LASTHADLETTERS(w)	LQTp_WPF_TEST((w), WPF_LASTHADLETTERS)
#define LQT_WPF_HASSTUFFBEFORE(w)	LQTp_WPF_TEST((w), WPF_HASSTUFFBEFORE)
#define LQT_WPF_LASTINBLOCK(w)		LQTp_WPF_TEST((w), WPF_LASTINBLOCK)
#define LQT_WPF_LASTHADPUNCT(w)		LQTp_WPF_TEST((w), WPF_LASTHADPUNCT)
#define LQT_WPF_LASTWASCOMMON(w)	LQTp_WPF_TEST((w), WPF_LASTWASCOMMON)

/* True if any flag at all is set. */
#define LQT_WPF_HAS_FLAGS(w)		((w)->Flags != 0)

/* The low six bits of the StuffBefore member. */
#define LQT_WPF_STUFF_BEFORE(w)		((w)->StuffBefore & 077)

/* Pack the two "previous word" flags and the StuffBefore count into a
 * single value: the low six bits come from StuffBefore, and
 * WPF_LASTHADPUNCT / WPF_LASTWASCOMMON (mask 03000) are shifted right
 * by two to sit above them at 0200 and 0400.
 * P is evaluated twice.
 * NOTE(review): the result can be as large as 0677, which needs more
 * than 8 bits -- confirm that callers do not store it in a single byte.
 */
#define LQTpCombineFlagsAndStuff(P) \
    ( \
	LQT_WPF_STUFF_BEFORE(P) | \
	(((P)->Flags & (WPF_LASTHADPUNCT | WPF_LASTWASCOMMON)) >> 2) \
    )

/* unsquidger: the inverse of LQTpCombineFlagsAndStuff().
 *
 * Value is shifted left by two and masked with
 * WPF_LASTHADPUNCT | WPF_LASTWASCOMMON (03000); the result is ORed into
 * (P)->Flags, so flags already set there are kept.  The low six bits of
 * Value replace (P)->StuffBefore.
 *
 * P and Value are each evaluated twice.  The body is wrapped in
 * do { } while (0) so that the macro behaves as a single statement,
 * e.g. in an unbraced if/else; write a semicolon after the call.
 */
#define LQTpDisentangleFlagsAndStuff(P, Value) \
    do { \
	(P)->Flags |= \
	    (((Value) << 2) & (WPF_LASTHADPUNCT | WPF_LASTWASCOMMON)); \
	(P)->StuffBefore = ((Value) & 0077); \
    } while (0)

/* Type guaranteed to hold a flag: the largest WPF_ value defined in
 * this file is 02000, which fits in an unsigned short.
 */
typedef unsigned short t_WordFlags;
/* Currently the identity: yields its argument unchanged. */
#define LQT_GetFlagValue(Flag) (Flag)

/* Structure to hold flag <--> string data structure.
 * This is used for printing/reading flags, and may later be used
 * for dynamically allocated flags.
 * Each entry pairs one flag value with its printable name.
 */
typedef struct {
    unsigned long Value; /* e.g. WPF_NEXTISCOMMON */
    char *Name;		 /* e.g. "next is common" */
} t_FlagNamePair;

/* Table of flag/name pairs; declared with unspecified size here.
 * NOTE(review): LIBRARY is defined elsewhere, and the table is presumably
 * defined in liblqtext/wflags.c -- confirm.
 */
LIBRARY t_FlagNamePair LQTp_WordFlagArray[];

#define LQTpWordFlagSep "," /* used in printable representation of flags */

/* Characters with special meaning to the indexer.
 * LQT_StartsWord() and LQT_EndsWord() accept LQT_CHAR_TO_IGNORE wherever
 * a letter is accepted; LQT_DIGIT_TO_IGNORE is accepted by
 * LQT_EndsWord() unconditionally and by LQT_StartsWord() only when
 * (db)->IndexNumbers is set, like a digit.
 */
#define LQT_DIGIT_TO_IGNORE (3) /* i.e., control-C cannot be indexed */
#define LQT_CHAR_TO_IGNORE (4) /* i.e., control-D cannot be indexed */
#define LQT_CHAR_TO_SKIP (8) /* \b, move word left one char, from filters */

#endif /* LQ_WORDRULES_H */