/* wordrules.h -- Copyright 1989, 1993, 1994, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. * * $Id: wordrules.h,v 1.21 2001/05/31 03:50:13 liam Exp $ * */ #ifndef LQ_WORDRULES_H # define LQ_WORDRULES_H 1 /* Rules for determining what an indexable word looks like; * These are implemented by the various filters, as well as by * the indexing software itself. This means that the filters * don't need to keep track of word lengths, as addfile will do this, * but that they should not emit non-word stuff if they can help it, * turning it into the equivalent amount (in bytes) of white-space * instead. * They should also turn words they don't want indexed into "qxxx", * with the right number of x's (e.g. "bare" --> "qxxx"). */ /* A "word" is a letter followed by any combination of * letters, digits or '_'. An embedded (not trailing) ' is also allowed * (_ is allowed so that one can index progamming languages; strictly * speaking, a lot of languages allow _ at the start too, but I don't * want to get confused by nroff output etc., which contains lines of * underscores) * * LQT_OnlyWithinWord() is true for those characters that must be * surrounded by `letters', e.g. ' can appear in a word, but not ''. * * This scheme currently excludes numbers... * 31, 31.4 and 31.9e4 will all be ignored. So will 1987. * * #define LQT_StartsWord(db, ch) isalpha(ch) * #define LQT_OnlyWithinWord(db, ch) (ch == '\'') * #define LQT_EndsWord(db, ch) (isalnum(ch)||(ch == '_')) */ /* Note: don't use the LQT_ISALPHA (etc.) macros here, because those * macros are initialised from these ones... * After initialisation, these macros shouldn't be used, because they are * slower than the LQT_ENDSWORD (etc) ones. * See h/chartype.h for a little more information. */ # define LQT_StartsWord(db, ch) \ ( \ isalpha(ch) || \ ((ch) == LQT_CHAR_TO_IGNORE) || \ ( (db)->IndexNumbers && (isdigit(ch) || (ch) == LQT_DIGIT_TO_IGNORE)) \ ) # define LQT_EndsWord(db, ch) \ (isalnum(ch) || (ch) == '_' || (ch) == LQT_CHAR_TO_IGNORE || \ (ch) == LQT_DIGIT_TO_IGNORE) # define LQT_OnlyWithinWord(db, ch) (ch == '\'') /* If MinWordLength is not specified in a database config file, * this default is used. Words shorter thn this are truncated. * A default of 2 means that "at" would be indexed, but not "a". * in text files, a default of 1 indexes everything, but in things like * usenet articles or e-mail, you might not want that. * If lq-text skips a word that is too short, it remembers that fact, * and the next word that's indexed gets a flag set to say a letter was * skipped, so that the query "N.Y. Times" will only match occurrences * of "Times" that follow one or more letters that were ignored -- e.g. * it would also match X.Y.Z. Times. */ #ifndef LQC_DEFAULT_MinWordLength # define LQC_DEFAULT_MinWordLength 2 #endif #ifndef LQC_DEFAULT_MaxWordLength # define LQC_DEFAULT_MaxWordLength 18 /* truncate words to this length */ #endif /* Short-circuit trying to adduce the singular of a word. If you are * not indexing predominantly English text, or if you change the minimum * wordlength to 3 or greater both here and usually in the database README, * use * #define LQT_WORDROOT(WordInfo) LQT_ReduceWordToRoot(WordInfo) * instead of this: */ #define LQT_WORDROOT(db, WordInfo) \ ( \ ((WordInfo)->Length > 2 || \ (LQT_ISDIGIT(db, (WordInfo)->Word[0]) && db->ConvertNumbers)) ? \ LQT_ReduceWordToRoot((db), (WordInfo)) : (WordInfo)->Word \ ) #define LQT_FileBlockSize(db) ((db)->FileBlockSize) /* WordPlace Flags: * When a plural word is found, or a possessive word, it is reduced to * being singular, and flags are set appropriately. * Also, a flag is set to say if the word started with a Capital Letter. * This puts Window, windows, and Window's all together, but enables them * to be differentiated for searching if required. * These flags are implemented by WordInfo and addfile, not by the various * filters, but the filters must preserve capitalisation of the first letter * in each word, and pass through apostrophes within words (like this's). * * ON UPDATING THIS LIST, see also liblqtext/wflags.c and h/globals.h, as * well as WPF_ALL below. */ #define WPF_WASPLURAL 0001 /* The word... ended in s */ #define WPF_UPPERCASE 0002 /* ...Started with a capital letter */ #define WPF_POSSESSIVE 0004 /* ...ended in 's */ #define WPF_NEXTHASPUNCT 0010 /* I'm the last word in this block */ #define WPF_NEXTISCOMMON 0020 /* I'm the last word in this block */ #define WPF_LASTHADLETTERS 0040 /* we skipped some letters to get here */ #define WPF_HASSTUFFBEFORE 0100 /* Other than 1 byte of garbage before */ #define WPF_LASTINBLOCK 0200 /* I'm the last word in this block */ /* ON UPDATING THIS LIST, see also liblqtext/wflags.c and h/globals.h, as * well as WPF_ALL below. */ /* Flags that relate to the previous word in the input; these may be * stored in the StuffBefore byte if it's there. * If not, there is nowhere to put these, and they are omitted. * Note that these flags don't fit directly in a single byte. */ #define WPF_LASTHADPUNCT 01000 /* we skipped some punctuation */ #define WPF_LASTWASCOMMON 02000 /* the previous word was common */ /* ON UPDATING THIS LIST, see also liblqtext/wflags.c and h/globals.h, as * well as WPF_ALL below. */ #define WPF_ALL \ (WPF_WASPLURAL|WPF_UPPERCASE|WPF_POSSESSIVE| \ WPF_LASTHADPUNCT|WPF_LASTWASCOMMON|WPF_LASTHADLETTERS| \ WPF_HASSTUFFBEFORE|WPF_LASTINBLOCK| \ WPF_NEXTHASPUNCT|WPF_NEXTISCOMMON) /* Ways of getting at the flags: */ #define LQT_WPF_WASPLURAL(w) ((w)->Flags & WPF_WASPLURAL) #define LQT_WPF_UPPERCASE(w) ((w)->Flags & WPF_UPPERCASE) #define LQT_WPF_POSSESSIVE(w) ((w)->Flags & WPF_POSSESSIVE) #define LQT_WPF_NEXTHASPUNCT(w) ((w)->Flags & WPF_NEXTHASPUNCT) #define LQT_WPF_NEXTISCOMMON(w) ((w)->Flags & WPF_NEXTISCOMMON) #define LQT_WPF_LASTHADLETTERS(w) ((w)->Flags & WPF_LASTHADLETTERS) #define LQT_WPF_HASSTUFFBEFORE(w) ((w)->Flags & WPF_HASSTUFFBEFORE) #define LQT_WPF_LASTINBLOCK(w) ((w)->Flags & WPF_LASTINBLOCK) #define LQT_WPF_LASTHADPUNCT(w) ((w)->Flags & WPF_LASTHADPUNCT) #define LQT_WPF_LASTWASCOMMON(w) ((w)->Flags & WPF_LASTWASCOMMON) #define LQT_WPF_HAS_FLAGS(w) ((w)->Flags != 0) #define LQT_WPF_STUFF_BEFORE(w) ((w)->StuffBefore & 077) /* macro to squidge the LAST flags on top of StuffBefore */ #define LQTpCombineFlagsAndStuff(P) \ ( ( ((P)->Flags & 03000) >> 2 ) | ( LQT_WPF_STUFF_BEFORE(P) ) ) /* unsquidger: */ #define LQTpDisentangleFlagsAndStuff(P, Value) \ { \ (P)->Flags |= (((Value) << 2) & 03000); \ (P)->StuffBefore = ( (Value) & 0077); \ } /* Type guaranteed to hold a flag: */ typedef unsigned short t_WordFlags; #define LQT_GetFlagValue(Flag) (Flag) /* Structure to hold flag <--> string data structure. * This is used for printing/reading flags, and may later be used * for dynamically allocated flags. */ typedef struct { unsigned long Value; /* e.g. WPF_NEXTISCOMMON */ char *Name; /* e.g. "next is common" */ } t_FlagNamePair; LIBRARY t_FlagNamePair LQTp_WordFlagArray[]; #define LQTpWordFlagSep "," /* used in printable representation of flags */ /* Character to ignore in indexing */ #define LQT_DIGIT_TO_IGNORE (3) /* i.e., control-C cannot be indexed */ #define LQT_CHAR_TO_IGNORE (4) /* i.e., control-D cannot be indexed */ #define LQT_CHAR_TO_SKIP (8) /* \b, move word left one char, from filters */ #endif /* LQ_WORDRULES_H */