/* wordrule.c -- Copyright 1995, 1996 Liam R. E. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 *
 * $Id: wordrule.c,v 1.1 1996/05/14 23:45:12 lee Exp $
 *
 * This file exists in order to allow you to customise the word rules in
 * a more complex way than you can do in wordrules.h if you need to.
 * It's *MUCH* better to use wordrules.h only if you can, as these functions
 * will get called literally millions of times in even a fairly small session
 * of indexing files.
 *
 */

#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"

/* NOTE(review): the system header names below were missing from the damaged
 * copy of this file and have been reconstructed from usage (the ctype
 * classification calls further down, and the remark about fileinfo.h).
 * Confirm them against the original source.
 */
#include <stdio.h>
#include <ctype.h>
#include <sys/types.h> /* needed for fileinfo.h */

#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif

#include "fileinfo.h"
#include "wordinfo.h"
#include "wordrules.h"
#include "emalloc.h"
#include "liblqtext.h"

/** Unix system calls that need to be declared: **/

/** C Library functions that need to be declared: **/
#ifndef toupper
extern int toupper(
#ifdef HAVE_PROTO
    int theChar
#endif
);
#endif

/** lqtext functions that need to be declared: **/

/** Functions from this file that need to be declared: **/

/** **/

#ifndef LQT_StartsWord
/*
 * Name:     LQT_StartsWord
 * Class:    Language/Stemming
 *
 * Purpose:  Returns non-zero only if the given character ch can appear at
 *           the start of a word.  This function is normally a macro declared
 *           in the header file wordrules.h but can also be defined as a C
 *           function if greater complexity is needed and the indexing speed
 *           loss is not a concern.
 *
 *           This definition is compiled only when wordrules.h has not
 *           already defined LQT_StartsWord as a macro.  Until it has been
 *           customised it first reports a fatal internal error through
 *           Error(); whether Error() returns after E_FATAL is not visible
 *           from this file.
 *
 * Params:   db -- the database; its IndexNumbers field decides whether a
 *                 digit may start a word.
 *           ch -- the character to classify.
 *
 * Returns:  zero or non-zero.
 *
 * Notes:    This routine is only sensible for English.
 *
 * See also: LQT_EndsWord, LQT_OnlyWithinWord
 */
API int
LQT_StartsWord(db, ch)
    t_LQTEXT_Database *db;
    int ch;
{
    Error(E_FATAL|E_INTERNAL,
	"LQT_StartsWord is being used uncustomised from file %s!",
	__FILE__
    );

    /* this is the default definition: */
    return isalpha(ch) || (db->IndexNumbers && isdigit(ch));
}
#endif

#ifndef LQT_OnlyWithinWord
/*
 * Name:     LQT_OnlyWithinWord
 * Class:    Language/Stemming
 *
 * Purpose:  Returns non-zero only if the given character ch can appear
 *           within a word but not at the start or end, and not repeated
 *           consecutively.  For English, an apostrophe (') is normally
 *           considered to be the only such character; it's found in
 *           wouldn't, can't, and o'clock.  You could also include the
 *           hyphen if you wanted, but it turns out to be best to index
 *           `match-box' as two separate words with punctuation between
 *           them, rather than as a single word.
 *
 *           This function is normally a macro declared in the header file
 *           wordrules.h but can also be defined as a C function if greater
 *           complexity is needed and the indexing speed loss is not a
 *           concern.
 *
 *           This definition is compiled only when wordrules.h has not
 *           already defined LQT_OnlyWithinWord as a macro.  Until it has
 *           been customised it first reports a fatal internal error through
 *           Error(); whether Error() returns after E_FATAL is not visible
 *           from this file.
 *
 * Params:   db -- the database; not consulted by the default definition.
 *           ch -- the character to classify.
 *
 * Returns:  zero or non-zero.
 *
 * Notes:    This routine is only sensible for English.
 *
 * See also: LQT_StartsWord
 */
API int
LQT_OnlyWithinWord(db, ch)
    t_LQTEXT_Database *db;
    int ch;
{
    Error(E_FATAL|E_INTERNAL,
	"LQT_OnlyWithinWord is being used uncustomised from file %s!",
	__FILE__
    );

    /* this is the default definition: */
    return (ch == '\'');
}
#endif

#ifndef LQT_EndsWord
/*
 * Name:     LQT_EndsWord
 * Class:    Language/Stemming
 *
 * Purpose:  Returns non-zero only if the given character ch can appear
 *           within or at the end of a word.  This function is normally a
 *           macro declared in the header file wordrules.h but can also be
 *           defined as a C function if greater complexity is needed and the
 *           indexing speed loss is not a concern.
 *
 *           This definition is compiled only when wordrules.h has not
 *           already defined LQT_EndsWord as a macro.  Until it has been
 *           customised it first reports a fatal internal error through
 *           Error(); whether Error() returns after E_FATAL is not visible
 *           from this file.
 *
 * Params:   db -- the database; not consulted by the default definition.
 *           ch -- the character to classify.
 *
 * Returns:  zero or non-zero.
 *
 * Notes:    This routine is only sensible for English.
 *
 * See also: LQT_StartsWord, LQT_OnlyWithinWord
 */
API int
LQT_EndsWord(db, ch)
    t_LQTEXT_Database *db;
    int ch;
{
    Error(E_FATAL|E_INTERNAL,
	"LQT_EndsWord is being used uncustomised from file %s!",
	__FILE__
    );

    /* this is the default definition: */
    return (isalnum(ch) || ((ch) == '_'));
}
#endif