/* wordrule.c -- Copyright 1995, 1996 Liam R. E. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 *
 * $Id: wordrule.c,v 1.1 1996/05/14 23:45:12 lee Exp $
 *
 * This file exists in order to allow you to customise the word rules in
 * a more complex way than you can do in wordrules.h if you need to.
 * It's *MUCH* better to use wordrules.h only if you can, as these functions
 * will get called literally millions of times in even a fairly small session
 * of indexing files.
 *
 */

#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"

#include <stdio.h>
#include <ctype.h>
#include <sys/types.h> /* needed for fileinfo.h */
#ifdef HAVE_STRING_H
#include <string.h>
#else
#include <strings.h>
#endif

#include "fileinfo.h"
#include "wordinfo.h"
#include "wordrules.h"
#include "emalloc.h"

#include "liblqtext.h"

/** Unix system calls that need to be declared: **/
/** C Library functions that need to be declared: **/

/* Fallback declaration of toupper(), used only when no macro called
 * toupper is defined at this point.  The parameter list is spelled out
 * only if HAVE_PROTO is defined; otherwise an old-style declaration
 * without a prototype is emitted.
 */
#ifndef toupper

extern int toupper(
#ifdef HAVE_PROTO
    int theChar
#endif
);

#endif

/** lqtext functions that need to be declared: **/
/** Functions from this file that need to be declared: **/
/** **/

#ifndef LQT_StartsWord
/* <Function>
 *   <Name>LQT_StartsWord
 *   <Class>Language/Stemming
 *   <Purpose>
 *      <P>Returns non-zero only if the given character ch can appear at the
 *	start of a word.  This function is normally a macro declared in
 *	the header file <h>wordrules.h</h> but can also be defined as a C
 *	function if greater complexity is needed and the indexing speed loss
 *	is not a concern.</P>
 *	<P>A negative ch other than EOF (as produced when a plain char with
 *	its top bit set is widened to int) is converted to the corresponding
 *	unsigned char value before it is classified, since the ctype.h
 *	functions are only defined for EOF and unsigned char values.</P>
 *   <Returns>
 *      zero or non-zero.
 *   <Bugs>
 *	This routine is only sensible for English.
 *	A sign-extended character whose value equals EOF cannot be told
 *	apart from EOF itself and is passed through unchanged.
 *   <SeeAlso>
 *	LQT_EndsWord
 *	LQT_OnlyWithinWord
 * </Function>
 */
API int
LQT_StartsWord(db, ch)
    t_LQTEXT_Database *db;
    int ch;
{
    /* This copy is only a template: using it without customising it
     * is reported as a fatal internal error.
     */
    Error(E_FATAL|E_INTERNAL,
	"LQT_StartsWord is being used uncustomised from file %s!",
	__FILE__
    );

    /* isalpha() and isdigit() have undefined behaviour for negative
     * arguments other than EOF, so recover the byte value first.
     */
    if (ch < 0 && ch != EOF) {
	ch = (unsigned char) ch;
    }

    /* this is the default definition: letters always start a word,
     * digits only when the database has IndexNumbers set.
     */
    return isalpha(ch) || (db->IndexNumbers && isdigit(ch));
}
#endif


#ifndef LQT_OnlyWithinWord
/* <Function>
 *   <Name>LQT_OnlyWithinWord
 *   <Class>Language/Stemming
 *   <Purpose>
 *      <P>Tests whether the character ch is one that may occur inside a
 *	word, but neither as its first nor its last character, and never
 *	twice in a row.
 *	In English the apostrophe (') is normally the only character of
 *	this kind, as in wouldn't, can't and o'clock.
 *	The hyphen could be treated the same way, but in practice it works
 *	better to index `match-box' as two separate words with punctuation
 *	between them than as one word.</P>
 *	<P>Normally this is a macro supplied by the header file
 *	<h>wordrules.h</h>; it can instead be written as a C function
 *	when more complex rules are needed and the resulting loss of
 *	indexing speed is acceptable.</P>
 *   <Returns>
 *      zero or non-zero.
 *   <Bugs>
 *	This routine is only sensible for English.
 *   <SeeAlso>
 *	LQT_StartsWord
 * </Function>
 */
API int
LQT_OnlyWithinWord(db, ch)
    t_LQTEXT_Database *db;
    int ch;
{
    /* Template copy: complain if it is reached without having been
     * customised.
     */
    Error(E_FATAL|E_INTERNAL,
	"LQT_OnlyWithinWord is being used uncustomised from file %s!",
	__FILE__
    );

    /* default rule: the apostrophe is the only such character */
    switch (ch) {
    case '\'':
	return 1;
    default:
	return 0;
    }
}
#endif


#ifndef LQT_EndsWord
/* <Function>
 *   <Name>LQT_EndsWord
 *   <Class>Language/Stemming
 *   <Purpose>
 *      <P>Returns non-zero only if the given character ch can appear within
 *	or at the end of a word.  This function is normally a macro declared
 *	in the header file <h>wordrules.h</h> but can also be defined as a C
 *	function if greater complexity is needed and the indexing speed loss
 *	is not a concern.</P>
 *	<P>A negative ch other than EOF (as produced when a plain char with
 *	its top bit set is widened to int) is converted to the corresponding
 *	unsigned char value before it is classified, since the ctype.h
 *	functions are only defined for EOF and unsigned char values.</P>
 *   <Returns>
 *      zero or non-zero.
 *   <Bugs>
 *	This routine is only sensible for English.
 *	A sign-extended character whose value equals EOF cannot be told
 *	apart from EOF itself and is passed through unchanged.
 *   <SeeAlso>
 *	LQT_StartsWord
 *	LQT_OnlyWithinWord
 * </Function>
 */
API int
LQT_EndsWord(db, ch)
    t_LQTEXT_Database *db;
    int ch;
{
    /* This copy is only a template: using it without customising it
     * is reported as a fatal internal error.
     */
    Error(E_FATAL|E_INTERNAL,
	"LQT_EndsWord is being used uncustomised from file %s!",
	__FILE__
    );

    /* isalnum() has undefined behaviour for negative arguments other
     * than EOF, so recover the byte value first.
     */
    if (ch < 0 && ch != EOF) {
	ch = (unsigned char) ch;
    }

    /* this is the default definition: letters, digits and underscore */
    return (isalnum(ch) || ((ch) == '_'));
}
#endif