/* Root.c -- Copyright 1989, 1992-1995, 1996 Liam R. E. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 *
 * Root.c is responsible for morphology -- word stemming, plurals, etc.
 * This file is due for dramatic changes with configurable stemmers.
 *
 * The current approach tries to be a 90% solution -- it gets the singluar
 * or most words right, and sometimes goes wrong reversing the operation.
 *
 * $Id: root.c,v 2.19 1996/08/20 18:13:18 lee Exp lee $
 *
 */

#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"

#include <stdio.h>
#include <ctype.h>
#include <sys/types.h> /* needed for filinfo.h */
#ifdef HAVE_STRING_H
#include <string.h>
#else
#include <strings.h>
#endif

#include "fileinfo.h"
#include "wordinfo.h"
#include "wordrules.h"
#include "emalloc.h"

#include "liblqtext.h"

/** Unix system calls that need to be declared: **/
    /* (none) */
/** C Library functions that nees to be declared: **/

#ifndef toupper

extern int toupper(
#ifdef HAVE_PROTO
    int theChar
#endif
);

#endif

/** lqtext functions that need to be declared: **/

/** Functions from this file that need to be declared: **/
/** **/

/* <Function>
 *   <Name>LQT_ReduceWordToRoot
 *   <Class>Language/Stemming
 *   <Purpose>
 *      Reduces the word in the WordInfo pointed to by its argument to an
 *	English root, by stripping plurals and possessives.
 *	WordInfo->Length is modified as necessary, and WordInfo->Flags
 *	are updated by or'ing any necessary items from <h>wordrules.h</h>.
 *      The word can grow by up to two characters in length.
 *      It is the caller's responsibility to allocate enough space.
 * 	You can also use the WORDROOT macro from <h>wordrules.h</h>,
 *	which calls LQT_ReduceWordToRoot only if it might make a change.
 *   <Returns>
 *      A pointer to WordInfo's Word
 *   <Bugs>
 *	This routine is only sensible for English.
 *   <SeeAlso>
 *	LQT_ReadWordFromStringPointer
 * </Function>
 */
API char *
LQT_ReduceWordToRoot(db, WordInfo)
    t_LQTEXT_Database *db;
    t_WordInfo *WordInfo;
{
    unsigned char *Word = WordInfo->Word;
    register int length = WordInfo->Length;

    if (!Word) {
	Error(E_FATAL|E_BUG,
	    "LQT_ReduceWordToRoot() called with null word"
	);
    }

    if (!*Word) {
	Error(E_WARN|E_BUG,
	    "LQT_ReduceWordToRoot() called with empty word"
	);
	return Word;
    }

    Word[length] = '\0';

    if (db->ConvertNumbers == 1 && LQT_ISDIGIT(db, (unsigned int) *Word)) {
	/* maybe it's a number... */
	unsigned long n = 0;
	int base = 10;
	unsigned char *p = WordInfo->Word;

#define ishex(c) \
    (base == 16 && LQT_TOLOWER(db, c) >= 'a' && LQT_TOLOWER(db, c) <= 'f')

	if (*p == '0') {
	    if (length == 1) return Word;
	    if (Word[1] == 'x' || Word[1] == 'X') {
		/* 0xFF -- C notation for hexadecimal;
		 * Note that we can't use 16#ff, since the # is
		 * not a word character...  we'd have to do this
		 * in ReadWord instead.
		 */
		p = &Word[2];
		if (!LQT_ISDIGIT(db, *p) && 
		    !(LQT_TOLOWER(db, *p) >= 'a' && LQT_TOLOWER(db, *p) <= 'f')
		) {
		    goto NotANumber;
		}
		base = 16;
	    } else {
		base = 8;
		p++;
		if (!LQT_ISDIGIT(db, *p)) {
		    goto NotANumber;
		} else {
		    register char *q;

		    for (q = p; *q; q++) {
			if (LQT_ISDIGIT(db, *q)) {
			    if (*q > '7') {
				base = 10;
				break;
			    }
			} else {
			    break;
			}
		    }
		}
	    }
	}

	for (;;) {
	    if (LQT_ISDIGIT(db, *p)) {
		n *= base;
		n += *p - '0';
	    } else if (ishex(*p)) {
		n *= base;
		n += 10 + LQT_TOLOWER(db, *p) - 'a';
	    } else {
		/* end of the number */
		break;
	    }
	    p++;
	}

	/* if p == Word, it's not a number */

	if (p > Word) {
	    if (*p == '\0') {
		(void) sprintf(Word, "%ld", n);
		WordInfo->Length = strlen(Word);
		return Word;
	    } else {
		char buffer[50];
		unsigned int len;

		/* Not a number after all??
		 * or maybe a number and a suffix, as in
		 * 6ft 4in
		 */

		/* if it was a hex number, it might have got longer.
		 * In that case, if the result is too long, we
		 * have to truncate properly
		 */

		(void) sprintf(buffer, "%ld", n);
		len = strlen(buffer);

		if (len != p - Word) {
		    /* move the rest of the word into the right place.
		     * Note that this assumes it fits!!!!
		     */
		    bcopy(p, &Word[len], length - (p - Word));
		}
		(void) memcpy(Word, buffer, len);
		/* fall through in case the suffix needs to bre reduced,
		 * e.g. as in "100feet" or "077inches", which turns
		 * into "63inch" taking the 077 as octal...
		 */
		length = WordInfo->Length = strlen(Word);
	    }
	}
    } /* end of it's-a-number */

NotANumber:

    if (length <= 2) {
	if (Word[length - 1] == '\'') {
	    WordInfo->Length -= 1;
	    Word[WordInfo->Length] = '\0';
	}
	return Word;
    }

    /* ASSERT: length > 2 */

    /** delete trailing <'s> or s<'> and mark possessive */
    if (Word[length - 1] == 's' && Word[length - 2] == '\'') {
	/* The boy's feet; John's hat */
	WordInfo->WordPlace.Flags |= WPF_POSSESSIVE;
	length -= 2;
	Word[length] = '\0';
	if (length <= 2) {
	    WordInfo->Length = length;
	    return Word;
	}
    } else if (Word[length - 1] == '\'') {
	/* the boys' feet; James' hat;
	 * allowing a trailing quote after any letter lets us
	 * match things like "havin'"; stripping the letter maps
	 * "boy'" in "`this is my boy'" to "boy", which is what we want.
	 */
	if (Word[length - 2] == 's') {
	    WordInfo->WordPlace.Flags |= WPF_POSSESSIVE;
	}
	length -= 1;
	Word[length] = '\0';

	if (length <= 2) {
	    WordInfo->Length = length;
	    return Word;
	}
    }

    /** delete trailing plural suffix and mark plural */

    /* It's important to realise that the purpose of this routine is not
     * in any way to reduce a word to an etymological root.  In other words,
     * no attempt is made to differentiate between plurals and present
     * participles, or words that simply happen to end in `s'.
     * Hence, elephants, blunderbus, hostess, runs and tomatoes are all
     * candidates.  Of course, one would like to do as well as one can!
     * Again, the object isn't to derive the correct singular, but instead
     * to be fairly fast, and, above all, to ensure that any transformations
     * are reversible!
     *
     * The result is that I can store dog and dogs in the same Wordinfo
     * chain.  In the case that either word is unusual, there is a space
     * saving of (on average) 30 or so bytes.  More usefully, if you ask
     * for `Window', I will automatically find `Windows' as well.
     *
     * so...
     * XXXo, XXXss, XXXsh, XXXch, XXXx --> +es
     * 	except: pianos, dynamos, photos
     * XXCy --> XXCies [ C consonant]
     * XXVy --> XXVys [ V vowel ]
     * f or fe --> ves (12 cases only)
     * vowel change:
     * foot/feet (why bother with these? -- use a thesaurus!)
     * need to keep penny/pence separate
     * See Thomson & Martinet, section 8ff (I think)
     */

    if (length <= 2) {
	return WordInfo->Word;
    }

    switch (Word[length - 1]) {
    case 's':
	/* Mark the word as plural, but then be prepared to unmark it later: */
	WordInfo->WordPlace.Flags |= WPF_WASPLURAL;

	switch (Word[length - 2]) {
	case 'e':
	    if (length >= 3) {
		switch (Word[length - 3]) {
		case 'i': /* xxcies --> xxxy */
		    /* flies -> fly
		     * lies -> lie
		     * dies -> die
		     */
		    if (length > 4) {
			Word[length - 3] = 'y';
			length -= 2;
		    } else { /* ies not a plural, but lies is :-) */
			length--; /* just the s */
		    }
		    break;
		case 's':
		    /* houses cases hisses */
		    if (length >= 5) {
			if (Word[length - 4] == 's') {
			    /* hisses */
			    length -= 2;
			} else {
			    length -= 1; /* keep the e */
			}
		    } else {
			/* uses --> use */
			length -= 1;
		    }
		    break;
		case 'x': /* foxes, boxes */
		    length -= 2;
		    break;
		case 'h':
		case 'o': /* xxxoes --> xxx */
		    if (length > 5) {
			length -= 2;
		    } else {
			/* hoes -> hoe, toes -> toe, ches -> che?? */
			length -= 1;
		    }
		    break;
		case 'v':
		    /* selves, shelves */
		    if (length >= 5) {
			switch (Word[length - 4]) {
			case 'i': /* wives, lives, knives, connives */
			    switch (Word[length - 5]) {
			    case 'w': case 'l': case 'n':
				length -= 1;
				Word[length - 2] = 'f';
				break;
			    default:
				length -= 1;
				break;
			    }
			    break;
			case 'l':
			    length -= 2;
			    Word[length - 1] = 'f';
			    break;
			case 'a':
			    if (Word[length - 5] == 'e' &&
				length >= 6
			    ) {
				if (Word[length - 6] == 'l') {
				    /* leaves/leaf */
				    length -= 2;
				    Word[length - 1] = 'f';
				} else if (
				    Word[length - 6] == 'h' &&
				    length >= 7 &&
				    Word[length - 7] == 's'
				) {
				    /* sheaves, is this worth it?? */
				    length -= 2;
				    Word[length - 1] = 'f';
				} else {
				    /* weaves/weave, etc */
				    length--;
				}
			    } else {
				/* saves/save, etc */
				length--;
			    }
			    break;
			default:
			    /* e.g. delves/delve, groves/grove, grooves, etc */
			    length--;
			}
		    } else {
			length -= 1;
		    }
		    break;
		default: /* xxxes -> xxxe */
		    length -= 1;
		    break;
		}
	    } else { /* too short */
		WordInfo->WordPlace.Flags &=
			(unsigned char)~(unsigned char)WPF_WASPLURAL;
	    }
	    break;
	case 'y': /* xxxvys --> xxxvy */
	    if (length >= 3) {
		switch (Word[length - 3]) { /* e.g. holidays */
		case 'a': /* flays */
		case 'e': /* beys */
		case 'i': /* ??iys?? */ /* (doesn't occur in the S.O.D.) */
		case 'o': /* boys */
		case 'u': /* guys */
		    length--; /* just remove the s */
		    break;
		default: /* probably not a plural, e.g. gonys, dys, chlamys */
		    WordInfo->WordPlace.Flags &=
			    (unsigned char)~(unsigned char)WPF_WASPLURAL;
		}
	    } else { /* too short to be a plural, i.e. "ys" */
		WordInfo->WordPlace.Flags &=
			(unsigned char)~(unsigned char)WPF_WASPLURAL;
	    }
	    break;
	case 's': /* trailing ss doesn't mark a plural! */
	    WordInfo->WordPlace.Flags &=
			    (unsigned char)~(unsigned char)WPF_WASPLURAL;
	    break;
	case 'u':
	    /* ONE bus, thus, omnibus; TWO gnus, TWO emus */
	    /* So it doesn't work for gnus and emus right now! */
	    WordInfo->WordPlace.Flags &=
		    (unsigned char)~(unsigned char)WPF_WASPLURAL;
	    break;
	case 'i': /* not a plural.. this, his, fleur-de-lis */
	    WordInfo->WordPlace.Flags &=
		    (unsigned char)~(unsigned char)WPF_WASPLURAL;
	    break;
	case 'a': /* has */
	case 'o': /* cos */
	    if (length < 4) {
		WordInfo->WordPlace.Flags &=
			(unsigned char)~(unsigned char)WPF_WASPLURAL;
		break;
	    }
	    /* else fall through */
	default: /* just plain s */
	    length--;
	    break;
	}
	Word[length] = '\0';
	break;
    case 'i':
	switch (Word[length - 2]) {
#ifndef MUST_NOT_GET_LONGER
	case 'm': /* happypotamus/-i */
	    /* mimi unchanged
	     * ami, Ammi, demi, semi unchanged
	     */
	case 'n':
	    /* terminus/termini
	     * bikini, linguini, mini unchanged
	     */
	case 'i':
	    /* radii -> radius
	     * xviii -> xviii
	     * mdii -> mdii (oops)
	     * ascii unchanged
	     * genii unchanged (*not* -> genius!)
	     */
	    WordInfo->WordPlace.Flags |= WPF_WASPLURAL;
	    Word[length - 1] = 'u';
	    Word[length] = 's';
	    Word[length + 1] = '\0';
	    length++;
	    break;
#endif
	default:
	    /* not a plural, jut happens to end in an i */
	    WordInfo->WordPlace.Flags &=
		    (unsigned char)~(unsigned char)WPF_WASPLURAL;
	}
	break;
    case 'n': /* men, women */
	if (WordInfo->Word[length - 2] == 'e') {
	    if (WordInfo->Word[length - 3] == 'm') {
		WordInfo->WordPlace.Flags |= WPF_WASPLURAL;
		Word[length - 2] = 'a';
	    } else if (WordInfo->Word[length - 3] == 'r') {
		if (length >= 8) {
		    if (
			STREQ(&WordInfo->Word[length - 8], "children")
		    ) {
			WordInfo->WordPlace.Flags |= WPF_WASPLURAL;
			length -= 3;
			Word[length] = '\0';
		    } else if (
			STREQ(&WordInfo->Word[length - 8], "brethren")
		    ) {
			WordInfo->WordPlace.Flags |= WPF_WASPLURAL;
			length -= 1;
			    /* brethren ->
			     * brother
			     * 7654321
			     */
			Word[length - 5] = 'o';
			Word[length - 2] = 'e';
			Word[length - 1] = 'r';
			Word[length] = '\0';
		    }
		}
	    }
	}
    case 't':
	/* feet -> foot */
	if (length >= 4 &&
	    WordInfo->Word[length - 2] == 'e' &&
	    WordInfo->Word[length - 3] == 'e' &&
	    WordInfo->Word[length - 4] == 'f'
	) {
	    WordInfo->WordPlace.Flags |= WPF_WASPLURAL;
	    Word[length - 2] = 'o';
	    Word[length - 3] = 'o';
	}
#ifndef MUST_NOT_GET_LONGER
    case 'e':
	/* mice/mouse, lice/louse */
	if (length >= 4 &&
	    WordInfo->Word[length - 1] == 'c' &&
	    WordInfo->Word[length - 2] == 'i' &&
	    (
		WordInfo->Word[length - 3] == 'm' ||
		WordInfo->Word[length - 3] == 'l'
	    )
	) {
	    WordInfo->Word[length - 2] = 'o';
	    WordInfo->Word[length - 1] = 'u';
	    WordInfo->Word[length - 0] = 's';
	    WordInfo->Word[length + 1] = 'e';
	    length++;
	}
	break;
    }
#endif
    WordInfo->Length = length;
    return WordInfo->Word;
}

/* <Function>
 *   <Name>LQT_GenerateWordFromRoot
 *   <Class>Language/Stemming
 *   <Purpose>
 *	LQT_GenerateWordFromRoot tries to generate the original word from the
 *	given flags.
 *	Sometimes multiple plurals reduce to the same singular,
 *	such as brothers and brethren both being forms of brother, and in
 *	these cases the generated word may be incorrect.  Other cases
 *	include words ending in the letter o, which may or may not have
 *	has an es stripped off, so that SunOS (the operating system) is
 *	indexed as Suno, and incorrectly pluralised as Sunoes.
 *   <Returns>
 *      A pointer to a static buffer
 *   <Bugs>
 *	Should allow per-database stemming options.
 *   <SeeAlso>
 *	LQT_ReduceWordToRoot, LQT_WordToWID, LQT_WIDToWord
 * </Function>
 */
API char *
LQT_GenerateWordFromRoot(db, WordInfo, Flags)
    t_LQTEXT_Database *db;
    t_WordInfo *WordInfo;
    unsigned int Flags;
{
    static char *Buffer = 0;
    register char *p, *q;
    int Length;
    
    if (!Buffer) {
	Buffer = emalloc("stemmer", db->MaxWordLength + 5); /* 's+es+\0 */
    }

    if (!WordInfo || !WordInfo->Word || !WordInfo->Word[0]) {
	Error(E_FATAL|E_BUG,
	    "unFlag called with empty word [%s:%d]",
	    __FILE__, __LINE__
	);
    }

    p = Buffer;
    q = WordInfo->Word;
    while ((*p++ = *q++) != '\0') {
	/*NULLBODY*/
    }
    *p = '\0';
    
    if ((Length = (p - Buffer) - 1) != WordInfo->Length) {
	/* Well, maybe I can't count */
	Error(E_WARN|E_INTERNAL,
	    "%s: %d: LQT_GenerateWordFromRoot(%*.*s, %d), length %d != %d",
	    __FILE__, __LINE__,
	    WordInfo->Length,
	    WordInfo->Length,
	    WordInfo->Word,
	    Flags,
	    WordInfo->Length,
	    Length
	);
	WordInfo->Length = Length = strlen(Buffer);
    }

    if (Flags & WPF_WASPLURAL) {
	if (Length >= 2) switch (Buffer[Length - 1]) {
	case 'y':
	    if (Length > 1) switch (Buffer[Length - 2]) {
	    case 'a':
	    case 'e':
	    case 'i':
	    case 'o':
	    case 'u':
		Buffer[Length++] = 's'; /* e.g. days */
		break;
	    default:
		strcpy(&Buffer[Length - 1], "ies"); /* ladies */
		Length += 2;
	    }
	    break;
	case 'd':
	    if (Length >= 5) {
		/* child -> chidren */
		if (STREQ(&Buffer[Length - 5], "child")) {
		    if (Length < db->MaxWordLength) {
			Buffer[Length++] = 'r';
			if (Length < db->MaxWordLength) {
			    Buffer[Length++] = 'e';
			    if (Length < db->MaxWordLength) {
				Buffer[Length++] = 'n';
			    }
			}
		    }
		}
	    }
	    break;
	case 'n':
	    /* man -> men, woman -> women */
	    if (Length >= 3 &&
		Buffer[Length - 2] == 'a' &&
		Buffer[Length - 3] == 'm'
	    ) {
		Buffer[Length - 2] = 'e';
	    }
	    break;
	case 's':
	    if (Length > 2) {
		if (Buffer[Length - 2] == 'u' && Length >= 6) {
		    strcpy(&Buffer[Length - 1], "ii"); /* Genii */
		    break;
		}
	    } /* else fall through... */
	case 'o':
	case 'h':
	case 'x':
	    strcat(Buffer, "es");
	    Length += 2;
	    break;
	case 't':
	    if (Length >= 4 && !STRNCMP(&Buffer[Length - 4], "foot", 4)) {
		Buffer[Length - 2] = 'e';
		Buffer[Length - 3] = 'e';
	    }
	    break;
	/* TODO wolf/wolves, shelf, wife, mouse */
	default:
	    Buffer[Length++] = 's';
	}
	Buffer[Length] = '\0';
    }

    if (Flags & WPF_POSSESSIVE) {
	Buffer[Length++] = '\'';
	if (Buffer[Length - 2] != 's') {
	    Buffer[Length++] = 's';
	}
	Buffer[Length] = '\0';
    }

    if (Flags & WPF_UPPERCASE) {
	Buffer[0] = toupper(Buffer[0]);
    }

    return Buffer;
}

/** **/