/* Root.c -- Copyright 1989, 1992-1995, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. * * Root.c is responsible for morphology -- word stemming, plurals, etc. * This file is due for dramatic changes with configurable stemmers. * * The current approach tries to be a 90% solution -- it gets the singluar * or most words right, and sometimes goes wrong reversing the operation. * * $Id: root.c,v 2.19 1996/08/20 18:13:18 lee Exp lee $ * */ #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include #include #include /* needed for filinfo.h */ #ifdef HAVE_STRING_H #include #else #include #endif #include "fileinfo.h" #include "wordinfo.h" #include "wordrules.h" #include "emalloc.h" #include "liblqtext.h" /** Unix system calls that need to be declared: **/ /* (none) */ /** C Library functions that nees to be declared: **/ #ifndef toupper extern int toupper( #ifdef HAVE_PROTO int theChar #endif ); #endif /** lqtext functions that need to be declared: **/ /** Functions from this file that need to be declared: **/ /** **/ /* * LQT_ReduceWordToRoot * Language/Stemming * * Reduces the word in the WordInfo pointed to by its argument to an * English root, by stripping plurals and possessives. * WordInfo->Length is modified as necessary, and WordInfo->Flags * are updated by or'ing any necessary items from wordrules.h. * The word can grow by up to two characters in length. * It is the caller's responsibility to allocate enough space. * You can also use the WORDROOT macro from wordrules.h, * which calls LQT_ReduceWordToRoot only if it might make a change. * * A pointer to WordInfo's Word * * This routine is only sensible for English. * * LQT_ReadWordFromStringPointer * */ API char * LQT_ReduceWordToRoot(db, WordInfo) t_LQTEXT_Database *db; t_WordInfo *WordInfo; { unsigned char *Word = WordInfo->Word; register int length = WordInfo->Length; if (!Word) { Error(E_FATAL|E_BUG, "LQT_ReduceWordToRoot() called with null word" ); } if (!*Word) { Error(E_WARN|E_BUG, "LQT_ReduceWordToRoot() called with empty word" ); return Word; } Word[length] = '\0'; if (db->ConvertNumbers == 1 && LQT_ISDIGIT(db, (unsigned int) *Word)) { /* maybe it's a number... */ unsigned long n = 0; int base = 10; unsigned char *p = WordInfo->Word; #define ishex(c) \ (base == 16 && LQT_TOLOWER(db, c) >= 'a' && LQT_TOLOWER(db, c) <= 'f') if (*p == '0') { if (length == 1) return Word; if (Word[1] == 'x' || Word[1] == 'X') { /* 0xFF -- C notation for hexadecimal; * Note that we can't use 16#ff, since the # is * not a word character... we'd have to do this * in ReadWord instead. */ p = &Word[2]; if (!LQT_ISDIGIT(db, *p) && !(LQT_TOLOWER(db, *p) >= 'a' && LQT_TOLOWER(db, *p) <= 'f') ) { goto NotANumber; } base = 16; } else { base = 8; p++; if (!LQT_ISDIGIT(db, *p)) { goto NotANumber; } else { register char *q; for (q = p; *q; q++) { if (LQT_ISDIGIT(db, *q)) { if (*q > '7') { base = 10; break; } } else { break; } } } } } for (;;) { if (LQT_ISDIGIT(db, *p)) { n *= base; n += *p - '0'; } else if (ishex(*p)) { n *= base; n += 10 + LQT_TOLOWER(db, *p) - 'a'; } else { /* end of the number */ break; } p++; } /* if p == Word, it's not a number */ if (p > Word) { if (*p == '\0') { (void) sprintf(Word, "%ld", n); WordInfo->Length = strlen(Word); return Word; } else { char buffer[50]; unsigned int len; /* Not a number after all?? * or maybe a number and a suffix, as in * 6ft 4in */ /* if it was a hex number, it might have got longer. * In that case, if the result is too long, we * have to truncate properly */ (void) sprintf(buffer, "%ld", n); len = strlen(buffer); if (len != p - Word) { /* move the rest of the word into the right place. * Note that this assumes it fits!!!! */ bcopy(p, &Word[len], length - (p - Word)); } (void) memcpy(Word, buffer, len); /* fall through in case the suffix needs to bre reduced, * e.g. as in "100feet" or "077inches", which turns * into "63inch" taking the 077 as octal... */ length = WordInfo->Length = strlen(Word); } } } /* end of it's-a-number */ NotANumber: if (length <= 2) { if (Word[length - 1] == '\'') { WordInfo->Length -= 1; Word[WordInfo->Length] = '\0'; } return Word; } /* ASSERT: length > 2 */ /** delete trailing <'s> or s<'> and mark possessive */ if (Word[length - 1] == 's' && Word[length - 2] == '\'') { /* The boy's feet; John's hat */ WordInfo->WordPlace.Flags |= WPF_POSSESSIVE; length -= 2; Word[length] = '\0'; if (length <= 2) { WordInfo->Length = length; return Word; } } else if (Word[length - 1] == '\'') { /* the boys' feet; James' hat; * allowing a trailing quote after any letter lets us * match things like "havin'"; stripping the letter maps * "boy'" in "`this is my boy'" to "boy", which is what we want. */ if (Word[length - 2] == 's') { WordInfo->WordPlace.Flags |= WPF_POSSESSIVE; } length -= 1; Word[length] = '\0'; if (length <= 2) { WordInfo->Length = length; return Word; } } /** delete trailing plural suffix and mark plural */ /* It's important to realise that the purpose of this routine is not * in any way to reduce a word to an etymological root. In other words, * no attempt is made to differentiate between plurals and present * participles, or words that simply happen to end in `s'. * Hence, elephants, blunderbus, hostess, runs and tomatoes are all * candidates. Of course, one would like to do as well as one can! * Again, the object isn't to derive the correct singular, but instead * to be fairly fast, and, above all, to ensure that any transformations * are reversible! * * The result is that I can store dog and dogs in the same Wordinfo * chain. In the case that either word is unusual, there is a space * saving of (on average) 30 or so bytes. More usefully, if you ask * for `Window', I will automatically find `Windows' as well. * * so... * XXXo, XXXss, XXXsh, XXXch, XXXx --> +es * except: pianos, dynamos, photos * XXCy --> XXCies [ C consonant] * XXVy --> XXVys [ V vowel ] * f or fe --> ves (12 cases only) * vowel change: * foot/feet (why bother with these? -- use a thesaurus!) * need to keep penny/pence separate * See Thomson & Martinet, section 8ff (I think) */ if (length <= 2) { return WordInfo->Word; } switch (Word[length - 1]) { case 's': /* Mark the word as plural, but then be prepared to unmark it later: */ WordInfo->WordPlace.Flags |= WPF_WASPLURAL; switch (Word[length - 2]) { case 'e': if (length >= 3) { switch (Word[length - 3]) { case 'i': /* xxcies --> xxxy */ /* flies -> fly * lies -> lie * dies -> die */ if (length > 4) { Word[length - 3] = 'y'; length -= 2; } else { /* ies not a plural, but lies is :-) */ length--; /* just the s */ } break; case 's': /* houses cases hisses */ if (length >= 5) { if (Word[length - 4] == 's') { /* hisses */ length -= 2; } else { length -= 1; /* keep the e */ } } else { /* uses --> use */ length -= 1; } break; case 'x': /* foxes, boxes */ length -= 2; break; case 'h': case 'o': /* xxxoes --> xxx */ if (length > 5) { length -= 2; } else { /* hoes -> hoe, toes -> toe, ches -> che?? */ length -= 1; } break; case 'v': /* selves, shelves */ if (length >= 5) { switch (Word[length - 4]) { case 'i': /* wives, lives, knives, connives */ switch (Word[length - 5]) { case 'w': case 'l': case 'n': length -= 1; Word[length - 2] = 'f'; break; default: length -= 1; break; } break; case 'l': length -= 2; Word[length - 1] = 'f'; break; case 'a': if (Word[length - 5] == 'e' && length >= 6 ) { if (Word[length - 6] == 'l') { /* leaves/leaf */ length -= 2; Word[length - 1] = 'f'; } else if ( Word[length - 6] == 'h' && length >= 7 && Word[length - 7] == 's' ) { /* sheaves, is this worth it?? */ length -= 2; Word[length - 1] = 'f'; } else { /* weaves/weave, etc */ length--; } } else { /* saves/save, etc */ length--; } break; default: /* e.g. delves/delve, groves/grove, grooves, etc */ length--; } } else { length -= 1; } break; default: /* xxxes -> xxxe */ length -= 1; break; } } else { /* too short */ WordInfo->WordPlace.Flags &= (unsigned char)~(unsigned char)WPF_WASPLURAL; } break; case 'y': /* xxxvys --> xxxvy */ if (length >= 3) { switch (Word[length - 3]) { /* e.g. holidays */ case 'a': /* flays */ case 'e': /* beys */ case 'i': /* ??iys?? */ /* (doesn't occur in the S.O.D.) */ case 'o': /* boys */ case 'u': /* guys */ length--; /* just remove the s */ break; default: /* probably not a plural, e.g. gonys, dys, chlamys */ WordInfo->WordPlace.Flags &= (unsigned char)~(unsigned char)WPF_WASPLURAL; } } else { /* too short to be a plural, i.e. "ys" */ WordInfo->WordPlace.Flags &= (unsigned char)~(unsigned char)WPF_WASPLURAL; } break; case 's': /* trailing ss doesn't mark a plural! */ WordInfo->WordPlace.Flags &= (unsigned char)~(unsigned char)WPF_WASPLURAL; break; case 'u': /* ONE bus, thus, omnibus; TWO gnus, TWO emus */ /* So it doesn't work for gnus and emus right now! */ WordInfo->WordPlace.Flags &= (unsigned char)~(unsigned char)WPF_WASPLURAL; break; case 'i': /* not a plural.. this, his, fleur-de-lis */ WordInfo->WordPlace.Flags &= (unsigned char)~(unsigned char)WPF_WASPLURAL; break; case 'a': /* has */ case 'o': /* cos */ if (length < 4) { WordInfo->WordPlace.Flags &= (unsigned char)~(unsigned char)WPF_WASPLURAL; break; } /* else fall through */ default: /* just plain s */ length--; break; } Word[length] = '\0'; break; case 'i': switch (Word[length - 2]) { #ifndef MUST_NOT_GET_LONGER case 'm': /* happypotamus/-i */ /* mimi unchanged * ami, Ammi, demi, semi unchanged */ case 'n': /* terminus/termini * bikini, linguini, mini unchanged */ case 'i': /* radii -> radius * xviii -> xviii * mdii -> mdii (oops) * ascii unchanged * genii unchanged (*not* -> genius!) */ WordInfo->WordPlace.Flags |= WPF_WASPLURAL; Word[length - 1] = 'u'; Word[length] = 's'; Word[length + 1] = '\0'; length++; break; #endif default: /* not a plural, jut happens to end in an i */ WordInfo->WordPlace.Flags &= (unsigned char)~(unsigned char)WPF_WASPLURAL; } break; case 'n': /* men, women */ if (WordInfo->Word[length - 2] == 'e') { if (WordInfo->Word[length - 3] == 'm') { WordInfo->WordPlace.Flags |= WPF_WASPLURAL; Word[length - 2] = 'a'; } else if (WordInfo->Word[length - 3] == 'r') { if (length >= 8) { if ( STREQ(&WordInfo->Word[length - 8], "children") ) { WordInfo->WordPlace.Flags |= WPF_WASPLURAL; length -= 3; Word[length] = '\0'; } else if ( STREQ(&WordInfo->Word[length - 8], "brethren") ) { WordInfo->WordPlace.Flags |= WPF_WASPLURAL; length -= 1; /* brethren -> * brother * 7654321 */ Word[length - 5] = 'o'; Word[length - 2] = 'e'; Word[length - 1] = 'r'; Word[length] = '\0'; } } } } case 't': /* feet -> foot */ if (length >= 4 && WordInfo->Word[length - 2] == 'e' && WordInfo->Word[length - 3] == 'e' && WordInfo->Word[length - 4] == 'f' ) { WordInfo->WordPlace.Flags |= WPF_WASPLURAL; Word[length - 2] = 'o'; Word[length - 3] = 'o'; } #ifndef MUST_NOT_GET_LONGER case 'e': /* mice/mouse, lice/louse */ if (length >= 4 && WordInfo->Word[length - 1] == 'c' && WordInfo->Word[length - 2] == 'i' && ( WordInfo->Word[length - 3] == 'm' || WordInfo->Word[length - 3] == 'l' ) ) { WordInfo->Word[length - 2] = 'o'; WordInfo->Word[length - 1] = 'u'; WordInfo->Word[length - 0] = 's'; WordInfo->Word[length + 1] = 'e'; length++; } break; } #endif WordInfo->Length = length; return WordInfo->Word; } /* * LQT_GenerateWordFromRoot * Language/Stemming * * LQT_GenerateWordFromRoot tries to generate the original word from the * given flags. * Sometimes multiple plurals reduce to the same singular, * such as brothers and brethren both being forms of brother, and in * these cases the generated word may be incorrect. Other cases * include words ending in the letter o, which may or may not have * has an es stripped off, so that SunOS (the operating system) is * indexed as Suno, and incorrectly pluralised as Sunoes. * * A pointer to a static buffer * * Should allow per-database stemming options. * * LQT_ReduceWordToRoot, LQT_WordToWID, LQT_WIDToWord * */ API char * LQT_GenerateWordFromRoot(db, WordInfo, Flags) t_LQTEXT_Database *db; t_WordInfo *WordInfo; unsigned int Flags; { static char *Buffer = 0; register char *p, *q; int Length; if (!Buffer) { Buffer = emalloc("stemmer", db->MaxWordLength + 5); /* 's+es+\0 */ } if (!WordInfo || !WordInfo->Word || !WordInfo->Word[0]) { Error(E_FATAL|E_BUG, "unFlag called with empty word [%s:%d]", __FILE__, __LINE__ ); } p = Buffer; q = WordInfo->Word; while ((*p++ = *q++) != '\0') { /*NULLBODY*/ } *p = '\0'; if ((Length = (p - Buffer) - 1) != WordInfo->Length) { /* Well, maybe I can't count */ Error(E_WARN|E_INTERNAL, "%s: %d: LQT_GenerateWordFromRoot(%*.*s, %d), length %d != %d", __FILE__, __LINE__, WordInfo->Length, WordInfo->Length, WordInfo->Word, Flags, WordInfo->Length, Length ); WordInfo->Length = Length = strlen(Buffer); } if (Flags & WPF_WASPLURAL) { if (Length >= 2) switch (Buffer[Length - 1]) { case 'y': if (Length > 1) switch (Buffer[Length - 2]) { case 'a': case 'e': case 'i': case 'o': case 'u': Buffer[Length++] = 's'; /* e.g. days */ break; default: strcpy(&Buffer[Length - 1], "ies"); /* ladies */ Length += 2; } break; case 'd': if (Length >= 5) { /* child -> chidren */ if (STREQ(&Buffer[Length - 5], "child")) { if (Length < db->MaxWordLength) { Buffer[Length++] = 'r'; if (Length < db->MaxWordLength) { Buffer[Length++] = 'e'; if (Length < db->MaxWordLength) { Buffer[Length++] = 'n'; } } } } } break; case 'n': /* man -> men, woman -> women */ if (Length >= 3 && Buffer[Length - 2] == 'a' && Buffer[Length - 3] == 'm' ) { Buffer[Length - 2] = 'e'; } break; case 's': if (Length > 2) { if (Buffer[Length - 2] == 'u' && Length >= 6) { strcpy(&Buffer[Length - 1], "ii"); /* Genii */ break; } } /* else fall through... */ case 'o': case 'h': case 'x': strcat(Buffer, "es"); Length += 2; break; case 't': if (Length >= 4 && !STRNCMP(&Buffer[Length - 4], "foot", 4)) { Buffer[Length - 2] = 'e'; Buffer[Length - 3] = 'e'; } break; /* TODO wolf/wolves, shelf, wife, mouse */ default: Buffer[Length++] = 's'; } Buffer[Length] = '\0'; } if (Flags & WPF_POSSESSIVE) { Buffer[Length++] = '\''; if (Buffer[Length - 2] != 's') { Buffer[Length++] = 's'; } Buffer[Length] = '\0'; } if (Flags & WPF_UPPERCASE) { Buffer[0] = toupper(Buffer[0]); } return Buffer; } /** **/