/* Phrase.c -- Copyright 1989, 1994, 1995, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* * Deal with (WID, FID, Offfset) triples * Liam Quin, September 1989 * * $Id: phrall.c,v 1.4 2001/05/31 03:50:13 liam Exp $ * */ #include "error.h" #include "globals.h" /* defines and declarations for database filenames */ #include /* stderr, also for fileinfo.h */ #include #include #ifdef HAVE_FCNTL_H # ifdef HAVE_SYSV_FCNTL_H # include # endif # include #endif #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #include "fileinfo.h" /* for wordinfo.h */ #include "wordinfo.h" #include "pblock.h" #include "phrase.h" #include "wordrules.h" #include "emalloc.h" #include "lqutil.h" #include "liblqtext.h" #include "lqtrace.h" /** Unix system calls that need to be declared: **/ /** Unix/C Library Functions: **/ /** lqtext functions: **/ /** functions within this file that need forward declarations **/ /** **/ /* * LQT_AllPhrasesOfLengthNOrMore * Retrieval/Matching, Retrieval/Phrases * *

Finds all sequences of N or more words which occur in the data. * For example, given the phrase `the barefooted boy was very slender', * and supposing `the' to be the only word for which LQT_WordIsInStopList * returns true, LQT_AllPhrasesOfLengthNOrMore might find * `barefooted boy' and `boy was very' and `very slender' as * sub-phrases that occur; if the entire phrase occurs, it will * be returned.

*

If a phrase of M words matches, all phrases of lengths from N to * M inclusive will also be returned.

*

It is the caller's responsibility to deallocate the returned array * and its elements.

* * Returns an array of t_PhraseElement structures, and stores the number of * distinct phrases found in *Countp. * * This function is experimental. It has not been optimised, and * is currently unusable for long phrases as a result. *
*/

/*
 * LQT_AllPhrasesOfLengthNOrMore -- match every run of N or more
 * consecutive indexed words of theQuery against the database.
 *
 * db        database handle, passed through to the LQT_* routines.
 * N         minimum number of words in a candidate phrase; values
 *           smaller than 1 are treated as 1.
 * theQuery  NUL-terminated query text.  It is not modified, but the
 *           PhraseStart/PhraseEnd members of the result point into it,
 *           so it must outlive the returned array.
 * Countp    receives the number of elements stored in the result.
 *
 * Returns an emalloc'd array of *Countp t_PhraseElement entries, or a
 * null pointer (with *Countp set to 0) when nothing matched.  Each entry
 * owns the t_Phrase stored in its Phrase member.
 *
 * Candidates are tried per start word, shortest first; as soon as one
 * candidate has no matches, longer candidates with the same start word
 * are skipped.
 */
API t_PhraseElement *
LQT_AllPhrasesOfLengthNOrMore(db, N, theQuery, Countp)
    t_LQTEXT_Database *db;
    int N;
    char *theQuery;
    long *Countp;
{
    long AllocatedElements = 0;
    t_PhraseElement *Result = 0;
    char **WordStarts = 0;
    char *tmpPhrase = 0;
    int WordsInPhrase = 0;
    int WordsAllocated = 0;
    int MinWords;
    int i;

    /* A phrase has at least one word.  Clamping also keeps the index
     * arithmetic below from going negative when N is zero or negative.
     */
    MinWords = (N < 1) ? 1 : N;

    WordsAllocated = 30; /* a guess */
    WordStarts = (char **) emalloc(
        "LQT_AllPhrasesOfLengthNOrMore:WordStarts",
        WordsAllocated * sizeof(char *)
    );

    /* reset ReadWord: */
    (void) LQT_ReadWordFromStringPointer(db, 0, 0, 0, 0);

    /* Read the query a word at a time, remembering where each word
     * that the reader returns starts within theQuery:
     */
    {
        t_WordInfo *W;
        char *Start;
        char *Phrase;

        for (Phrase = theQuery; *Phrase; ) {
            W = LQT_ReadWordFromStringPointer(
                db,
                &Phrase,
                &Start,
                0,
                LQT_READWORD_IGNORE_COMMON
            );

            if (!W) {
                break;
            }

            if (WordsInPhrase + 1 > WordsAllocated) {
                WordsAllocated += 30;
                WordStarts = (char **) erealloc(
                    (char *) WordStarts,
                    WordsAllocated * sizeof(char *)
                );
            }

            WordStarts[WordsInPhrase] = Start;
            ++WordsInPhrase;
        }
    }

    *Countp = 0L;

    if (WordsInPhrase < MinWords) {
        LQT_Trace(LQTRACE_DEBUG|LQTRACE_MAKE_PHRASE,
            "Phrase [%s] length %d, hence contains no phrases of length %d",
            theQuery,
            WordsInPhrase,
            MinWords
        );
        /* the word-start table is private to this call */
        (void) efree((char *) WordStarts);
        return (t_PhraseElement *) 0;
    }

    /* One scratch buffer is enough for every candidate: no candidate is
     * longer than the query itself.
     * NOTE(review): this buffer is released before returning, which
     * assumes LQT_StringToPhrase keeps no pointer into the string it is
     * given.  The buffer was already being overwritten between calls,
     * so that assumption is not new -- but please confirm.
     */
    tmpPhrase = emalloc(
        "copy of subset of phrase",
        strlen(theQuery) + 1
    );

    /* For each word that can start a phrase of at least MinWords words.
     * (Previously the bounds were off by one, so the shortest candidate
     * had N + 1 words and a query of exactly N words produced nothing.)
     */
    for (i = 0; i + MinWords <= WordsInPhrase; i++) {
        int endWord;

        /* endWord is the index of the last word of the candidate, so
         * the candidate spans endWord - i + 1 words.
         */
        for (endWord = i + MinWords - 1; endWord < WordsInPhrase; endWord++) {
            t_Phrase *P;
            long matchCount;

            if (endWord + 1 >= WordsInPhrase) {
                /* candidate runs to the end of the query */
                (void) strcpy(tmpPhrase, WordStarts[i]);
            } else {
                /* copy up to, but not including, the next word */
                long Length = (long) (WordStarts[endWord + 1] - WordStarts[i]);

                (void) strncpy(tmpPhrase, WordStarts[i], Length);
                tmpPhrase[Length] = '\0';
            }

            /* Match the phrase.
             * TODO: discard matches which start at the same place as a
             * match we already have for this phrase.  E.g. if we have
             * `how beautiful are the feet of them that stand on the'
             * for a given document, we don't also want
             *     how beautiful
             *     how beautiful are
             *     how beautiful are the
             *     how beautiful are the feet
             *     beautiful are
             * etc. for the same file.  This is not implemented here.
             */
            P = LQT_StringToPhrase(db, tmpPhrase);
            matchCount = LQT_MakeMatches(db, P);

            if (!matchCount) {
                /* no match -> give up on this start word */
                LQT_DestroyPhrase(db, P);
                break;
            }

            /* match -> make room for one more element and record it */
            if (!Result) {
                AllocatedElements = 20;
                Result = (t_PhraseElement *) emalloc(
                    "One more Phrase Element",
                    AllocatedElements * sizeof(t_PhraseElement)
                );
            } else if (*Countp + 2 >= AllocatedElements) {
                AllocatedElements += 100;
                Result = (t_PhraseElement *) erealloc(
                    (char *) Result,
                    AllocatedElements * sizeof(t_PhraseElement)
                );
            }

            /* Start and end are expressed as pointers into theQuery */
            Result[*Countp].PhraseStart = WordStarts[i];
            Result[*Countp].PhraseEnd =
                &(Result[*Countp].PhraseStart)[strlen(tmpPhrase)];
            Result[*Countp].Phrase = P;

            ++*Countp;
        }
    } /* for each word in the phrase */

    /* Scratch storage is no longer needed; the result elements point
     * into theQuery, not into either of these buffers.
     */
    (void) efree(tmpPhrase);
    (void) efree((char *) WordStarts);

    return Result;
}