/* phstring.c -- Copyright 1989, 1994-1996 Liam R. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* phstring.c -- defines LQT_StringToPhrase() * $Id: phstring.c,v 1.10 2001/05/31 03:50:13 liam Exp $ * */ #include "error.h" #include "globals.h" /* defines and declarations for database filenames */ #ifdef HAVE_STDLIB_H # include #else # include #endif #include /* stderr, also for fileinfo.h */ #include #ifdef HAVE_FCNTL_H # ifdef HAVE_SYSV_FCNTL_H # include # endif # include #endif #ifdef HAVE_STRING_H # include #else # include #endif #include #include "fileinfo.h" /* for wordinfo.h */ #include "wordinfo.h" #include "pblock.h" #include "phrase.h" #include "wordrules.h" #include "emalloc.h" #include "lqutil.h" #include "liblqtext.h" #include "lqtrace.h" /** Unix system calls that need to be declared: **/ /** Unix/C Library Functions: **/ /** lqtext functions: **/ /** functions within this file that need forward declarations **/ /** **/ /* * LQT_PhraseToString * Retrieval/Phrases * *

Returns a string representation of a phrase.

. *

This can be used for tracing, or to give users feedback about * how a phrase query was interpreted.

 *
 * Returns: a pointer to a freshly malloc'd string, which the caller should free.
 *
 * See also: LQT_StringToPhrase
 *
*/
/*
 * Implementation notes for LQT_PhraseToString:
 *
 * db      the database handle, passed through to LQT_GenerateWordFromRoot
 * Phrase  the phrase to render; its Words list is walked twice, once to
 *         size the buffer and once to fill it.
 *
 * The result is allocated with emalloc() and is NUL-terminated.
 *
 * Each word is written preceded by padding derived from its
 * WordPlace.StuffBefore count:
 *   - a '.' if the previous word had punctuation (LQT_WPF_LASTHADPUNCT),
 *     otherwise a single space for every word but the first;
 *   - 'x' characters standing in for a skipped common word
 *     (LQT_WPF_LASTWASCOMMON), then spaces for whatever remains.
 * After the last word a '.' is appended if LQT_WPF_NEXTHASPUNCT is set,
 * and " xxx" if LQT_WPF_NEXTISCOMMON is set.
 */
API char *
LQT_PhraseToString(db, Phrase)
    t_LQTEXT_Database *db;
    t_Phrase *Phrase;
{
    char *Result;
    unsigned int Length;
    t_PhraseItem *W;
    char *p;

    /* Pass 1: compute an upper bound on the space needed. */
    Length = 0;
    for (W = Phrase->Words; W; W = W->Next) {
        /* The second pass writes at most max(2, StuffBefore) characters
         * in front of a word (the worst case for StuffBefore == 0 is a
         * '.' followed by one space), and it uses StuffBefore whether or
         * not LQT_WPF_HASSTUFFBEFORE is set.  Reserving StuffBefore + 2
         * therefore always covers it; the few spare bytes are harmless.
         */
        Length += W->Word->WordPlace.StuffBefore + 2;

        Length += strlen(LQT_GenerateWordFromRoot(
            db,
            W->Word,
            W->Word->WordPlace.Flags
        ));

        if (!W->Next) {
            if (LQT_WPF_NEXTHASPUNCT(&W->Word->WordPlace)) {
                Length++; /* trailing punctuation */
            }
            if (LQT_WPF_NEXTISCOMMON(&W->Word->WordPlace)) {
                Length += 4; /* we'll append a trailing " xxx" */
            }
        }
    }
    Length++; /* for the nul */

    p = Result = emalloc("LQT_PhraseToString", Length);

    /* Pass 2: write the words and their padding. */
    for (W = Phrase->Words; W; W = W->Next) {
        int i;
        char *theWord;

        i = W->Word->WordPlace.StuffBefore;

        if (LQT_WPF_LASTHADPUNCT(&W->Word->WordPlace)) {
            *p++ = '.';
            if (!i) {
                i = 1; /* always leave a gap after the punctuation */
            } else {
                i--; /* the '.' used up one of the padding positions */
            }
        } else if (p > Result) {
            /* not the first word: separate it from the previous one */
            *p++ = ' ';
            if (i > 0) {
                --i;
            }
        }

        if (i > 1 && LQT_WPF_LASTWASCOMMON(&W->Word->WordPlace)) {
            /* Stand-in for the skipped common word; keep one position
             * back so that a space still precedes the word itself.
             */
            while (i > 1) {
                *p++ = 'x';
                --i;
            }
        }

        while (i-- > 0) {
            *p++ = ' ';
        }

        theWord = LQT_GenerateWordFromRoot(
            db,
            W->Word,
            W->Word->WordPlace.Flags
        );

        /* strcpy, not strcat: the buffer beyond p has not been written
         * yet, so there is no terminator there for strcat to find.
         */
        (void) strcpy(p, theWord);
        p += strlen(theWord);

        if (!W->Next) {
            if (LQT_WPF_NEXTHASPUNCT(&W->Word->WordPlace)) {
                *p++ = '.';
            }
            if (LQT_WPF_NEXTISCOMMON(&W->Word->WordPlace)) {
                *p++ = ' ';
                *p++ = 'x';
                *p++ = 'x';
                *p++ = 'x';
            }
        }
    }

    *p = '\0';

    return Result;
}

/*
 * LQT_StringToPhrase
 * Retrieval/Phrases
 *
 *

Creates a data structure representing the natural language phrase * contained in the given String.

*

Words in the phrase that could not possibly be in the index are not * included in the structure. This could be because they are * in the stop list or are too short, or because the IndexNumbers * parameter is set to `off' in the database configuration file and * the words begin with a digit.

*

Words that could be in the database, but are not, are also * excluded, but in this case the phrase cannot of course be matched.

*

Words ending in * or ? are considered to be wildcards; * they are expanded automatically by LQT_MakeMatchesWhere, * or you can use LQT_ExpandWildCard to iterate over all the matches. *

You can use LQT_NumberOfWordsInPhrase on the returned result, * if it is not NULL, to determine the number of words in the string * that were recognised as words that are in the database.

*

The result of LQT_StringToPhrase can be passed to * LQT_MakeMatches to find all occurrences of the phrase in the * database.

 *
 * Returns: the created t_Phrase, or NULL if either an error occurred or
 * there were no recognised words in the given String.
 *
 * See also: LQT_MakeMatchesWhere, LQT_DestroyPhrase
 *
*/
/*
 * Implementation notes for LQT_StringToPhrase:
 *
 * db      the database handle used for all word and WID lookups
 * String  the NUL-terminated phrase text; it is copied, not retained
 *
 * Returns a t_Phrase allocated with emalloc(), or NULL (after freeing the
 * partial result) if no word from String ended up in the phrase.
 * Words with no WID, no t_WordInfo, or no recorded word places are left
 * out of the phrase and counted in Result->HasUnknownWords.
 */
API t_Phrase *
LQT_StringToPhrase(db, String)
    t_LQTEXT_Database *db;
    char *String;
{
    t_Phrase *Result;
    t_PhraseItem **ThisWord;
    t_WordInfo *LastWord = 0;
    t_WordInfo *WordInfo;
    char *q; /* not register, we take its address */
    char *Start = 0;

    Result = (t_Phrase *) emalloc("LQT_StringToPhrase result", sizeof(t_Phrase));
    Result->Next = (t_Phrase *) 0;
    Result->HasUnknownWords = 0;
    /* ThisWord always points at the link where the next item will go */
    *(ThisWord = &Result->Words) = (t_PhraseItem *) 0;

    /* initialise ReadWord(): */
    (void) LQT_ReadWordFromStringPointer(db, (char **) 0, (char **) 0, 0, 0);

    /* March along the supplied phrase one word at a time.
     *
     * Within the loop,
     * WordInfo points to the most recently read word, or is NULL at the
     *     end of the string;
     * LastWord is 0 the first time round, and subsequently points to the
     *     previous word that was read from the string.
     *
     * A word is only added once the read *after* it has returned, so it
     * is always LastWord, never WordInfo, that gets processed.
     * NOTE(review): presumably the following read can still update the
     * previous word's WordPlace flags -- confirm against
     * LQT_ReadWordFromStringPointer.
     */
    for (q = String; /*LOTSOFTIMES*/; LastWord = WordInfo) {
        t_WordInfo *W;

        WordInfo = LQT_ReadWordFromStringPointer(
            db,
            &q,
            &Start,
            (char *) 0, /* use trailing NUL for the end of the string */
            LQT_READWORD_IGNORE_COMMON
        );

        if (LastWord) {
            /* Every word, including the final one, goes through this
             * same path, so the checks cannot drift apart.
             */
            LastWord->WID = LQT_WordToWID(db, LastWord->Word, LastWord->Length);

            if (LastWord->WID == 0) {
                Result->HasUnknownWords++;
            } else {
                W = LQT_WIDToWordInfo(db, LastWord->WID);

                if (W == (t_WordInfo *) 0 || W->NumberOfWordPlaces == 0) {
                    /* The word is in the vocabulary but does not
                     * actually occur in the database; treat it like
                     * any other unknown word.
                     */
#ifdef ASCIITRACE
                    LQT_Trace(LQTRACE_MAKE_PHRASE,
                        "Unknown word in phrase: %s",
                        LastWord->Word
                    );
#endif
                    Result->HasUnknownWords++;
                } else {
                    int Flags = W->Flags;

                    *ThisWord = (t_PhraseItem *) emalloc("Phrase Item", sizeof(t_PhraseItem));

                    /* Take the position data as read from the string,
                     * then merge in the flags stored with the word.
                     */
                    W->WordPlace = LastWord->WordPlace; /* struct copy */
                    W->WordPlace.Flags |= Flags;

#ifdef ASCIITRACE
                    if (LQT_TraceFlagsSet(LQTRACE_MAKE_PHRASE)) {
                        LQT_Trace(LQTRACE_MAKE_PHRASE,
                            "Word %s --> %s, %lu matches",
                            LastWord->Word,
                            LQT_GenerateWordFromRoot(db, W, W->WordPlace.Flags),
                            W->NumberOfWordPlaces
                        );
                        LQT_fprintWordInfo(db, stderr, W, "LQT_StringToPhrase");
                    }
#endif
                    /* point to the new space */
                    (*ThisWord)->Word = W;
                    /* NOTE(review): Start was last written by the read
                     * that followed LastWord -- confirm this is the
                     * value WordStart is meant to hold.
                     */
                    (*ThisWord)->WordStart = Start;
                    (*ThisWord)->Next = (t_PhraseItem *) 0;
                    (*ThisWord)->SearchIndex = 0L;
                    ThisWord = &(*ThisWord)->Next;
                }
            }
        }

        if (!WordInfo) {
            /* We have reached the end of the string. */
            break;
        }
    } /* for */

    if (ThisWord == &Result->Words) {
        /* There were no words in the phrase!
         * Result->OriginalString has not been set yet, so report the
         * caller's String directly.
         */
        LQT_Trace(LQTRACE_MAKE_PHRASE|LQTRACE_MATCH_PHRASE,
            "phrase: \"%s\": no words were recognised",
            String
        );
        efree((char *) Result);
        return (t_Phrase *) 0;
    }

    Result->OriginalString = emalloc("PhraseOriginalString", strlen(String) + 1);
    (void) strcpy(Result->OriginalString, String);

    /* Originally, lq-text used to run the input filter and construct
     * a "canonical" string, so that you could look at it in a
     * user interface.
     * Probably all vestiges of this should be dropped, but
     * I have retained it for now, for backward compatibility.
     */
    Result->ModifiedString = emalloc("PhraseModifiedString", strlen(String) + 1);
    (void) strcpy(Result->ModifiedString, String);

    Result->NumberOfMatches = 0;
    Result->Matches = (t_MatchList *) 0;

    if (LQT_TraceFlagsSet(LQTRACE_MAKE_PHRASE|LQTRACE_MATCH_PHRASE)) {
        char *canonical;

        LQT_Trace(LQTRACE_MAKE_PHRASE|LQTRACE_MATCH_PHRASE,
            "phrase: \"%s\"",
            Result->OriginalString
        );
        canonical = LQT_PhraseToString(db, Result);
        LQT_Trace(LQTRACE_MAKE_PHRASE|LQTRACE_MATCH_PHRASE,
            "interpreted as: \"%s\"",
            canonical
        );
        efree(canonical);
    }

    return Result;
}