/* lqsimilar.c -- Copyright 1994 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. * * $Id: lqsimilar.c,v 1.3 2001/05/31 03:50:13 liam Exp $ * * lqsimilar, part of Liam Quin's text retrieval package... * * Find documents that contain text similar to a given query. * Compare this with lq-text/src/lqtext/lqphrase.c for more insight * as to how to use the API. * */ #include "error.h" #include "globals.h" /* defines and declarations for database filenames */ #include /* stderr, also for fileinfo.h */ #include #ifdef HAVE_SYSV_FCNTL_H # include #endif #ifdef HAVE_FCNTL_H #include #endif #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #include "emalloc.h" /* for efree() */ #include "fileinfo.h" /* for wordinfo.h */ #include "wordinfo.h" #include "pblock.h" #include "phrase.h" #include "lqutil.h" #include "lqtrace.h" #include "liblqtext.h" /** functions used before they're defined within this file: **/ PRIVATE void MatchOnePhrase( #ifdef HAVE_PROTO t_LQTEXT_Database *db, char *Phrase #endif ); /** **/ static char *Revision = "@(#) $Id: lqsimilar.c,v 1.3 2001/05/31 03:50:13 liam Exp $"; char *progname = "$Revision: 1.3 $"; int SilentMode = 0; /* don't print matches if set to one */ static int MinimumNumberOfWordsInRankedPhrase = 1; int main(argc, argv) int argc; char *argv[]; { extern int optind, getopt(); extern char *optarg; int ch; int ErrorFlag = 0; char *InputFile = 0; t_lqdbOptions *Options; t_LQTEXT_Database *db; progname = argv[0]; Options = LQT_InitFromArgv(argc, argv); while ((ch = getopt(argc, argv, "Zz:af:hNpr:slxVv")) != EOF) { switch (ch) { case 'z': case 'Z': break; /* done by LQT_InitFromArgv(); */ case 'V': fprintf(stderr, "%s version %s\n", progname, Revision); break; case 'f': InputFile = optarg; break; case 'l': break; /* list mode is the default */ case 'r': /* ignored for compat. with lqrank*/ break; case 's': SilentMode = 1; break; case 'x': ErrorFlag = (-1); break; case '?': ErrorFlag = 1; } } if (ErrorFlag) { fprintf(stderr, "Usage: %s [options] \"phrase\" [...]\n", progname); fprintf(stderr, "%s: options are:\n", progname); fputs("\ -l -- list mode, suitable for lqshow (the default)\n\ -s -- silent mode; exit status indicates success of matching\n\ \n", stderr); LQT_PrintDefaultUsage(Options); exit( ErrorFlag > 0 ? 1 : 0); /* 0 means -x was used */ } db = LQT_OpenDatabase(Options, O_RDONLY, 0); if (!db || LQT_ObtainReadOnlyAccess(db) < 0) { Error(E_FATAL, "couldn't open lq-text database"); } if (InputFile) { FILE *f; char NeedClose = 1; char *theLine; Error(E_WARN, "program not yet written to handle -f properly!"); if (STREQ(InputFile, "-")) { f = stdin; NeedClose = 0; } else { f = LQU_fEopen(E_FATAL|E_SYS, InputFile, "List of phrases", "r"); } while (LQU_fReadLine(f, &theLine, LQUF_NORMAL) >= 0) { if (theLine && *theLine) { MatchOnePhrase(db, theLine); } } if (NeedClose) { (void) close(f); } } while (optind < argc) { MatchOnePhrase(db, argv[optind++]); } if (SilentMode) { /* if we got to here we didn't find anything */ exit(1); } return 0; } static int NumberOfWords = 1; typedef struct s_Rank { t_FID FID; unsigned long Sum; } t_Rank; PRIVATE void MatchOnePhrase(db, Phrase) t_LQTEXT_Database *db; char *Phrase; { t_Phrase *P; if (!Phrase || !*Phrase) { /* ignore an empty phrase */ return; } if ((P = LQT_StringToPhrase(db, Phrase)) == (t_Phrase *) 0) return; /* Count words in the phrase: */ /* NumberOfWords is global to this file, so that the * callback PrintAndAcceptOneMatch() can use it: */ NumberOfWords = LQT_NumberOfWordsInPhrase(db, P); if (NumberOfWords < MinimumNumberOfWordsInRankedPhrase) { Error(E_WARN, "Phrase doesn't contain enough indexed words to rank: \"%s\"", Phrase ); return; } { t_PhraseElement *tp; long matchCount = 0; long RanksAllocated = 30; int n; long Count; t_Rank *Ranks = (t_Rank *) emalloc( "Ranking array", sizeof(t_Rank) * RanksAllocated ); tp = LQT_AllPhrasesOfLengthNOrMore( db, MinimumNumberOfWordsInRankedPhrase, Phrase, &Count ); if (tp) { t_PhraseElement *PEP; for (PEP = tp; PEP - tp < Count; PEP++) { t_MatchList *Matches; NumberOfWords = 0; for (Matches = PEP->Phrase->Matches; Matches != (t_MatchList *) 0; Matches = Matches->Next ) { if (Matches->Match == (t_Match *) 0) { /* this happens because of a bug in * LQT_MakeMatchesWhere(), which should * be fixed. */ continue; } for (n = 0; n < matchCount; n++) { if (Matches->Match->Where->FID == Ranks[n].FID) { break; } } if (n == matchCount) { if (n >= RanksAllocated) { /* allocate increasingly large chunks: */ RanksAllocated += (RanksAllocated/5) + 1; Ranks = (t_Rank *) erealloc( (char *) Ranks, sizeof(t_Rank) * RanksAllocated ); } ++matchCount; Ranks[n].FID = Matches->Match->Where->FID; Ranks[n].Sum = 0L; } if (!NumberOfWords) { NumberOfWords = LQT_NumberOfWordsInPhrase(db, P); } Ranks[n].Sum += NumberOfWords * NumberOfWords; } } for (n = 0; n < matchCount; n++) { printf("# FID %ld rank %ld\n", Ranks[n].FID, Ranks[n].Sum ); } for (PEP = tp; PEP - tp < Count; PEP++) { t_MatchList *Matches; static t_FileInfo *FileInfo = 0; #if 0 printf("# %d: %*.*s\n", PEP - tp, PEP->PhraseEnd - PEP->PhraseStart, PEP->PhraseEnd - PEP->PhraseStart, PEP->PhraseStart ); #endif NumberOfWords = LQT_NumberOfWordsInPhrase(db, PEP->Phrase); for (Matches = PEP->Phrase->Matches; Matches != (t_MatchList *) 0; Matches = Matches->Next ) { if (Matches->Match == (t_Match *) 0) { /* this happens because of a bug in * LQT_MakeMatchesWhere(), which should * be fixed. */ continue; } if (!FileInfo || FileInfo->FID != Matches->Match->Where->FID ) { if (FileInfo) { LQT_DestroyFileInfo(db, FileInfo); } FileInfo = LQT_FIDToFileInfo( db, Matches->Match->Where->FID ); } printf("%d %lu %lu %lu %s\n", NumberOfWords, Matches->Match->Where->BlockInFile, Matches->Match->Where->WordInBlock, FileInfo->FID, FileInfo->Name ); } } } else { printf("# no match for %s!\n", Phrase); } } return; }