/* lqrank.c -- Copyright 1983, Liam R. E. Quin. All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. * * $Id: lqrank.c,v 1.17 1996/07/19 21:40:11 lee Exp lee $ */ #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include /* stderr, also for fileinfo.h */ #include #ifdef HAVE_STRING_H # include #else # include #endif #include #include "sys/stat.h" #ifdef HAVE_FCNTL_H # include #endif #ifdef HAVE_UNISTD_H # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #include /* for memset */ #include "emalloc.h" #include "fileinfo.h" #include "wordinfo.h" #include "phrase.h" #include "lqutil.h" #include "liblqtext.h" #include "lqtrace.h" typedef struct s_OneMatch { long BlockInFile; unsigned long WordInBlock; int WordsInPhrase; int WhichResultSet; struct s_OneMatch *Next; /* todo: use an array for speed */ } t_OneMatch; typedef struct s_MatchesForOneDocument { char *Name; t_FID FID; t_OneMatch *Matches; long TotalHitCount; struct s_MatchesForOneDocument *Next; } t_MatchesForOneDocument; static t_MatchesForOneDocument *Results = 0; static long ResultSetCount = 0; typedef enum { e_Any, e_Most, e_All } t_RankType; /** System calls and functions... **/ /** Unix system calls used in this file: **/ /** Unix Library Functions used: **/ /** lqtext library functions: **/ /** functions used before they're defined within this file: **/ /* (see below) */ static char *Revision = "@(#) $Id: lqrank.c,v 1.17 1996/07/19 21:40:11 lee Exp lee $"; char *progname = "lqrank"; static t_RankType RankType = e_All; PRIVATE void SetRankingMode( #ifdef HAVE_PROTO char *What #endif ); PRIVATE void PrintResults(FUNCTION_WITHOUT_ARGUMENTS); PRIVATE int AddResultSetName( #ifdef HAVE_PROTO t_LQTEXT_Database *, char *Name #endif ); PRIVATE void AddResultSet( #ifdef HAVE_PROTO t_LQTEXT_Database *, FILE *f, char *Name #endif ); typedef enum { QM_PHRASE, QM_QUERY } t_QueryMode; PRIVATE void AddPhrase( #ifdef HAVE_PROTO t_LQTEXT_Database *, char *Phrase, t_QueryMode QueryMode #endif ); PRIVATE void AddOneMatch( #ifdef HAVE_PROTO long WhichResult, char *FileName, t_FID FID, t_OneMatch *mp #endif ); PRIVATE void ReadMatchesFromFile( #ifdef HAVE_PROTO t_LQTEXT_Database *db, long WhichResultSet, FILE *f, char *Name #endif ); API t_Phrase *LQT_QueryToPhraseKludge( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_LQT_Query *Query #endif ); PRIVATE void SortDocumentsByNumberOfHits(FUNCTION_WITHOUT_ARGUMENTS); typedef enum { PM_FREE, PM_NOFREE, PM_FREE_ALL } t_PM_WhetherToFree; PRIVATE void PrintMatchesInAnyFiles(FUNCTION_WITHOUT_ARGUMENTS); PRIVATE void PrintMatchesInMostFiles(FUNCTION_WITHOUT_ARGUMENTS); PRIVATE void PrintMatchesInAllFiles( #ifdef HAVE_PROTO int MustAppearInAtLeastThisManyResultSets, t_PM_WhetherToFree WhetherToFree #endif ); static char SortFlag = 1; /* default is to sort by filename within a rank */ static char SortByHits = 0; static char OneOnlyFlag = 0; /* print only one match per document */ static char OnlyOnePhraseAnywaySoWhyBother = 0; static char CountMatchesInEachDocument = 0; /** **/ int main(argc, argv) int argc; char *argv[]; { extern int optind, getopt(); extern char *optarg; int ch; int ErrorFlag = 0; t_LQTEXT_Database *db = 0; t_lqdbOptions *Options; t_QueryMode QueryMode = QM_PHRASE; progname = argv[0]; Options = LQT_InitFromArgv(argc, argv); while ((ch = getopt(argc, argv, "Zz:cf:hoPp:Qq:r:uxVv")) != EOF) { switch (ch) { case 'z': case 'Z': break; /* done by LQT_InitFromArgv(); */ case 'V': fprintf(stderr, "%s version %s\n", progname, Revision); break; case 'c': CountMatchesInEachDocument = 1; break; case 'f': if (!db) { db = LQT_OpenDatabase(Options, O_RDONLY, 0); if (!db || LQT_ObtainReadOnlyAccess(db) < 0) { Error(E_FATAL, "couldn't open lq-text database"); } } if (AddResultSetName(db, optarg) < 0) { exit(1); } break; case 'h': SortByHits = 1; break; case 'o': OneOnlyFlag = 1; break; case 'P': QueryMode = QM_PHRASE; break; case 'p': if (!db) { db = LQT_OpenDatabase(Options, O_RDONLY, 0); if (!db || LQT_ObtainReadOnlyAccess(db) < 0) { Error(E_FATAL, "couldn't open lq-text database"); } } AddPhrase(db, optarg, QM_PHRASE); break; case 'Q': QueryMode = QM_QUERY; break; case 'q': if (!db) { db = LQT_OpenDatabase(Options, O_RDONLY, 0); if (!db || LQT_ObtainReadOnlyAccess(db) < 0) { Error(E_FATAL, "couldn't open lq-text database"); } } AddPhrase(db, optarg, QM_QUERY); break; case 'r': SetRankingMode(optarg); break; case 'u': SortFlag = 0; break; case 'x': ErrorFlag = (-1); break; case '?': ErrorFlag = 1; } } /* Normally put call to lrqError here to give a helpful message, * but not yet ready to ship the error handling package, sorry */ if (ErrorFlag) { fprintf(stderr, "Usage: %s [options] [terms]\n", progname); fprintf(stderr, "%s: options are:\n", progname); fputs(" -c -- count matches in each document\n", stderr); fputs(" -f file -- file contains matches one per line\n", stderr); fputs(" -h -- sort documents by number of hits\n", stderr); fputs(" -o -- print only one match from each document\n", stderr); fputs(" -P -- subsequent terms are phrases\n", stderr); fputs(" -p phr -- look up phr as a phrase with lqphrase(1)\n", stderr); fputs(" -Q -- subsquent terms are queries\n", stderr); fputs(" -q qry -- look up qry as a phrase with lqquery(1)\n", stderr); fputs(" -r rank -- set ranking level for subsequent files\n", stderr); fprintf(stderr, "\trank can be none, most or all.\n"); fputs(" -u -- produce unsorted output\n", stderr); LQT_PrintDefaultUsage(Options); fputs("Warning: all -f options should normally come last!\n", stderr); exit( ErrorFlag > 0 ? 1 : 0); /* 0 means -x was used */ } if (!db) { db = LQT_OpenDatabase(Options, O_RDONLY, 0); if (!db || LQT_ObtainReadOnlyAccess(db) < 0) { Error(E_FATAL, "couldn't open lq-text database"); } } if (argc - optind == 1 && !STREQ(argv[optind], "-")) { if (!SortByHits && !CountMatchesInEachDocument) { OnlyOnePhraseAnywaySoWhyBother = 1; (void) AddPhrase(db, argv[optind], QueryMode); exit(0); } } if (optind < argc) { for (; optind < argc; ++optind) { if (STREQ(argv[optind], "-")) { char *theLine = 0; while (LQU_fReadLine(stdin, &theLine, LQUF_IGNBLANKS|LQUF_IGNSPACES|LQUF_IGNHASH) >= 0) { AddPhrase(db, theLine, QueryMode); theLine = 0; } } else { (void) AddPhrase(db, argv[optind], QueryMode); } } } if (ResultSetCount) { PrintResults(); exit(0); } return 1; } PRIVATE int AddResultSetName(db, Name) t_LQTEXT_Database *db; char *Name; { FILE *f; if (STREQ(Name, "-")) { AddResultSet(db, stdin, "(standard input)"); return 1; } /* The return value is the number of files indexed. */ if (!LQU_IsFile(Name)) { if (LQU_IsDir(Name)) { Error(E_WARN, "%s is a directory, not a file", Name); return 0; } Error(E_WARN, "%s is not a regular file...", Name); return 0; } f = LQU_fEopen(E_FATAL, Name, "match-list file", "r"); if (!f) { /*NOTREACHED*/ return -1; } AddResultSet(db, f, Name); fclose(f); return 1; } PRIVATE void SetRankingMode(What) char *What; { if (STREQ(What, "none") || STREQ(What, "or")) { RankType = e_Any; } else if (STREQ(What, "most") || STREQ(What, "quorum")) { RankType = e_Most; } else if (STREQ(What, "all") || STREQ(What, "and")) { RankType = e_All; } else { Error(E_FATAL|E_XHINT, "-r: option must be \"none\", \"most\" or \"all\", not \"%s\"", What ? What : "" ); } } PRIVATE void AddResultSet(db, f, Name) t_LQTEXT_Database *db; FILE *f; char *Name; { ++ResultSetCount; ReadMatchesFromFile(db, ResultSetCount, f, Name); } PRIVATE void PrintOneMatch(Mp, mp) t_MatchesForOneDocument *Mp; t_OneMatch *mp; { printf("%d %ld %ld %lu %s\n", mp->WordsInPhrase, mp->BlockInFile, mp->WordInBlock, Mp->FID, Mp->Name ); } PRIVATE void fPrintOneMatch(stream, Mp, mp) FILE *stream; t_MatchesForOneDocument *Mp; t_OneMatch *mp; { fprintf(stream, "[set=%d] %d %ld %ld %lu %s\n", mp->WhichResultSet, mp->WordsInPhrase, mp->BlockInFile, mp->WordInBlock, Mp->FID, Mp->Name ); } PRIVATE void AddOneMatch(WhichResult, FileName, FID, mp) long WhichResult; char *FileName; t_FID FID; t_OneMatch *mp; { static t_MatchesForOneDocument **Mpp = 0; static t_OneMatch **mpp = 0; static long LastResult = -1; if (!Results || WhichResult != LastResult) { LastResult = WhichResult; Mpp = &Results; mpp = 0; } /* find the right document */ for (; *Mpp; Mpp = &(*Mpp)->Next) { if ((*Mpp)->FID >= FID) { break; } mpp = 0; } if (!*Mpp || FID != (*Mpp)->FID) { /* Not found in the list of documents, so make * a new document... and insert it before *Mpp. */ t_MatchesForOneDocument *M = (*Mpp); *Mpp = (t_MatchesForOneDocument *) emalloc( "lqrank:AddMatch.Match", sizeof(t_MatchesForOneDocument) ); (*Mpp)->Next = M; (*Mpp)->Name = emalloc( "AddMatch.FileName", (unsigned int) (strlen(FileName) + 1) ); (void) strcpy((*Mpp)->Name, FileName); (*Mpp)->FID = FID; (*Mpp)->TotalHitCount = 0; (*Mpp)->Matches = 0; } /* The document has one more hit; do this before checking to see * whether the hit has already occurred; see comment below */ (*Mpp)->TotalHitCount++; /* Insert the match in sorted order */ if (!mpp) { /* mpp is static so that in the usual case of appending * to the list,everything is OK. This relies on the matches * in our input being sorted within each result set, of course. */ mpp = &(*Mpp)->Matches; } for (; *mpp; mpp = &(*mpp)->Next) { if ((*mpp)->BlockInFile > mp->BlockInFile) break; if ((*mpp)->BlockInFile == mp->BlockInFile) { if ((*mpp)->WordInBlock > mp->WordInBlock) break; if ((*mpp)->WordInBlock == mp->WordInBlock) { /* It's already there! * no need to do anything. * Question: should we increment hit count? * I say yes, as this means you can affect ranking * by using the same results twice. See comment above. * * Note that if we are requiring matches to appear in * every set, we have to add the match. Same if we're * sorting by number of matches... */ if (RankType == e_Any) { efree((char *) mp); return; } else { /* we are about to insert a duplicate, so * make sure that it does not get printed */ mp->WordsInPhrase = 0; } } } } /* insert before *mpp */ mp->Next = (*mpp); *mpp = mp; /* finished! */ } PRIVATE void ReadMatchesFromFile(db, WhichResultSet, f, Name) t_LQTEXT_Database *db; long WhichResultSet; FILE *f; char *Name; { t_OneMatch *Mp; char *theLine; for (;;) { register char *p; t_FID FID; char *FileName; if (LQU_fReadLine(f, &theLine, LQUF_IGNBLANKS|LQUF_IGNSPACES|LQUF_IGNHASH ) < 0) { if (theLine) efree(theLine); return; } if (RankType == e_Any && !SortFlag) { puts(theLine); continue; } /* LQU_fReadLine has swallowed any blank or commented lines, * and has deleted any leading spaces. */ Mp = (t_OneMatch *) emalloc( "lqrank:ReadMatchesFromFile.t_OneMatch", sizeof(t_OneMatch) ); Mp->Next = (t_OneMatch *) 0; Mp->BlockInFile = 0; Mp->WordInBlock = 0; Mp->WordsInPhrase = 0; Mp->WhichResultSet = ResultSetCount; /* Now the filename; don't use sscanf, for speed; * TODO: use LQT_StringToMatch() ? */ p = theLine; while (isspace(*p)) { p++; } for (; isdigit(*p); p++) { Mp->WordsInPhrase *= 10; Mp->WordsInPhrase += *p - '0'; } while (isspace(*p)) { p++; } if (!isdigit(*p)) { Error(E_WARN, "File %s contains bad match, expected digit in %s", theLine ); return; } for (; isdigit(*p); p++) { Mp->BlockInFile *= 10; Mp->BlockInFile += *p - '0'; } while (isspace(*p)) { p++; } if (!isdigit(*p)) { Error(E_WARN, "File %s conatins strange match, expected digit in %s", theLine ); return; } for (; isdigit(*p); p++) { Mp->WordInBlock *= 10; Mp->WordInBlock += *p - '0'; } while (isspace(*p)) { p++; } /* FID */ FID = 0L; while (isdigit(*p)) { FID *= 10; FID += *p - '0'; p++; } while (isspace(*p)) { p++; } if (!*p) { static char *theName = 0; static t_FID theFID = 0L; t_FileInfo *FileInfo; if (FID == theFID) { FileName = theName; } else { if (theName) { efree(theName); } FileInfo = LQT_FIDToFileInfo(db, FID); theFID = FID; theName = FileName = FileInfo->Name; (void) efree((char *) FileInfo); } } else { /* file name, already null-terminated */ FileName = p; } AddOneMatch( WhichResultSet, FileName, /* of the matched document */ FID, Mp ); } } PRIVATE void PrintResults(FUNCTION_WITHOUT_ARGUMENTS) { if (SortByHits) { SortDocumentsByNumberOfHits(); } switch (RankType) { case e_Any: PrintMatchesInAnyFiles(); return; case e_Most: PrintMatchesInMostFiles(); return; case e_All: PrintMatchesInAllFiles(ResultSetCount, PM_NOFREE); return; } } PRIVATE void PrintMatchesInMostFiles(FUNCTION_WITHOUT_ARGUMENTS) { int TermsToMatch; for (TermsToMatch = ResultSetCount; TermsToMatch > 0; --TermsToMatch) { PrintMatchesInAllFiles(TermsToMatch, PM_NOFREE); /* XXX */ /* I have used NOFREE here for speed */ } } PRIVATE void PrintMatchesInAnyFiles(FUNCTION_WITHOUT_ARGUMENTS) { /* for each file */ t_MatchesForOneDocument *Mp; for (Mp = Results; Mp; Mp = Mp->Next) { register t_OneMatch *mp; if (CountMatchesInEachDocument) { long total = 0; for (mp = Mp->Matches; mp; mp = mp->Next) { if (mp->WordsInPhrase) { ++total; } } printf("{ DocumentMatches = %ld }\n", total); } for (mp = Mp->Matches; mp; mp = mp->Next) { if (mp->WordsInPhrase) { PrintOneMatch(Mp, mp); if (OneOnlyFlag) break; } } } } PRIVATE int CompareSortElements(M1, M2) t_MatchesForOneDocument **M1, **M2; { /* sort in descending order of number of hits... */ return (*M2)->TotalHitCount - (*M1)->TotalHitCount; } PRIVATE void SortDocumentsByNumberOfHits(FUNCTION_WITHOUT_ARGUMENTS) { /* Sort the linked list of files. * This list may be quite long. The simplest way seems to be * to put it in an array and then reconstruct it from scratch! */ t_MatchesForOneDocument **SortMe; unsigned long DocumentCount; register t_MatchesForOneDocument *Mp; for (DocumentCount = 0, Mp = Results; Mp; Mp = Mp->Next) { ++DocumentCount; } SortMe = (t_MatchesForOneDocument **) emalloc("SortMe", sizeof(t_MatchesForOneDocument *) * (DocumentCount + 1) ); for (DocumentCount = 0, Mp = Results; Mp; Mp = Mp->Next) { SortMe[DocumentCount++] = Mp; } (void) qsort( SortMe, DocumentCount, sizeof(t_MatchesForOneDocument *), CompareSortElements ); { int i; SortMe[DocumentCount] = 0; /* a sentinel */ for (i = 0; i < DocumentCount; i++) { SortMe[i]->Next = SortMe[i + 1]; } Results = SortMe[0]; } efree((char *) SortMe); } PRIVATE void PrintMatchesInAllFiles( MustAppearInAtLeastThisManyResultSets, WhetherToFree ) int MustAppearInAtLeastThisManyResultSets; t_PM_WhetherToFree WhetherToFree; { register t_MatchesForOneDocument **Mp; t_MatchesForOneDocument **NextDocument = 0; char *ResultSets; ResultSets = (char *) emalloc("ResultSet", (unsigned) ResultSetCount + 1); /* the +1 is because sets are numbered from 1 */ /* For each document in the list of matches: */ for (Mp = &Results; *Mp; Mp = NextDocument) { register t_OneMatch *mp; int ResultCount; /* prepare to count which result sets are represented: */ memset(ResultSets, '\0', ResultSetCount + 1); ResultCount = 0; /* find which result sets are represented */ for (mp = (*Mp)->Matches; mp; mp = mp->Next) { if (ResultSets[mp->WhichResultSet] == 0) { ResultSets[mp->WhichResultSet] = 1; if (++ResultCount >= MustAppearInAtLeastThisManyResultSets) { /* they are already all accounted for */ break; } } } NextDocument = &(*Mp)->Next; /* if the document meets the requirements, print the matches */ if (ResultCount >= MustAppearInAtLeastThisManyResultSets) { if (CountMatchesInEachDocument) { /* count the matches first */ long total = 0; for (mp = (*Mp)->Matches; mp; mp = mp->Next) { if (mp->WordsInPhrase) { ++total; } } printf("{ DocumentMatches = %ld }\n", total); } for (mp = (*Mp)->Matches; mp; mp = mp->Next) { if (mp->WordsInPhrase) { PrintOneMatch(*Mp, mp); if (OneOnlyFlag) break; } } /* Reclaim storage if appropriate */ if (WhetherToFree == PM_FREE_ALL) { t_OneMatch *Next = 0; for (mp = (*Mp)->Matches; mp; mp = Next) { Next = mp->Next; efree((char *) mp); mp = Next; } (*Mp)->Matches = 0; } /* Delete this item from the chain if appropriate; * this lets us call PrintMatchesInAllFiles() multiple * times; see PrintMatchesInMostFiles() for an example. * * Note that we don't normally use FREE_ALL, because * there is no need to call free() just before we exit. * If you're using Purify, though, you'll want to. */ if (WhetherToFree != PM_NOFREE) { t_MatchesForOneDocument *This = *Mp; /* delete the match from the chain */ *Mp = *NextDocument; /* reclaim storage, perhaps */ if (WhetherToFree == PM_FREE_ALL) { efree(This->Name); } efree((char *) This); } } } /* for */ } PRIVATE void AddPhrase(db, Phrase, QueryMode) t_LQTEXT_Database *db; char *Phrase; t_QueryMode QueryMode; { t_Phrase *P = 0; t_MatchList *Matches; t_FID LastFID = ~(t_FID) 0; t_FileInfo *FileInfo = 0; int NumberOfWords = 1; if (!Phrase || !*Phrase) return; switch (QueryMode) { case QM_PHRASE: { P = LQT_StringToPhrase(db, Phrase); break; } case QM_QUERY: { register char *p; int usePhrase = 1; /* If the search contains no wildcards, we can expand it * as a phrase directly -- this is much faster. */ for (p = Phrase; *p; p++) { if (*p == '*' || *p == '?') { /* found a wildcard */ usePhrase = 0; break; } } if (usePhrase) { P = LQT_StringToPhrase(db, Phrase); } else { t_LQT_Query *oneQuery; oneQuery = LQT_ParseQuery(db, Phrase); if (oneQuery) { P = LQT_QueryToPhraseKludge(db, oneQuery); } else { P = 0; } } break; } } if (P == (t_Phrase *) 0) { Error(E_WARN, "Query \"%s\" contained no recognised words - ignored", Phrase ); NumberOfWords = 0; } else { NumberOfWords = LQT_NumberOfWordsInPhrase(db, P); if (OnlyOnePhraseAnywaySoWhyBother) { (void) LQT_MakeMatchesWhere(db, P, LQT_PrintAndRejectOneMatch); return; /* no need to free stuff, we're about to exit */ } else { if (LQT_MakeMatches(db, P) < 0L) return; } } ++ResultSetCount; if (P) { for (Matches = P->Matches; Matches != (t_MatchList *) 0; Matches = Matches->Next) { t_OneMatch *Mp; if (Matches->Match != (t_Match *) 0) { if (Matches->Match->Where->FID != LastFID) { LastFID = Matches->Match->Where->FID; if (FileInfo) { LQT_DestroyFileInfo(db, FileInfo); } FileInfo = LQT_FIDToFileInfo(db, LastFID); } if (FileInfo == (t_FileInfo *) 0) { continue; } Mp = (t_OneMatch *) emalloc( "lqrank.AddFile.Matches", sizeof(t_OneMatch) ); Mp->BlockInFile = Matches->Match->Where->BlockInFile; Mp->WordInBlock = Matches->Match->Where->WordInBlock; Mp->WordsInPhrase = NumberOfWords; Mp->WhichResultSet = ResultSetCount; AddOneMatch(ResultSetCount, FileInfo->Name, LastFID, Mp); } } } }