/* lqword.c -- Copyright 1989, 1994, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. * * lqword -- simple program to print information about individual words. * * $Id: lqword.c,v 2.27 1996/08/14 17:03:25 lee Exp lee $ */ #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include #ifdef HAVE_SYSV_FCNTL_H # include # include #endif #ifdef HAVE_FCNTL_H #include /* for fileinfo.h */ #endif #include #ifdef HAVE_UNISTD_H # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_LIMITS_H # include /* for USI_MAX, the largest unsigned integer. */ #endif #ifndef USI_MAX # define USI_MAX ((unsigned int) -1) #endif #include "fileinfo.h" #include "wordinfo.h" #include "smalldb.h" #include "pblock.h" #include "wordrules.h" #include "emalloc.h" #include "lqutil.h" #include "liblqtext.h" /*** Declarations: ***/ /** System calls and library routines: **/ /** System calls: **/ /** Unix Library Functions: **/ #ifndef tolower extern int tolower( #ifdef HAVE_PROTO int c #endif ); #endif /** functions defined within this file: */ PRIVATE void PrintWIDInfo( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_WID WID #endif ); PRIVATE void PrintWordInfo( #ifdef HAVE_PROTO t_LQTEXT_Database *db, char *Word #endif ); PRIVATE void AllWordInfo( #ifdef HAVE_PROTO t_LQTEXT_Database *db, int Verbose #endif ); PRIVATE void DisplayWordInfo( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_WordInfo *WordInfo, int Verbose #endif ); PRIVATE void ShowWordList( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_WordInfo *WordInfo #endif ); PRIVATE void dbmmarch( #ifdef HAVE_PROTO t_LQTEXT_Database *db #endif ); /** Macros and variable definitions **/ #define DISPLAY_ALL 1 #define DISPLAY_NAME 2 /* These are the possible DisplayMode values -- see main() */ char *progname = 0; /* Used for error messages */ int SilentMode = 0; /* Set if we were invoked with the -s option. In this mode, we behave * like grep -s, and exit with a zero exit status if one or more of * the words were found in the database. */ int ListMode = 0; /* Set if we are to provide a terser output format suitable for use * with lqshow(1L). */ int DoNames = 1; int CountFiles = 0; static char *Revision = "lqword 2.2"; /** end of declarations... **/ int main(argc, argv) int argc; char *argv[]; { extern int optind, getopt(); /* For getopt(3) */ extern char *optarg; /* For getopt(3) */ int ch; /* For getopt(3) */ int ErrorFlag = 0; /* For getopt(3) */ int DisplayMode = 0; /* DisplayMode indicates what kind of information we are to * print in response to queries. The values understood are * the DISPLAY_* constants. Perhaps this should be an enum. */ t_LQTEXT_Database *db; /* a pointer to the actual database... */ t_lqdbOptions *Options; /* The options from the preferences file are read and returned * when we deal with command-line options. */ int OpenFlags = O_RDONLY; int OpenPermission = 0; /* we won't try and create anything! */ t_WID WIDtoFind = 0; progname = argv[0]; /* I see this as a library program, so I am leaving the full * path. lqaddfile(1L) and lqphrase(1L) set progname to be * the filename of the command, rather than the full pathname. */ Options = LQT_InitFromArgv(argc, argv); /* Deal with any arguments that are understood by all lqtext * programs. */ while ((ch = getopt(argc, argv, "aACD:lNsVW:xZz:")) != EOF) { switch (ch) { case 'a': DisplayMode = DISPLAY_NAME; break; case 'A': DisplayMode = DISPLAY_ALL; break; case 'C': CountFiles = 1; break; case 'D': OpenFlags |= O_RDWR; break; case 'l': ListMode = 1; break; case 'N': DoNames = 0; break; case 's': SilentMode = 1; break; case 'V': fprintf(stderr, "%s version %s\n", progname, Revision); break; case 'W': WIDtoFind = atol(optarg); break; case 'x': ErrorFlag++; break; case '?': ErrorFlag++; break; case 'z': case 'Z': break; /* done by LQT_InitFromArgv(); */ } } /* Normally put call to lrqError here to give a helpful message, * but not yet ready to ship the error handling package, sorry */ if (ErrorFlag) { fprintf(stderr, "%s: options are:\n", progname); fputs("\ -D -- delete the given words from the database (!)\n\ -l -- list mode, for use with lqshow\n\ -s -- silent mode (like grep -s)\n\ -W WID -- print information for the given WID\n", stderr); LQT_PrintDefaultUsage(Options); fputs("\n\ In addition, if no words are given, the following are understood:\n\ -a -- print all words\n\ -A -- print all matches to all words\n", stderr); exit(1); } db = LQT_OpenDatabase(Options, OpenFlags, OpenPermission); if (!db) { Error(E_FATAL, "unable to open lq-text database in directory \"%s\"", LQT_GetOption(Options, "directory") ); } if (optind >= argc) { if (SilentMode) { /* if there were no words given, none of them matched. * It could be argued that this case should be an error. */ exit(1); } if (DisplayMode) { if (!SilentMode && !ListMode) { /* Print some pretty headers */ int i; if (CountFiles) { printf("WID\t%-*s\tmatches\tfiles\n", db->MaxWordLength + 2, "Word" ); } else { for (i = 0; i < db->MaxWordLength; i++) { putchar('='); } printf("+=====+===+====+===+======="); if (DoNames) { printf(" ===================="); } putchar('\n'); for (i = 4; i < db->MaxWordLength; i++) { putchar(' '); } /* "+=====+===+====+===+======+====" */ printf("Root|Block|WIB|Flag|Sep|FileID|Type"); if (DoNames) { printf("File Name"); } putchar('\n'); } } AllWordInfo(db, DisplayMode); } else { /* In this case, there were no command-line options and no * display-mode flags, so we do the default thing. * This happens to be to print every word in the database. * This is probably bogus behaviour -- there should be a better * way of finding words that match a given pattern than using * lqword | grep * which is what this allows. */ dbmmarch(db); } } else { if (!SilentMode && !ListMode) { /* Print some pretty headers */ int i; if (CountFiles) { printf("WID\t%-*s\tmatches\tfiles\n", db->MaxWordLength + 2, "Word" ); } else { for (i = 0; i < db->MaxWordLength; i++) { putchar('='); } printf("+=====+===+====+===+======="); if (DoNames) { printf("+===================="); } putchar('\n'); for (i = 4; i < db->MaxWordLength; i++) { putchar(' '); } /* "+=====+===+====+===+======+====" */ printf("Root|Block|WIB|Flag|Sep FileID|Type"); if (DoNames) { printf("File Name"); } putchar('\n'); } } if (WIDtoFind) { printf("Information for WID %ld:\n", WIDtoFind); PrintWIDInfo(db, WIDtoFind); } while (optind < argc) { PrintWordInfo(db, argv[optind++]); } } LQT_CloseDatabase(db); exit(SilentMode); /* 0 or 1 (this is a little devious) */ /*NOTREACHED*/ return 1; /* this is for versions of lint and gcc that don't understand * that exit() doesn't return -- or, if it does, that there is * nothing that can be done about it! */ } PRIVATE void PrintWIDInfo(db, WID) t_LQTEXT_Database *db; t_WID WID; { t_WordInfo *WordInfo; if ((WordInfo = LQT_WIDToWordInfo(db, WID)) == (t_WordInfo *) 0) { if (!SilentMode) { /* In this case the word is in the database (since it has * a non-zero WID), but not in the word index. This might * happen if the word is being deleted (or added) by someone * else at this very moment, or if the database is corrupt. */ Error(E_WARN, "No index information for: WID %lu"); } return; } if (SilentMode && WordInfo->NumberOfWordPlaces > 0) { /* We found something, so there is no point looking further -- * we already know enough to exit. If a lot of words are given, * this could be a big efficiency win. */ exit(0); } /** Now we have the database entry for the word, so let's print it! **/ DisplayWordInfo(db, WordInfo, DISPLAY_ALL); if (WordInfo) { LQT_DestroyWordInfo(db, WordInfo); } } PRIVATE void PrintWordInfo(db, Word) t_LQTEXT_Database *db; char *Word; { register char *p; char *q; t_WID WID; t_WordInfo Root; static char *LongerWord = 0; Root.WordPlace.Flags = 0; if (!LongerWord) { LongerWord = emalloc("PrintWordInfo", db->MaxWordLength + 10); } /** Find the canonical form of the word, with plurals reduced to the ** singular and letters folded into lower case. **/ /* First, remember if the word originally started with an upper case * letter: */ if (isupper(*Word)) { Root.WordPlace.Flags |= WPF_UPPERCASE; } /* now convert to lower case and measure its length at the same time: */ for (q = LongerWord, p = Word; *p; q++, p++) { *q = isupper(*p) ? tolower(*p) : *p; } *q = '\0'; Root.Length = p - Word; Root.Word = LongerWord; /* Now call LQT_ReduceWordToRoot() to find the canonical form: */ Word = LQT_WORDROOT(db, &Root); /** Now see if the canonical word is too common to list: **/ if (LQT_WordIsInStopList(db, &Root)) { /* It is listed in the common word list, so don't bother looking * it up at all */ if (!SilentMode) { Error(E_WARN, "No index information for: %s (too common)", Word); } return; } /** It is not too common, so look it up: **/ if ((WID = LQT_WordToWID(db, Word, Root.Length)) == (t_WID) 0) { /* In this case the word is neither listed as common nor * found in the database. Either it was spelt differently * there or it isn't there at all. */ Error(E_WARN, "No index information for: %s", Word); } else { PrintWIDInfo(db, WID); } } /* DisplayWordInfo() -- print information about a single word */ PRIVATE void DisplayWordInfo(db, WordInfo, Verbose) t_LQTEXT_Database *db; t_WordInfo *WordInfo; int Verbose; { char *Buf = emalloc("lqword:WordBuf", WordInfo->Length + 1); /* Words in a t_WordInfo might not be null terminated, since the * storage overhead and the work of putting the nulls there turns out to * be significant... */ (void) strncpy(Buf, WordInfo->Word, WordInfo->Length); Buf[WordInfo->Length] = '\0'; if (CountFiles) { printf("%lu\t%-*s\t%ld\t", (unsigned long) WordInfo->WID, db->MaxWordLength + 2, WordInfo->Word, (unsigned long) WordInfo->NumberOfWordPlaces ); } else if (!ListMode) { /* Print a little header for the word, unless we were asked not to */ printf("# %s Offset %7lu Nocc %7lu WID %ld\n", WordInfo->Word, WordInfo->Offset, WordInfo->NumberOfWordPlaces, WordInfo->WID ); } if ((CountFiles || ListMode || Verbose == DISPLAY_ALL) && WordInfo->NumberOfWordPlaces) { /* If there are occurrences in the database (there might not be if * the word has been deleted, or has only just been added), * and we want all the matches, * then print the list of matches in the appropriate format: */ ShowWordList(db, WordInfo); } (void) efree(Buf); /* reclaim storage */ } static long FilesWithThisWord = 0; static char *theWord; static int theWordLength; PRIVATE int PrintPlace(db, WID, WordPlace) t_LQTEXT_Database *db; t_WID WID; t_WordPlace *WordPlace; { static t_FID LastFID = USI_MAX; /* This is not a plausible FID (File IDentifier), so it * will force a call to LQT_FIDToFileInfo() in the loop below. */ static unsigned int LastFlags = 256 * 128; /* Similarly, this is an impossible flag value, since the * flags are constrained to fit in a single byte. */ static char *LastRoot = "[internal error lqword.c 392]"; /* the message is in case I make a coding error!. The number * was once the line number of the message, but it only needs to * be a distinct enough message to search for. */ static t_FileInfo *FileInfo = (t_FileInfo *) 0; static t_WID LastWID = (t_WID) -1; char BIF[100]; char WIB[100]; register char *p; char *Bp, *Wp; long l; if (WID != LastWID || LastFlags != WordPlace->Flags) { t_WordInfo W; LastWID = WID; W.Word = theWord; W.Length = theWordLength; LastFlags = WordPlace->Flags; LastRoot = LQT_GenerateWordFromRoot(db, &W, LastFlags); /* LQT_GenerateWordFromRoot takes a canonical (singular, * lower-case) word and a set of flags, and reverses the * transformations implied by the flags. For example, * if WordInfo->Word is "boy" and flags contain the * Plural flag, you should get "boys" returned. * Since we don't remember whether a word was in all * caps or had only the first letter capitalised (at * the moment, anyway), the routine will return Boys * even if the input was BOYS or BoYs. * Possessives (the boy's books) may also be indicated. * * A pointer to a static buffer is returned. */ } if (LastFID != WordPlace->FID || FileInfo == (t_FileInfo *) 0) { /* The first part of the test means we don't call the * function to retrieve the file name lots of times if * there are multiple matches in the same data file. * This turns out to be a common case. */ /* Reclaim storage */ if (FileInfo) { LQT_DestroyFileInfo(db, FileInfo); FileInfo = 0; } /* Find the file name from the FID. This routine should * be called FID2FileName(), and may in fact be renamed * in the future. */ if (DoNames) { if ((FileInfo = LQT_FIDToFileInfo(db, LastFID = WordPlace->FID)) == (t_FileInfo *) 0) { /* No filename information available. This sometimes * happens if you run lqword diring an lqaddfile * session and match a word in one of the new files. * Note that if the output is for reuse, we don't * want to include references to files whose names * we don't have! */ if (!ListMode) { printf("%20s | %-.5lu/%-.3lu | [FID %ld]\n", LastRoot, WordPlace->BlockInFile, WordPlace->WordInBlock, WordPlace->FID ); } return 0; } } ++FilesWithThisWord; } if (CountFiles) return 0; /* This is an inline printf, because otherwise this call * to printf takes over 20% of the execution time, and nearly * 40% for a frequent word (e.g. over 1000 places) !! */ /* Block In File */ p = &BIF[sizeof(BIF) - 1]; *p = '\0'; if (WordPlace->BlockInFile == 0) { *--p = '0'; } else for (l = WordPlace->BlockInFile; l; l /= 10) { *--p = "0123456789"[l % 10]; } Bp = p; p = &WIB[sizeof(WIB) - 1]; *p = '\0'; { register unsigned long L = WordPlace->WordInBlock; if (L == 0) { *--p = '0'; } else for (; L; L /= 10) { *--p = "0123456789"[L % 10]; } Wp = p; } if (ListMode) { /* number of words in the phrase is One: */ putchar('1'); putchar(' '); while (*Bp) { putchar(*Bp); Bp++; } putchar(' '); while (*Wp) { putchar(*Wp); Wp++; } putchar(' '); printf("%ld ", WordPlace->FID); if (DoNames) { puts(FileInfo->Name); } else { putchar('\n'); } } else { /* Well, if we are not reusing the output, maybe the speed * is not quite so critical... */ printf("%*s %5lu %3lu %d %s %ld %d", db->MaxWordLength, LastRoot, WordPlace->BlockInFile, WordPlace->WordInBlock, WordPlace->StuffBefore, LQT_WordFlagsToString(db, (t_WordFlags) WordPlace->Flags), WordPlace->FID, FileInfo->FilterType ); if (DoNames) { putchar(' '); puts(FileInfo->Name); } else { putchar('\n'); } } return 0; } PRIVATE void ShowWordList(db, WordInfo) t_LQTEXT_Database *db; t_WordInfo *WordInfo; { t_pblock *pblock = (t_pblock *) 0; t_WordPlace *PP = (t_WordPlace *) 0; int Place; FilesWithThisWord = 0L; /* set the global variables (ugh) for the callback; the next * round of API changes will fix this, I hope. */ theWord = WordInfo->Word; theWordLength = WordInfo->Length; if (WordInfo->WordPlacesInHere >= WordInfo->NumberOfWordPlaces) { /* In this case, the match info all fits in the index, so it * does not matter if automatic pre-fetching from the overflow * file "data" happens or not (i.e. if we are using Lazy Evaluation, * it doesn't happen, but it makes no difference in this case). */ PP = WordInfo->WordPlaces; } else { /* If Lazy Evaluation is enabled (the default), liblqtext might not * have fetched all of the match information from the * overflow file ("data"), in which case we must do it now: */ pblock = LQT_GetpblockWhere(db, WordInfo, PrintPlace); /* this has printed all of the places... */ PP = (t_WordPlace *) 0; } if (PP) { /* cycle through the Place... */ for (Place = 0; Place < WordInfo->NumberOfWordPlaces; Place++) { PrintPlace(db, WordInfo->WID, &PP[Place]); } } if (CountFiles) { printf("%ld\n", FilesWithThisWord); } if (pblock) { /* If we had to go and get the matches ourselves, we had better * release the storage. * Actually we should also be freeing the FileInfo and possibly * the WordInfo as well, but the pblock is the biggest... and I * am only adding comments today, not fixing code (I hope)... * NOTDONE FIXME */ (void) efree((char *)pblock); } } PRIVATE void AllWordInfo(db, Verbose) t_LQTEXT_Database *db; int Verbose; { t_WID i; t_WID MaxWid = LQT_GetMaxWID(db); t_WordInfo *WordInfo; /* Loop over all possible WID numbers and print information * for each of them. */ for (i = (t_WID) 1; i <= MaxWid; i++) { if ((WordInfo = LQT_WIDToWordInfo(db, i)) != (t_WordInfo *) 0) { DisplayWordInfo(db, WordInfo, Verbose); LQT_DestroyWordInfo(db, WordInfo); } } /* for each WID */ if (!ListMode) { printf("Maximum WID is %lu\n", MaxWid); } } /* dbmmarch -- print every value in a dbm database. This might go * wrong (omitting some values) if the database is being concurrently * updated. */ PRIVATE void dbmmarch(db) t_LQTEXT_Database *db; { DBM *w; datum d; datum v; /* for dbnative only */ if ((w = LQT_OpenKeyValueDatabase(db, db->WordIndex)) == (DBM *) 0) { /* WordIndex is the list of words, defined in "globals.h". * If we didn't open it, the user probably has not set * $LQTEXTDIR, or didn't use the -d database-dir option that * is handled bu LQT_InitFromArgv() called from main(). */ Error(E_FATAL, "Can't open database file \"%s\"", db->WordIndex); } #ifdef dbnative for (w->seq(w, &d, &v, R_FIRST) == 0; d.dptr != (char *) 0 && d.dsize != 0; w->seq(w, &d, &v, R_NEXT) == 0 ) { #else /* The word database contains WID-->word matches, that look like * (key = "Word", content = WID) */ for (d = dbm_firstkey(w); d.dptr != (char *) 0 && d.dsize != 0; d = dbm_nextkey(w) ) { #endif register char *s; register char *start; /* IMPORTANT NOTE: * The words are not nul-terminated in the database. It is * therefore not safe to use printf() or puts() unless we make * a copy or are careful... */ for (start = s = d.dptr; s - start < d.dsize; s++) { putchar(*s); } putchar('\n'); #ifdef dbnative } #else } #endif LQT_CloseKeyValueDatabase(w); }