/* addbyline.c -- Copyright 1995, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* addbyline -- add a file to the LQ-Text text retrieval index * Liam Quin, August 1989 and later... * Modified to prevent phrase matches over line boundaries. * * $Id: addbyline.c,v 1.8 2001/05/31 03:50:13 liam Exp $ */ static char *Version = "@(#) $Id: addbyline.c,v 1.8 2001/05/31 03:50:13 liam Exp $"; #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include #include #include #ifdef HAVE_FCNTL_H # ifdef HAVE_SYSV_FCNTL_H # include # endif # include #endif #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #ifdef HAVE_UNISTD_H # include #endif #include "fileinfo.h" #include "wordinfo.h" #include "wordrules.h" #include "emalloc.h" #include "addfile.h" #include "lqutil.h" #include "revision.h" #include "liblqtext.h" #include "lqtrace.h" #include "filter.h" /** Functions within this file that need declaring: **/ PRIVATE void AddStream( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_FileInfo *FileInfo #endif ); PRIVATE void AddFrom( #ifdef HAVE_PROTO char *Name #endif ); /* Symbol Table Interface */ extern void AddWord( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_WordInfo *WordInfo #endif ); extern void DumpCache( #ifdef HAVE_PROTO t_LQTEXT_Database *db, int CallFree #endif ); extern void SetDumpThresh( #ifdef HAVE_PROTO t_LQTEXT_Database *db, int Thresh #endif ); PRIVATE int AddFile( #ifdef HAVE_PROTO char *Name #endif ); extern int SetHashSize( #ifdef HAVE_PROTO t_LQTEXT_Database *db, int theNewSize #endif ); /**/ char *progname = "@(#) $Id: addbyline.c,v 1.8 2001/05/31 03:50:13 liam Exp $"; #ifdef USE_LINENUMBERS static int UseLineNumbers = 0; #endif static int SignalFlag = 0; t_LQTEXT_Database *db = 0; PRIVATE int SignalHandler() { ++SignalFlag; if (SignalFlag > 3) { LQT_CloseDatabase(db); Error(E_FATAL, "received %d signals to quit, exiting; db may be corrupt!.", SignalFlag ); } return 0; } int main(argc, argv) int argc; char *argv[]; { extern int getopt(); extern char *optarg; extern int optind; extern int MaxWordsInCache; /* see wordtable.c */ t_lqdbOptions *Options; int c; int ErrorFlag = 0; int DoNothing = 0; char *InputFile = (char *) 0; #ifdef MALLOCTRACE malloc_debug(2); #endif progname = argv[0]; /* retain the full path at first */ #ifdef M_MXFAST (void) mallopt(M_MXFAST, 6); /* i.e. typical word length with \0 */ /* may need to comment mallopt() out entirely for BSD -- use ifndef. * seems to work under SunOS, though. * It says "Allocate 100 or so chunks of this size at a time, and whenever * I ask for this much or less, give me one of the chunks". */ #endif Options = LQT_InitFromArgv(argc, argv); while ((c = getopt(argc, argv, "w:f:H:M:xVZz:")) != -1) { switch (c) { case 'M': if (!LQU_cknatstr(optarg)) { Error(E_FATAL|E_USAGE|E_XHINT, "-M must be given a number >= 0, not \"%s\"", optarg ); } SetDumpThresh(db, atoi(optarg)); break; case 'H': if (!LQU_cknatstr(optarg)) { Error(E_FATAL|E_USAGE|E_XHINT, "-H must be given a hash table size >= 1, not \"%s\"", optarg ); } SetHashSize(db, atoi(optarg)); break; case 'w': if (!LQU_cknatstr(optarg)) { Error(E_FATAL|E_USAGE|E_XHINT, "-w must be given a number >= 0, not \"%s\"", optarg ); } MaxWordsInCache = atoi(optarg); break; case 'Z': case 'z': break; /* work done in SetDefault() */ case 'V': fprintf(stderr, "%s: Release: %s\n", progname, LQTEXTREVISION); fprintf(stderr, "%s: Revision: %s\n", progname, Version); DoNothing = 1; break; case 'f': if (InputFile) { Error(E_USAGE|E_XHINT|E_FATAL, "only one -f option allowed; use -xv for explanation" ); } InputFile = optarg; break; case 'x': ErrorFlag = (-1); break; default: case '?': ErrorFlag = 1; } } if ((progname = strrchr(progname, '/')) != (char *) NULL) { ++progname; /* step over the last / */ } else { progname = argv[0]; } if (ErrorFlag > 0) { fprintf(stderr, "use %s -x or %s -xv for an explanation.\n", progname, progname); exit(1); } else if (ErrorFlag < 0) { /* -x was used */ fprintf(stderr, "%s -- add files to an lq-text retrieval database\n", progname); fputs("Options are:\n\ -f file -- read the list of files to index from \"file\"\n\ -M n -- try not to flush cache entries with n or more entries\n\ -w n -- dump the word-cache every n words\n\ \n\ ", stderr); LQT_PrintDefaultUsage(Options); if (LQT_TraceFlagsSet(LQTRACE_VERBOSE)) { /* used -v or -t1 */ fprintf(stderr, "\n\ Any remaining arguments are taken to be file names. The current\n\ DOCPATH (%s) is searched for the files,\n\ and they are read and added to the index.\n\ If you use the -f option, you should not give filename\n\ arguments on the command line, although you can use \"-f -\" to read the\n\ list of files from standard input, one per line.\n\ Setting (with -w) the size of the cache may dramatically\n\ improve performance. Systems with memory larger than the data can try -w0.\n\ See %s(1) for more information.\n", Options->filesearchpath.Value, progname ); } exit(0); } #ifdef WIDINBLOCK # ifdef ASCIITRACE /* remind people to recompile... */ Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); /* remind them a lot... */ Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); # else /* don't allow -DWIDINBLOCK without -DASCIITRACE */ Error(E_BUG, "Compiled with -DWIDINBLOCK but not -DASCIITRACE!"); syntax error; this prevents compilation here; # endif /* ASCIITRACE */ #endif if (DoNothing) { if (optind < argc) { Error(E_WARN|E_XHINT, "%d extra argument%s ignored...", argc - optind, argc - optind == 1 ? "" : "%s" ); } exit(0); } if (!(db = LQT_OpenDatabase(Options, O_RDWR|O_CREAT, 0664))) { Error(E_FATAL, "couldn't write to database."); } LQT_ObtainWriteAccess(db); LQT_InitFilterTable(db); lqSetSignals(SignalHandler); if (InputFile) { if (optind < argc) { Error(E_FATAL|E_USAGE|E_XHINT, "cannot give filenames after -f %s", InputFile ); } AddFrom(InputFile); } else for (; optind < argc; ++optind) { if ( AddFile(argv[optind]) < 0 && LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG) ) { if (SignalFlag) { Error(E_WARN, "Caught signal at level %d, dumping cache", SignalFlag ); DumpCache(db, DUMP_SYNC); LQT_CloseDatabase(db); exit(1); } else { Error(E_WARN, "%s not added to index", argv[optind]); } } } #ifndef MALLOCTRACE /* don't bother recaiming storage if we're about to exit, unless we * want to check for memory leaks afterwards. */ DumpCache(db, DUMP_SYNC|DUMP_NOFREE); #else DumpCache(db, DUMP_SYNC); #endif LQT_CloseDatabase(db); #ifdef MALLOCTRACE (void) fprintf(stderr, "%s: Malloctrace: checking...\n", progname); malloc_verify(); (void) fprintf(stderr, "%s: Malloc Map\n", progname); mallocmap(); #endif #ifdef WIDINBLOCK # ifdef ASCIITRACE /* remind people again to recompile... */ Error(E_WARN, "Reminder: Compiled with -DWIDINBLOCK for debugging ****"); Error(E_WARN, "Reminder: Compiled with -DWIDINBLOCK for debugging ****"); Error(E_WARN, "Reminder: Compiled with -DWIDINBLOCK for debugging ****"); # else /* don't allow -DWIDINBLOCK without -DASCIITRACE */ Error(E_BUG, "Compiled with -DWIDINBLOCK but not -DASCIITRACE!"); syntax error; this prevents compilation here; # endif /* ASCIITRACE */ #endif return 0; } static void AddFrom(Name) char *Name; { FILE *fp; char *Line; if (Name[0] == '-' && Name[1] == '\0') { fp = stdin; } else { fp = LQU_fEopen(E_FATAL, Name, "list of files to add", "r"); } while (LQU_fReadLine(fp, &Line, LQUF_NORMAL) > 0) { /* Note: * LQU_fReadFile will silently swallow blank lines. * if we use LQUF_NORMAL it will swallow lines that start with a #, * but we don't want that here! */ if (AddFile(Line) < 0) { if (SignalFlag) { Error(E_WARN, "Caught signal at level %d -- dumping cache", SignalFlag ); DumpCache(db, DUMP_SYNC); LQT_CloseDatabase(db); exit(1); } else { if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) { /* AddFile should already have printed a message... */ Error(E_WARN, "-f %s: \"%s\" not added to index", Name, Line ); } } } } if (fp != stdin) { (void) fclose(fp); } } PRIVATE int AddFile(Name) char *Name; { t_FileInfo *theFileInfo; if (!Name || !*Name) { return -1; } if ((theFileInfo = LQT_MakeFileInfo(db, Name)) == (t_FileInfo *) 0) { return -1; } AddStream(db, theFileInfo); LQT_SaveFileInfo(db, theFileInfo); LQT_DestroyFileInfo(db, theFileInfo); if (SignalFlag) { return -1; } return 0; } PRIVATE void AddStream(db, FileInfo) t_LQTEXT_Database *db; t_FileInfo *FileInfo; { /* I have to mark the last word in the block. * I do that by marking the previous word if it was in a differant block * than the current one. */ char *Line; long lineNumber; int Len; lineNumber = 0; /* Read lines one at a time into Line and then read the words from * those lines. * Note: LQU_fReadFile() uses a private (but growable) buffer. */ while ((Len = LQU_fReadLine(FileInfo->Stream, &Line, 0)) != -1) { char *Start; t_WordInfo *WordInfo; t_WordInfo *LastWord = 0; ++lineNumber; if (!Line || !*Line || Len < 0) { continue; } if (db->FileBlockSize <= Len) { db->FileBlockSize = Len + 1; } /* reset the word-reading routine */ (void) LQT_ReadWordFromStringPointer( db, (char **) NULL, (char **) NULL, (char *) NULL, 0 ); /* add the words in this line, one at a time. * We are always one word behind, because when ReadWord * finds punctuation after a word, it sets the flag in the * previous word's WordPlace... so we have to leave it in place * to get set! */ Start = Line; LastWord = (t_WordInfo *) 0; while (SignalFlag <= 1) { /* needs more than one signal to quit in the middle of a file */ WordInfo = LQT_ReadWordFromStringPointer( db, &Start, (char **) NULL, &Line[Len], LQT_READWORD_IGNORE_COMMON ); if (WordInfo == (t_WordInfo *) NULL) { break; } else { WordInfo->WordPlace.BlockInFile = lineNumber; WordInfo->WordPlace.FID = FileInfo->FID; if (LastWord) { AddWord(db, LastWord); } LastWord = WordInfo; } } if (LastWord) { /* ensure that the WPF_LASTINBLOCK flag is not set */ LastWord->WordPlace.Flags &= ~WPF_LASTINBLOCK; LastWord->WordPlace.BlockInFile = lineNumber; LastWord->WordPlace.FID = FileInfo->FID; AddWord(db, LastWord); LastWord = (t_WordInfo *) 0; } if (SignalFlag > 1) { break; } } if (SignalFlag > 1) { Error(E_WARN, "Signal received during processing of %s", FileInfo->Name); Error(E_WARN, "That and other files may be incomplete..."); return; } if (SignalFlag <= 1 && LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%d: %s: indexed.\n", FileInfo->FID, FileInfo->Name ); } }