/* lqaddfile.c -- Copyright 1989, 1990, 1995, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* addfile -- add a file to the LQ-Text text retrieval index * Liam Quin, August 1989 and later... * * $Id: lqaddfile.c,v 1.43 1996/08/20 18:39:32 lee Exp lee $ */ static char *Version = "@(#) $Id: lqaddfile.c,v 1.43 1996/08/20 18:39:32 lee Exp lee $"; #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include #include #include #include #include #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #ifdef HAVE_FCNTL_H # include #endif #include "fileinfo.h" #include "wordinfo.h" #include "wordrules.h" #include "emalloc.h" #include "addfile.h" #include "lqutil.h" #include "liblqtext.h" #include "filter.h" #include "lqtrace.h" #include "revision.h" #define HAVE_MMAP 1 #ifdef HAVE_MMAP # include # ifndef MAP_FILE # define MAP_FILE 0 # endif #endif /* HAVE_MMAP */ /** Functions within this file that need declaring: **/ PRIVATE void AddStream( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_FileInfo *FileInfo #endif ); PRIVATE void AddFrom( #ifdef HAVE_PROTO t_LQTEXT_Database *db, char *Name #endif ); /* Symbol Table Interface */ extern void AddWord( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_WordInfo *WordInfo #endif ); extern void DumpCache( #ifdef HAVE_PROTO t_LQTEXT_Database *db, int CallFree #endif ); extern void SetDumpThresh( #ifdef HAVE_PROTO t_lqdbOptions *Options, int Thresh #endif ); PRIVATE int AddFile( #ifdef HAVE_PROTO t_LQTEXT_Database *db, char *Name #endif ); /**/ char *progname = "@(#) $Id: lqaddfile.c,v 1.43 1996/08/20 18:39:32 lee Exp lee $"; #ifdef USE_LINENUMBERS static int UseLineNumbers = 0; #endif /* The database we'll operate on: */ static t_LQTEXT_Database *dbForSignalHandler; static int SignalFlag = 0; int SignalHandler() { ++SignalFlag; if (SignalFlag > 3) { LQT_CloseDatabase(dbForSignalHandler); Error(E_FATAL, "received %d signals to quit, exiting; db may be corrupt!.", SignalFlag ); } return 0; } extern int SetHashSize( #ifdef HAVE_PROTO t_lqdbOptions *Options, int theNewSize #endif ); int main(argc, argv) int argc; char *argv[]; { extern int getopt(); extern char *optarg; extern int optind; extern int MaxWordsInCache; /* see wordtable.c */ int c; int ErrorFlag = 0; int DoNothing = 0; char *InputFile = (char *) 0; t_LQTEXT_Database *db; t_lqdbOptions *Options; #ifdef MALLOCTRACE malloc_debug(2); #endif progname = argv[0]; /* retain the full path at first */ #ifdef M_MXFAST (void) mallopt(M_MXFAST, 6); /* i.e. typical word length with \0 */ /* may need to comment mallopt() out entirely for BSD -- use ifndef. * seems to work under SunOS, though. * It says "Allocate 100 or so chunks of this size at a time, and whenever * I ask for this much or less, give me one of the chunks". */ #endif Options = LQT_InitFromArgv(argc, argv); while ((c = getopt(argc, argv, "w:f:H:M:xVZz:")) != -1) { switch (c) { case 'M': if (!LQU_cknatstr(optarg)) { Error(E_FATAL|E_USAGE|E_XHINT, "-M must be given a number >= 0, not \"%s\"", optarg ); } SetDumpThresh(Options, atoi(optarg)); break; case 'H': if (!LQU_cknatstr(optarg)) { Error(E_FATAL|E_USAGE|E_XHINT, "-H must be given a hash table size >= 1, not \"%s\"", optarg ); } SetHashSize(Options, atoi(optarg)); break; case 'w': if (!LQU_cknatstr(optarg)) { Error(E_FATAL|E_USAGE|E_XHINT, "-w must be given a number >= 0, not \"%s\"", optarg ); } MaxWordsInCache = atoi(optarg); break; case 'Z': case 'z': break; /* work done in SetDefault() */ case 'V': fprintf(stderr, "%s: Release: %s\n", progname, LQTEXTREVISION); fprintf(stderr, "%s: Revision: %s\n", progname, Version); DoNothing = 1; break; case 'f': if (InputFile) { Error(E_USAGE|E_XHINT|E_FATAL, "only one -f option allowed; use -xv for explanation" ); } InputFile = optarg; break; case 'x': ErrorFlag = (-1); break; default: case '?': ErrorFlag = 1; } } if ((progname = strrchr(progname, '/')) != (char *) NULL) { ++progname; /* step over the last / */ } else { progname = argv[0]; } if (ErrorFlag > 0) { fprintf(stderr, "use %s -x or %s -xv for an explanation.\n", progname, progname); exit(1); } else if (ErrorFlag < 0) { /* -x was used */ fprintf(stderr, "%s -- add files to an lq-text retrieval database\n", progname); fputs("Options are:\n\ -f file -- read the list of files to index from \"file\"\n\ -M n -- try not to flush cache entries with n or more entries\n\ -w n -- dump the word-cache every n words\n\ \n\ ", stderr); LQT_PrintDefaultUsage(Options); if (LQT_TraceFlagsSet(LQTRACE_VERBOSE)) { /* used -v or -t1 */ fprintf(stderr, "\n\ Any remaining arguments are taken to be file names. The current\n\ DOCPATH (%s) is searched for the files,\n\ and they are read and added to the index.\n\ If you use the -f option, you should not give filename\n\ arguments on the command line, although you can use \"-f -\" to read the\n\ list of files from standard input, one per line.\n\ Setting (with -w) the size of the cache may dramatically\n\ improve performance. Systems with memory larger than the data can try -w0.\n\ See %s(1) for more information.\n", (char *) LQT_GetOption(Options, "file search path"), progname ); } exit(0); } #ifdef WIDINBLOCK # ifdef ASCIITRACE /* remind people to recompile... */ Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); /* remind them a lot... */ Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); sleep(5); Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); sleep(5); Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); sleep(5); Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); # else /* don't allow -DWIDINBLOCK without -DASCIITRACE */ Error(E_BUG, "Compiled with -DWIDINBLOCK but not -DASCIITRACE!"); syntax error; this prevents compilation here; # endif /* ASCIITRACE */ #endif if (DoNothing) { if (optind < argc) { Error(E_WARN|E_XHINT, "%d extra argument%s ignored...", argc - optind, argc - optind == 1 ? "" : "%s" ); } exit(0); } db = LQT_OpenDatabase(Options, O_RDWR|O_CREAT, 0664); LQT_ObtainWriteAccess(db); LQT_InitFilterTable(db); dbForSignalHandler = db; lqSetSignals(SignalHandler); if (InputFile) { if (optind < argc) { Error(E_FATAL|E_USAGE|E_XHINT, "cannot give filenames after -f %s", InputFile ); } AddFrom(db, InputFile); } else for (; optind < argc; ++optind) { if ( AddFile(db, argv[optind]) < 0 && LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG) ) { if (SignalFlag) { Error(E_WARN, "Caught signal at level %d, dumping cache", SignalFlag ); DumpCache(db, DUMP_SYNC); LQT_CloseDatabase(db); exit(1); } else { Error(E_WARN, "%s not added to index", argv[optind]); } } } #ifndef MALLOCTRACE /* don't bother recaiming storage if we're about to exit, unless we * want to check for memory leaks afterwards. */ DumpCache(db, DUMP_SYNC|DUMP_NOFREE); #else DumpCache(db, DUMP_SYNC); #endif LQT_CloseDatabase(db); #ifdef MALLOCTRACE (void) fprintf(stderr, "%s: Malloctrace: checking...\n", progname); malloc_verify(); (void) fprintf(stderr, "%s: Malloc Map\n", progname); mallocmap(); #endif #ifdef WIDINBLOCK # ifdef ASCIITRACE /* remind people again to recompile... */ Error(E_WARN, "Reminder: Compiled with -DWIDINBLOCK for debugging ****"); Error(E_WARN, "Reminder: Compiled with -DWIDINBLOCK for debugging ****"); Error(E_WARN, "Reminder: Compiled with -DWIDINBLOCK for debugging ****"); # else /* don't allow -DWIDINBLOCK without -DASCIITRACE */ Error(E_BUG, "Compiled with -DWIDINBLOCK but not -DASCIITRACE!"); syntax error; this prevents compilation here; # endif /* ASCIITRACE */ #endif #ifdef ASCIITRACE /* memory statistics */ if (LQT_TraceFlagsSet(LQTRACE_DEBUG)) { extern caddr_t sbrk( #ifdef HAVE_PROTO int incr #endif ); extern caddr_t etext; caddr_t top = sbrk(0); LQT_Trace(LQTRACE_DEBUG, "Memory: etext 0x%x, top 0x%x, difference %lu\n", etext, top, (top > etext) ? top - etext : etext - top ); } #endif return 0; } static void AddFrom(db, Name) t_LQTEXT_Database *db; char *Name; { FILE *fp; char *Line; if (Name[0] == '-' && Name[1] == '\0') { fp = stdin; } else { fp = LQU_fEopen(E_FATAL, Name, "list of files to add", "r"); } while (LQU_fReadLine(fp, &Line, LQUF_NORMAL) > 0) { /* Note: * LQU_fReadFile will silently swallow blank lines. * if we use LQUF_NORMAL it will swallow lines that start with a #, * but we don't want that here! */ if (AddFile(db, Line) < 0) { if (SignalFlag) { Error(E_WARN, "Caught signal at level %d -- dumping cache", SignalFlag ); DumpCache(db, DUMP_SYNC); LQT_CloseDatabase(db); exit(1); } else { if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) { /* AddFile should already have printed a message... */ Error(E_WARN, "-f %s: \"%s\" not added to index", Name, Line ); } } } } if (fp != stdin) { (void) fclose(fp); } } PRIVATE int AddFile(db, Name) t_LQTEXT_Database *db; char *Name; { t_FileInfo *theFileInfo; if (!Name || !*Name) { return -1; } if ((theFileInfo = LQT_MakeFileInfo(db, Name)) == (t_FileInfo *) 0) { return -1; } AddStream(db, theFileInfo); LQT_SaveFileInfo(db, theFileInfo); LQT_DestroyFileInfo(db, theFileInfo); if (SignalFlag) { return -1; } return 0; } #define IS_REGULAR_FILE(db, FileInfo) ((FileInfo)->FilterType == 0) PRIVATE void AddStream(db, FileInfo) t_LQTEXT_Database *db; t_FileInfo *FileInfo; { /* I have to mark the last word in the block. * I do that by marking the previous word if it was in a differant block * than the current one. */ t_WordInfo *WordInfo; t_WordInfo *LastWord = 0; long wordCount = 0; char *Base = 0; char *Start; char *End = 0; #ifdef HAVE_MMAP /* If it's a regular file, we may be able to use mmap() to * bring it into memory. * It is a regular file if the open function for it is fclose. */ if (FileInfo->FileSize && IS_REGULAR_FILE(db, FileInfo)) { Base = mmap( 0, FileInfo->FileSize, PROT_READ, MAP_FILE|MAP_SHARED, fileno(FileInfo->Stream), 0 ); if (Base == (caddr_t) -1) { Base = 0; Error(E_WARN|E_SYS, "mmap(%s) failed", FileInfo->Name); } else { Start = Base; End = &Base[FileInfo->FileSize]; /* reset ReadWord: */ (void) LQT_ReadWordFromStringPointer(db, 0, 0, 0, 0); } } #endif while (SignalFlag <= 1) { /* needs more than one signal to quit in the middle of a file */ if (Base) { WordInfo = LQT_ReadWordFromStringPointer( db, &Start, (char **) NULL, End, LQT_READWORD_IGNORE_COMMON ); if (WordInfo) { WordInfo->WordPlace.FID = FileInfo->FID; } else { break; } } else { WordInfo = LQT_ReadWordFromFileInfo( db, FileInfo, LQT_READWORD_IGNORE_COMMON ); } if (WordInfo == (t_WordInfo *) 0) { break; } else { if (LastWord) { /* TODO: move this rubbish into ReadWord() and hide it! */ if (LastWord->WordPlace.BlockInFile != WordInfo->WordPlace.BlockInFile) { LastWord->WordPlace.Flags |= WPF_LASTINBLOCK; } AddWord(db, LastWord); ++wordCount; } LastWord = WordInfo; } } if (LastWord) { AddWord(db, LastWord); ++wordCount; } #ifdef HAVE_MMAP if (Base) { (void) munmap(Base, FileInfo->FileSize); Base = 0; } #endif if (SignalFlag > 1) { Error(E_WARN|E_MULTILINE, "Signal received during processing of %s", FileInfo->Name ); Error(E_WARN|E_MULTILINE|E_LASTLINE, "That and other files may be incomplete..." ); return; } if (SignalFlag <= 1 && LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%d: %s: type: %s words: %lu\n", FileInfo->FID, FileInfo->Name, LQT_GetFilterName(db, FileInfo), wordCount ); } }