/* lqaddfile.c -- Copyright 1989, 1990, 1995, 1996 Liam R. E. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* addfile -- add a file to the LQ-Text text retrieval index * Liam Quin, August 1989 and later... * * $Id: lqaddfile.c,v 1.41 96/07/27 01:38:12 lee Exp $ */ static char *Version = "@(#) $Id: lqaddfile.c,v 1.41 96/07/27 01:38:12 lee Exp $"; #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include #include #include #include #include #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #endif #include #ifdef HAVE_FCNTL_H # include #endif #include "fileinfo.h" #include "wordinfo.h" #include "wordrules.h" #include "emalloc.h" #include "addfile.h" #include "lqutil.h" #include "liblqtext.h" #include "filter.h" #include "lqtrace.h" #include "revision.h" /** Functions within this file that need declaring: **/ PRIVATE void AddStream( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_FileInfo *FileInfo #endif ); /* Symbol Table Interface */ extern void AddWord( #ifdef HAVE_PROTO t_LQTEXT_Database *db, t_WordInfo *WordInfo #endif ); extern void DumpCache( #ifdef HAVE_PROTO t_LQTEXT_Database *db, int CallFree #endif ); extern void SetDumpThresh( #ifdef HAVE_PROTO t_lqdbOptions *Options, int Thresh #endif ); PRIVATE int AddFile( #ifdef HAVE_PROTO t_LQTEXT_Database *db, char *Name #endif ); /**/ char *progname = "@(#) $Id: lqaddfile.c,v 1.41 96/07/27 01:38:12 lee Exp $"; #ifdef USE_LINENUMBERS static int UseLineNumbers = 0; #endif /* The database we'll operate on: */ static t_LQTEXT_Database *dbForSignalHandler; static int SignalFlag = 0; int SignalHandler() { ++SignalFlag; if (SignalFlag > 3) { LQT_CloseDatabase(dbForSignalHandler); Error(E_FATAL, "received %d signals to quit, exiting; db may be corrupt!.", SignalFlag ); } return 0; } extern int SetHashSize( #ifdef HAVE_PROTO t_lqdbOptions *Options, int theNewSize #endif ); static char IAmAChildDaemon = 0; typedef struct { char isRunning; FILE *input; #ifdef HAVE_PIPE int pid; #endif char *inputFile; int optind; int argc; char **argv: } t_ThingStream; typedef struct { char *DocumentName; /* what the user gave */ char *FileName; /* where we actually found it */ int FileType; } t_Thing; PRIVATE t_Thing * getThingToIndex(findFileDaemon) t_ThingStream *findFileDaemon; { char *Line; int len; if (!findFileDaemon->isRunning) { return (char *) NULL; } if (!findFileDaemon->input) { return (char *) NULL; } /* The 0 in the call to LQU_fReadLine() prevents it from interpreting * a # at the start of a line as beginning a comment. * If LQUF_IGNSPACE were to be used instead, LQU_fReadLine would also * elide leading & trailing spaces from the input. */ while ((len = LQU_fReadLine(findFileDaemon->input, &Line, 0)) != -1) { if (!Line || !*Line || !Line[0]) { continue; } if (len < 6) { Error(E_WARN|E_INTERNAL|E_BUG, "protocol error: findFileDaemon returned [%s]", Line ); continue; } /* the daemon can pass warnings through to us so that we can * report them -- since it;s a separate process, its own * warnings would be randomly interspersed with ours, making * them hard to read. */ if (STRNCMP(Line, "warn\t", 5) == 0) { Error(E_WARN, "%s", &Line[6]); continue; } if (STRNCMP(Line, "add\t", 4) == 0) { static t_Thing Result; char *p; /* The format is * addtypenamelocation\n * where type, is an integer offset into the filter table; * name is the filename as given by the user; * location is the full pathname where we found the file */ Result.FileType = 0; for (p = &Line[4]; ; p++) { if (isdigit(*p)) { Result.FileType *= 10; Result.FileType += *p - '0'; } else if (*p == '\t') { break; } else { Error(E_WARN|E_INTERNAL|E_BUG, "findFileDaemon protocol error after type in [%s]", Line ); continue; } } /* ASSERT: *p == \t */ p++; Result.FileName = p; while (*p && *p != '\t) { p++; } if (*p != '\t') { Error(E_WARN|E_INTERNAL|E_BUG, "findFileDaemon protocol error after fn in [%s]", Line ); continue; } *p = '\0'; ++p; if (!*p) { Error(E_WARN|E_INTERNAL|E_BUG, "findFileDaemon protocol error (empty docname) in [%s]", Line ); continue; } Result.DocumentName = p; if (Result.Line) { /* It's left over from last time we were called */ efree(Result.Line); } /* save the line, and free it later: */ Result.Line = LQU_StealReadLineBuffer(); return &Result; } } /* reached EOF */ /* close the file descriptor, so we don't get a broken pipe signal */ if (findFileDaemon->input) { (void) fclose(findFileDaemon->input); findFileDaemon->input = 0; } /* signal EOF by returning NULL: */ return (char *) NULL; } PRIVATE void destroyFindFileDaemon(findFileDaemon) t_ThingStream *findFileDaemon; { int status; if (!findFileDaemon) { Error(E_BUG|E_FATAL|E_INTERNAL, "%s: %d: destroyFindFileDaemon(NULL)", __FILE__, __LINE__ ); } if (!findFileDaemon->isRunning) { return; } if (findFileDaemon->input) { (void) fclose(findFileDaemon->input); findFileDaemon->input = 0; } if (findFileDaemon->pid > 0) { (void) kill(9, findFileDaemon->pid); } else { return; /* already done */ } for (;;) { int thePid; char *name; thePid = wait(&status); if (thePid == findFileDaemon->pid) { name = "findFile daemon"; } else if (thePid == -1) { /* no more children, so ours has already gone... */ Error(E_WARN, "findFileDaemon went away silently..."); findFileDaemon->pid = 0; findFileDaemon->isRunning = 0; return; } else { name = "unknown child process"; } switch (status & 0377) { case 0177: /* a child was suspended -- we don't care. * probably it's being traced or debugged. */ if (thePid == findFileDaemon->pid) { Error(E_BUG|E_WARN, "destroyFindFileDaemon: daemon %d is stopped!", thePid ); continue; } break; case 0: /* it died by calling exit */ { char *msg = ""; int result = (status >> 8) & 0377; if (status & 0200) { msg = " (memory image saved to \"core\" for debugging)"; } LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%s: %d: exit %d%s", name, thePid, result, msg ); } break; default: /* it died from a signal */ LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%s: %d: killed by signal %d", name, thePid, status & 0177 ); } if (thePid == findFileDaemon->pid) { findFileDaemon->pid = 0; findFileDaemon->isRunning = 0; break; } } /* forever */ } PRIVATE t_ThingStream * startFindFileDaemon(db, Options, argc, argv, inputFile) t_LQTEXT_Database *db; t_lqdbOptions *Options; int argc; char *argv[]; char *inputFile; { static t_ThingStream Me; int forkedOK = 0; int io[2]; *pidp = -1; t_FileInfo *FileInfo; Me.pid = 0; Me.input = 0; Me.isRunning = 0; Me.inputFile = inputFile; Me.argc = argc; Me.argv = argv: /* optind is used to record how far we've got processing arguments */ Me.optind = 0; /* There are two strategies hidden here. * (1) If we can do forking (e.g. we're on Unix), we * start up a separate process, the findFileDaemon. * The findFileDaemon looks at each filename we're given, * finds it somewhere in the database document path, and tries to * determine its file type by opening it and inspecting it, or by * using its suffix. * Whenever it suceeds, it writes out the follwing: * the file type (an integer) * the filename as given * the filename actually found * onto a pipe that the parent addfile process is reading. * * The parent calls getThingToIndex() repeatedly to read this * information and index the file. This strategy means that while * the findFileDaemon is waiting for an open() or stat() or whatever, * the parent can be doing indexing -- and vice versa. * * The parent has to do a single open on each file, but the inode will * usually be in the cache, unless the files are really big, in which * case the extra overhead is insignificant anyway. * * (2) if there is no forking available (e.g. on Windows NT), we * simply save state in the struct we return, and do all the work in * getThingToIndex(), returning the same object as before. */ #ifdef HAVE_FORK if (pipe(io) == -1) { Error(E_WARN|E_SYS, "startFindFileDaemon: couldn't make pipe"); /* no point making a fork now... */ forkedOK = 0; } else { Me.pid = fork(); if (Me.pid == -1) { Error(E_WARN|E_SYS, "startFindFileDaemon: couldn't fork"); forkedOK = 0; } else if (Me.pid != 0) { /* parent: we've started the child successfully: */ Me.isRunning = 1; /* we'll read from one end of the pipe... */ Me.input = fdopen(io[0], "r"); Me.output = (FILE *) NULL; /* ... and close the end of the pipe that you write into: */ (void) close(fd[1]); /* Me now encapsulates everything we need. */ return &Me; } else { /* child process */ forkedOK = 1; /* The child doesn't want the reading end of the pipe */ (void) close(io[0]); Me.input = (FILE *) NULL; Me.output = fdopen(io[1], "w"); /* fall through and continue */ } } if (InputFile) { FILE *fp; char *Line; if (Name[0] == '-' && Name[1] == '\0') { fp = stdin; } else { fp = LQU_fEopen(E_FATAL, Name, "list of files to add", "r"); } while (LQU_fReadLine(fp, &Line, 0) != -1) { if (forkedOK) { FileInfo = oneFileNameForDaemon(db, InputFile); if (FileInfo) { fprintf(Me.output, "%d\t%s\t%s\n", FileInfo->FilterType, Line, FileInfo->Name; ); } } else { AddFile(db, Line); } } if (fp != stdin) { fclose(fp); } } else { int optind; for (optind = 0; optind < argc; ++optind) { if (forkedOK) { oneFileName(db, InputFile); } else { AddFile(db, Line); } } } if (forkedOK) { /* We are the child process, and we've finished. * The parent will kill us eventually, but in the * mean-time, let's sleep. * This is because if the child dies first, the parent on * some systems may get a broken pipe message. * TODO: trap SIG_PIPE?? * So we wait meekly and humbly to be executed... * * Note that we won't get a chance to close the database -- so * it's fortunate that we have not yet asked for write access... */ for (;;) { (void) sleep(32760); /* wait 8 hours or so, */ /* I am using 32760 in case some systems only allow a * 16-bit argument here... */ } } } int main(argc, argv) int argc; char *argv[]; { extern int getopt(); extern char *optarg; extern int optind; extern int MaxWordsInCache; /* see wordtable.c */ int c; int ErrorFlag = 0; int DoNothing = 0; char *InputFile = (char *) 0; t_LQTEXT_Database *db; t_lqdbOptions *Options; FILE *findFileDaemon; #ifdef MALLOCTRACE malloc_debug(2); #endif progname = argv[0]; /* retain the full path at first */ #ifdef M_MXFAST (void) mallopt(M_MXFAST, 6); /* i.e. typical word length with \0 */ /* may need to comment mallopt() out entirely for BSD -- use ifndef. * seems to work under SunOS, though. * It says "Allocate 100 or so chunks of this size at a time, and whenever * I ask for this much or less, give me one of the chunks". */ #endif Options = LQT_InitFromArgv(argc, argv); while ((c = getopt(argc, argv, "w:f:H:M:xVZz:")) != -1) { switch (c) { case 'M': if (!LQU_cknatstr(optarg)) { Error(E_FATAL|E_USAGE|E_XHINT, "-M must be given a number >= 0, not \"%s\"", optarg ); } SetDumpThresh(Options, atoi(optarg)); break; case 'H': if (!LQU_cknatstr(optarg)) { Error(E_FATAL|E_USAGE|E_XHINT, "-H must be given a hash table size >= 1, not \"%s\"", optarg ); } SetHashSize(Options, atoi(optarg)); break; case 'w': if (!LQU_cknatstr(optarg)) { Error(E_FATAL|E_USAGE|E_XHINT, "-w must be given a number >= 0, not \"%s\"", optarg ); } MaxWordsInCache = atoi(optarg); break; case 'Z': case 'z': break; /* work done in SetDefault() */ case 'V': fprintf(stderr, "%s: Release: %s\n", progname, LQTEXTREVISION); fprintf(stderr, "%s: Revision: %s\n", progname, Version); DoNothing = 1; break; case 'f': if (InputFile) { Error(E_USAGE|E_XHINT|E_FATAL, "only one -f option allowed; use -xv for explanation" ); } InputFile = optarg; break; case 'x': ErrorFlag = (-1); break; default: case '?': ErrorFlag = 1; } } if ((progname = strrchr(progname, '/')) != (char *) NULL) { ++progname; /* step over the last / */ } else { progname = argv[0]; } if (ErrorFlag > 0) { fprintf(stderr, "use %s -x or %s -xv for an explanation.\n", progname, progname); exit(1); } else if (ErrorFlag < 0) { /* -x was used */ fprintf(stderr, "%s -- add files to an lq-text retrieval database\n", progname); fputs("Options are:\n\ -f file -- read the list of files to index from \"file\"\n\ -M n -- try not to flush cache entries with n or more entries\n\ -w n -- dump the word-cache every n words\n\ \n\ ", stderr); LQT_PrintDefaultUsage(Options); if (LQT_TraceFlagsSet(LQTRACE_VERBOSE)) { /* used -v or -t1 */ fprintf(stderr, "\n\ Any remaining arguments are taken to be file names. The current\n\ DOCPATH (%s) is searched for the files,\n\ and they are read and added to the index.\n\ If you use the -f option, you should not give filename\n\ arguments on the command line, although you can use \"-f -\" to read the\n\ list of files from standard input, one per line.\n\ Setting (with -w) the size of the cache may dramatically\n\ improve performance. Systems with memory larger than the data can try -w0.\n\ See %s(1) for more information.\n", (char *) LQT_GetOption(Options, "file search path"), progname ); } exit(0); } #ifdef WIDINBLOCK # ifdef ASCIITRACE /* remind people to recompile... */ Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); /* remind them a lot... */ Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); sleep(5); Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); sleep(5); Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); sleep(5); Error(E_WARN, "**** Compiled with -DWIDINBLOCK for debugging ****"); # else /* don't allow -DWIDINBLOCK without -DASCIITRACE */ Error(E_BUG, "Compiled with -DWIDINBLOCK but not -DASCIITRACE!"); syntax error; this prevents compilation here; # endif /* ASCIITRACE */ #endif if (DoNothing) { if (optind < argc) { Error(E_WARN|E_XHINT, "%d extra argument%s ignored...", argc - optind, argc - optind == 1 ? "" : "%s" ); } exit(0); } /* some checking first: */ if (InputFile && optind < argc) { Error(E_FATAL|E_USAGE|E_XHINT, "cannot give filenames after -f %s", InputFile ); } } /* OK, now open the database; * We specify O_CREAT so that we will create a new database if * there isn't one there already. As a sanity check, * LQT_InitFromArgv() has already checked that there is a * config.txt file in the database directory, so it's a plausible * place in which to create a new database if we have to. * We will end up creating a few files (depending on how * lq-text was compiled) in that directory -- typically anywhere from * five up to about a dozen files. */ db = LQT_OpenDatabase(Options, O_RDWR|O_CREAT, 0664); /* The filter table is not initialised by default, in order to avoid * linking in all the filter code. Only part of it is initialised; * just enough for read-only access. So we need to initialise the rest, * because we will be using input filters: */ LQT_InitFilterTable(db); /* arrange to catch interrupts */ dbForSignalHandler = db; lqSetSignals(SignalHandler); /* We create a FindFileDaemon object; this may be implemented as * a separate thread or a separate process entirely on some systems. * We hand it all of our remaining filename arguments to process. */ findFileDaemon = startFindFileDaemon( db, Options, argc - optind, &argv[optind], InputFile ); /* Even though we specified O_RDWR, we need to ask explicitly for * write access. The modes you give to LQT_OpenDatabase are saved * for future use, and LQT_OpenDatabase checks that you could get * the requested access if you tried, but doesn't guarantee that you * have it. This is so that multiple database writers could be * supported, although they aren't right now, and also so that you * can switch between read & write modes. */ LQT_ObtainWriteAccess(db); while ((thing = getThingToIndex(findFileDaemon))) { AddThing(db, thing); if (SignalFlag) { Error(E_WARN, "Caught signal at level %d, dumping cache", SignalFlag ); DumpCache(db, DUMP_SYNC); destroyFindFileDaemon(findFileDaemon); LQT_CloseDatabase(db); exit(1); } } destroyFindFileDaemon(findFileDaemon); #ifndef MALLOCTRACE /* don't bother recaiming storage if we're about to exit, unless we * want to check for memory leaks afterwards. */ DumpCache(db, DUMP_SYNC|DUMP_NOFREE); #else DumpCache(db, DUMP_SYNC); #endif LQT_CloseDatabase(db); #ifdef MALLOCTRACE (void) fprintf(stderr, "%s: Malloctrace: checking...\n", progname); malloc_verify(); (void) fprintf(stderr, "%s: Malloc Map\n", progname); mallocmap(); #endif #ifdef WIDINBLOCK # ifdef ASCIITRACE /* remind people again to recompile... */ Error(E_WARN, "Reminder: Compiled with -DWIDINBLOCK for debugging ****"); Error(E_WARN, "Reminder: Compiled with -DWIDINBLOCK for debugging ****"); Error(E_WARN, "Reminder: Compiled with -DWIDINBLOCK for debugging ****"); # else /* don't allow -DWIDINBLOCK without -DASCIITRACE */ Error(E_BUG, "Compiled with -DWIDINBLOCK but not -DASCIITRACE!"); syntax error; this prevents compilation here; # endif /* ASCIITRACE */ #endif return 0; } PRIVATE t_FileInfo * FileNameToFileInfoIfNoDaemon(db, FileName, thingStream) t_LQTEXT_Database *db; char *FileName; t_ThingStream *thingStream; { struct stat StatBuf; char *doc; t_FileInfo *FileInfo; /* This routine is called if we don't have a findFileDaemon. * I'd like to coalesce this with oneFileNameForDaemon() really, * but I need to do some profiling first and it's esier to profile them * this way. */ if ((doc = LQT_FindAndStatFile(db, FileName, &StatBuf)) == (char *) 0) { Error(E_WARN, "Can't find document \"%s\"", FileName); return (t_FileInfo *) 0; } if (StatBuf.st_size == 0L) { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%s empty -- not indexed", FileName ); return (t_FileInfo *) 0; } /* Allocate Structure */ FileInfo = (t_FileInfo *) emalloc("MakeFileInfo", sizeof(t_FileInfo)); /* Although not always necessary, call emalloc here so that a * FileInfo can always be deleted with LQT_DestroyFileInfo() */ FileInfo->Name = emalloc( "MakeFileInfo.Name", (unsigned)(strlen(FileName) + 1) ); (void) strcpy(FileInfo->Name, FileName); /* Other bits to set: */ FileInfo->Date = StatBuf.st_mtime; FileInfo->FileSize = StatBuf.st_size; FileInfo->Stream = 0; /* file type */ FileInfo->FilterType = LQT_GetFilterType(db, FileInfo, &StatBuf); if (FileInfo->FilterType < 0) { if (thingStream->output) { fprintf(thingStream->output, "warn\t%s unknown file type -- not indexed", FileName ); } else { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%s unknown file type -- not indexed", FileName ); } LQT_DestroyFileInfo(db, FileInfo); return (t_FileInfo *) 0; } FileInfo->FID = 0; /* unknown */ FileInfo->Date = (long) time((long *) 0); /* it's a time_t on BSD */ FileInfo->Stream = 0L; return FileInfo; } PRIVATE void oneFileNameForDaemon(db, FileName, thingStream) t_LQTEXT_Database *db; char *FileName; t_ThingStream *thingStream; { struct stat StatBuf; char *doc; int FilterType; /* This function is called by the child findFileDaemon process. * its job is merely to try and open each file in turn that will * be indexed. */ if ((doc = LQT_FindFile(db, FileName, &StatBuf)) == (char *) 0) { fprintf(thingStream->output, "warn\tCan't find document \"%s\"\n", FileName ); return; } if (StatBuf.st_size == 0L) { fprintf(thingStream->output, "warn\t%s empty -- not indexed", FileName ); return; } /* file type */ FilterType = LQT_GetFilterType(db, FileInfo, &StatBuf); if (FilterType < 0) { fprintf(thingStream->output, "warn\t%s unknown file type -- not indexed", FileName ); return; } /* OK, we have the information we need */ fprintf(thingStream->output, "add\t%d\t%s\t%s\n", FilterType, FileName, doc ); } PRIVATE int AddThing(db, Thing) t_LQTEXT_Database *db; t_Thing *Thing; { t_FileInfo *theFileInfo; if (!Thing) { Error(E_FATAL|E_BUG|E_INTERNAL, "%s: %d: AddThing: Attempt to add Null Thing", __FILE__, __LINE__ ); } theFileInfo = LQT_MakeFileInfo( db, Thing->FileName, Thing->Location, Thing->Type, Thing->StatBuf ); if (theFileInfo == (t_FileInfo *) 0) { return -1; } AddStream(db, theFileInfo); LQT_SaveFileInfo(db, theFileInfo); LQT_DestroyFileInfo(db, theFileInfo); if (SignalFlag) { return -1; } return 0; } PRIVATE int AddFile(db, Name) t_LQTEXT_Database *db; char *Name; { t_FileInfo *theFileInfo; if (!Name || !*Name) { return -1; } if ((theFileInfo = LQT_MakeFileInfo(db, Name)) == (t_FileInfo *) 0) { return -1; } AddStream(db, theFileInfo); LQT_SaveFileInfo(db, theFileInfo); LQT_DestroyFileInfo(db, theFileInfo); if (SignalFlag) { return -1; } return 0; } PRIVATE void AddStream(db, FileInfo) t_LQTEXT_Database *db; t_FileInfo *FileInfo; { /* I have to mark the last word in the block. * I do that by marking the previous word if it was in a differant block * than the current one. */ t_WordInfo *WordInfo; t_WordInfo *LastWord = 0; while (SignalFlag <= 1) { /* needs more than one signal to quit in the middle of a file */ WordInfo = LQT_ReadWordFromFileInfo( db, FileInfo, LQT_READWORD_IGNORE_COMMON ); if (WordInfo == (t_WordInfo *) 0) { break; } else { if (LastWord) { if (LastWord->WordPlace.BlockInFile != WordInfo->WordPlace.BlockInFile) { LastWord->WordPlace.Flags |= WPF_LASTINBLOCK; } AddWord(db, LastWord); } LastWord = WordInfo; } } if (SignalFlag > 1) { Error(E_WARN|E_MULTILINE, "Signal received during processing of %s", FileInfo->Name ); Error(E_WARN|E_MULTILINE|E_LASTLINE, "That and other files may be incomplete..." ); return; } if (LastWord) { /* it's the last in the file, so it is also the last in the block */ LastWord->WordPlace.Flags |= WPF_LASTINBLOCK; AddWord(db, LastWord); } if (SignalFlag <= 1 && LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) { LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG, "%d: %s: indexed, %s\n", FileInfo->FID, FileInfo->Name, LQT_GetFilterName(db, FileInfo) ); } }