/* lqsed.c -- Copyright 1994, 1996 Liam R. E. Quin.
 * All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 *
 * lqsed -- copy a file, changing the matched text as specified
 * Liam R. E. Quin, April 1994 and later...
 *
 * $Id: lqsed.c,v 1.11 2001/05/31 03:50:13 liam Exp $
 *
 */

#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"

/* NOTE(review): the names of the system headers in this section were
 * reconstructed from the functions this file uses (isdigit, fstat, read,
 * qsort, strlen, ...); confirm them against the repository copy.
 */
#include <ctype.h>
#include <errno.h>
#include <sys/types.h> /* for fileinfo.h */
#include <sys/stat.h>

#ifndef FILE
# include <stdio.h>
#endif

#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif

#ifndef O_RDONLY
# ifdef HAVE_FCNTL_H
#  ifdef HAVE_SYSV_FCNTL_H
#   include <sys/fcntl.h>
#  endif
#  include <fcntl.h>
# endif
#endif

#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif

#ifdef HAVE_STDLIB_H
# include <stdlib.h>
#else
# include <malloc.h>
#endif

#include "range.h"
#include "fileinfo.h"
#include "wordinfo.h"
#include "wordrules.h"
#include "emalloc.h"
#include "lqutil.h"
#include "liblqtext.h"
#include "wmoffset.h"
#include "lqtrace.h"

/* One match read from the match list: where it is in the document and
 * the strings to put before and after it.  OpenString and CloseString
 * either point at the Default strings below or are owned by the item
 * (see FreeChangeList).
 */
typedef struct s_ChangeList {
    unsigned long BlockInFile;
    unsigned long WordInBlock;
    int WordCount;
    char *OpenString;
    char *CloseString;
    struct s_ChangeList *Next;
} t_ChangeList;

typedef enum {
    e_Before,
    e_After
} t_BeforeOrAfter;

/* One insertion to make while copying a document: TextToInsert goes
 * before (or after) the byte at Position.  Each match produces two of
 * these, sharing the same MatchNumber.
 */
typedef struct s_OneChangeList {
    t_BeforeOrAfter Where;
    char *Position;
    char *TextToInsert;
    long MatchNumber;
} t_OneChangeList;

static char *DefaultLeftString = ">>[";
static char *DefaultRightString = "]<<";
static long FileCount = 0;

/** Unix system calls that need declaring: **/
/** Unix/C Library Functions that need declaring: **/
/** lqtext library functions that need declaring: **/
/** Functions within this file that are used before being defined: **/

PRIVATE long fReadMatchFile(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    FILE *FileWithMatches,
    char *MatchFileName,
    char *OutputDirectory
#endif
);

PRIVATE int doOneFile(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    int LineNumber,
    t_ChangeList *ChangeList,
    char *DocumentName,
    t_FID FID,
    char *OutputDirectory
#endif
);

PRIVATE int ReadMatchFile(
#ifdef HAVE_PROTO
    t_LQTEXT_Database *db,
    char *MatchFileName,
    char *OutputDirectory
#endif
);

/** **/

/* errno is declared by <errno.h>, included above; a hand-written
 * "extern int errno" does not work where errno is a macro.
 */

static int MatchesHaveWordCount = 1;

t_Range *RangeOfMatchesToPrint = (t_Range *) 0;
t_Range *RangeOfFilesToPrint = (t_Range *) 0;

char *progname = "@(#) $Revision: 1.11 $";
    /* set from argv[] in main() */

/* Parse the command line, open the database read-only, and process each
 * match list: the -f file first, then every remaining argument, or
 * standard input if no match list was named at all.
 * Returns 0 if every match list was processed without problems, else 1.
 */
int
main(argc, argv)
    int argc;
    char *argv[];
{
    extern int optind, getopt();
    extern char *optarg; /* for getopt */
    int ch; /* for getopt */
    int ErrFlag = 0; /* see how getopt makes programs cleaner? */
    char *FileWithMatches = (char *) 0;
    int ThereWereProblems = 0;
    char *PathPrefix = "lqsedded";
    t_LQTEXT_Database *db;
    t_lqdbOptions *Options;

    progname = argv[0];

    Options = LQT_InitFromArgv(argc, argv);

    if (LQT_TraceFlagsSet(LQTRACE_DEBUG)) {
        progname = argv[0];
    } else {
        progname = strrchr(argv[0], '/');

        if (progname) {
            ++progname; /* skip the leading / */
        } else {
            progname = argv[0];
        }
    }

    /* All lq-text programs must call LQT_InitFromArgv() before getopt, and
     * must then be prepared to ignore options z with arg and Z without.
     */
    while ((ch = getopt(argc, argv, "f:l:Mo:O:P:r:Vvxz:Z")) != EOF) {
        switch (ch) {
        case 'Z':
        case 'z':
            break; /* done by LQT_InitFromArgv(); */
        case 'v':
            /* 'v' is in the option string and the usage message
             * advertises -xv, so it must not reach the default case.
             * Presumably LQT_InitFromArgv() has already acted on it,
             * since the usage code below tests LQTRACE_VERBOSE.
             */
            break;
        case 'V':
            fprintf(stderr, "%s version $Revision: 1.11 $\n", progname);
            break;
        case 'f':
            FileWithMatches = optarg;
            break;
        case 'l':
            DefaultLeftString = optarg;
            break;
        case 'M':
            MatchesHaveWordCount = 0;
            break;
        case 'o':
            RangeOfMatchesToPrint = LQU_StringToRange(optarg);
            break;
        case 'O':
            RangeOfFilesToPrint = LQU_StringToRange(optarg);
            break;
        case 'P':
            PathPrefix = optarg;
            break;
        case 'r':
            DefaultRightString = optarg;
            break;
        case 'x':
            ErrFlag = (-1);
            break;
        default:
            Error(E_WARN,
                "option -%c is not recognised by this program",
                ch
            );
            /* FALLTHROUGH */
        case '?':
            ErrFlag = 1;
        }
    }

    if (ErrFlag < 0) { /* -x or -xv was used */
        fprintf(stderr, "usage: %s [-xv] [options]\n", progname);
        fprintf(stderr, "use %s -x or -xv for more explanations.\n", progname);

        if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
            fprintf(stderr, "\
 -f file -- \"file\" contains a list of matches, one per line\n\
 -l str -- set default left string to `str' [%s]\n\
 -M -- Match format is that of lqtext 1.10 (obsolete!)\n\
 -o range - substitute only for matches falling within the given range\n\
 -O range - process only documents falling within the given range\n",
                DefaultLeftString
            );

            /* The example used to say "14-27", which contradicted both
             * the sample range and the items it then listed.
             */
            fprintf(stderr, "\
 The format of a range is:\n\
 -12,14-16,130-200,301,400 417 796,1003,1800-\n\
 where a leading -12 means to print everything up to and\n\
 including the twelth item;\n\
 a trailing 1800- means to print item 1800 and following;\n\
 14-16 means to print items 14, 15 and 16;\n\
 the other numbers standing for themselves;\n\
 commas (,) and spaces are interchangeable at pleasure.\n\n"
            );
        }

        fprintf(stderr, "\
 -P path -- prefix output files with \"path/\" [%s] (use - for stdout)\n\
 -r str -- set default right string to `str' [%s]\n",
            PathPrefix,
            DefaultRightString
        );

        if (LQT_TraceFlagsSet(LQTRACE_VERBOSE|LQTRACE_DEBUG)) {
            fputs("\
 Matches should be in the form of\n\
 NumberOfWordsInPhrase BlockNumber WordInBlock FID FileName[\\tSub]\n\
 This format is produced by lqrank, lqword -l, and lqphrase;\n\
 (the -M option indicates that NumberOfWordsInPhrase is omitted)\n\
 The optional [\\tSub] here is a tab followed by a Substitution, which is\n\
 is of the form @left@#right#, where @ and # are any non-blank characters\n\
 (they can also be the same as each other), and left and right are strings\n\
 that will be inserted before and after that particular match. If Sub\n\
 is not given, the Default Left and Right Strings (-l and -r) will be used.\n\
 Example: @[[@#]]# will turn a matched phrase into a [[matched phrase]].\n\
 You can use C-style \\-escapes, and \\e is ASCII ESC, in the Substitution.\n\
", stderr);
        }

        LQT_PrintDefaultUsage(Options);
        exit(0);
    } else if (ErrFlag > 0) {
        fprintf(stderr, "use %s -x for an explanation.\n", progname);
        exit(1);
    }

    db = LQT_OpenDatabase(Options, O_RDONLY, 0);
    if (!db) {
        Error(E_FATAL, "couldn't open database.");
        exit(1); /* in case Error() returned */
    }

    /* check that we can get at the file containing the matches, if one
     * was supplied.
     */
    if (FileWithMatches) {
        if (ReadMatchFile(db, FileWithMatches, PathPrefix) < 0) {
            ThereWereProblems = 1;
        }
    }

    if (optind < argc) {
        while (optind < argc) {
            if (ReadMatchFile(db, argv[optind], PathPrefix) < 0) {
                ThereWereProblems = 1;
            }
            ++optind;
        }
    } else if (FileWithMatches == (char *) 0) {
        if (ReadMatchFile(db, "-", PathPrefix) < 0) {
            ThereWereProblems = 1;
        }
    }

    LQT_CloseDatabase(db);

    return ThereWereProblems;
}

/* Process one named match list; the name "-" means standard input.
 * Returns 0 on success, or -1 if the list could not be opened or
 * fReadMatchFile() reported a problem.
 */
PRIVATE int
ReadMatchFile(db, MatchFileName, OutputDirectory)
    t_LQTEXT_Database *db;
    char *MatchFileName;
    char *OutputDirectory;
{
    int OK = 0;

    if (*MatchFileName == '-' && MatchFileName[1] == '\0') {
        if (fReadMatchFile(db, stdin, "standard input", OutputDirectory) < 0) {
            Error(E_WARN,
                "problem encountered reading matches from standard input"
            );
            OK = (-1);
        }
    } else {
        FILE *f = LQU_fEopen(
            E_WARN,
            MatchFileName,
            "list of matches",
            "r"
        );

        if (!f) {
            return -1;
        }

        /* Now read the file, and make an array of matches... */
        if (fReadMatchFile(db, f, MatchFileName, OutputDirectory) < 0) {
            Error(E_WARN,
                "problem encountered reading matches from file `%s'",
                MatchFileName
            );
            OK = (-1);
        }
        (void) fclose(f);
    }

    return OK;
}

/* Free every item of a change list.  The open and close strings are
 * freed only when they are not the shared default strings.
 */
PRIVATE void
FreeChangeList(ChangeList)
    t_ChangeList *ChangeList;
{
    while (ChangeList) {
        t_ChangeList *Next = ChangeList->Next;

        if (ChangeList->OpenString != DefaultLeftString) {
            efree(ChangeList->OpenString);
        }
        if (ChangeList->CloseString != DefaultRightString) {
            efree(ChangeList->CloseString);
        }
        efree((char *) ChangeList);
        ChangeList = Next;
    }
}

/* Read a run of decimal digits at *pp, then skip any white space that
 * follows; *pp is left pointing at the next field.  Returns the value
 * read, which is 0 if there were no digits.
 */
PRIVATE unsigned long
ReadNumber(pp)
    char **pp;
{
    char *p = *pp;
    unsigned long Value = 0L;

    /* the <ctype.h> functions need an unsigned char value */
    while (isdigit((unsigned char) *p)) {
        Value = Value * 10 + (unsigned long) (*p - '0');
        p++;
    }
    while (isspace((unsigned char) *p)) {
        p++;
    }

    *pp = p;
    return Value;
}

/* Read matches, one per line, from FileWithMatches.  Consecutive matches
 * with the same FID are collected into one change list, and doOneFile()
 * is called for that list when the FID changes and again at the end of
 * the input.  MatchFileName is used only in messages.
 *
 * Returns the number of lines read, or -1 if there were problems.
 */
PRIVATE long
fReadMatchFile(db, FileWithMatches, MatchFileName, OutputDirectory)
    t_LQTEXT_Database *db;
    FILE *FileWithMatches;
    char *MatchFileName;
    char *OutputDirectory;
{
    long MatchCount = 0;
    char *Line;
    int NumberOfWords = 1;
    long BlockInFile;
    char *FileName = 0;
    t_FID FID = (t_FID) -1;
    t_FID PreviousFID = (t_FID) -1;
    unsigned long WordInBlock;
    char *PreviousFileName = 0; /* our own copy of the current file's name */
    t_ChangeList *ChangeList = 0;
    t_ChangeList **ChangeListNextp = &ChangeList;
    int ThereWereProblems = 0;
    int PreviousWordInBlock = 0;
    long PreviousBlockInFile = 0L;

    if (!FileWithMatches) {
        Error(E_BUG,
            "%s; %d: match-list file (from -f \"%s\") has NULL file!",
            __FILE__, __LINE__,
            MatchFileName ?
                (*MatchFileName ? MatchFileName : "[empty]") : "[null]"
        );
        return -1; /* in case Error() returned */
    }

    while (LQU_fReadLine(
        FileWithMatches,
        &Line,
        LQUF_IGNBLANKS|LQUF_IGNSPACES|LQUF_IGNHASH|LQUF_ESCAPEOK
    ) >= 0) {
        char *p; /* not "register": its address is passed to ReadNumber */
        char *leftSub;
        char *rightSub;
        t_FileInfo *FileInfo = (t_FileInfo *) 0;

        p = Line;
        ++MatchCount;
        leftSub = (char *) 0;
        rightSub = (char *) 0;

        /* ASSERT: There are no leading or trailing spaces on the line */
        if (!*p || *p == '#') {
            continue; /* blank line */
        }

        if (*p == '{') {
            /* ignore Glue Expression terminated by }\n */
            continue;
        }

        if (MatchesHaveWordCount) {
            if (!isdigit((unsigned char) *p)) {
                Error(E_WARN,
                    "%s: %ld: Bad match format (expected digit) in %s",
                    MatchFileName,
                    MatchCount,
                    Line
                );
                /* Give up on this match list, as before, but no longer
                 * leak the matches already collected.
                 */
                FreeChangeList(ChangeList);
                if (PreviousFileName) {
                    efree(PreviousFileName);
                }
                return -1;
            }
            NumberOfWords = (int) ReadNumber(&p);
        }

        BlockInFile = (long) ReadNumber(&p); /* block in file */
        WordInBlock = ReadNumber(&p); /* Word In Block */
        FID = (t_FID) ReadNumber(&p); /* file identifier (FID) */

        /* filename */
        if (!FID && !*p) {
            Error(E_WARN,
                "%s: %ld: bad match format, neither FID nor filename: %s",
                MatchFileName,
                MatchCount,
                Line
            );
            ThereWereProblems = 1;
            continue;
        }

        FileName = p; /* file name, already null-terminated */

        if (*p) {
            register char *q;

            /* a tab, if there is one, separates the name from the Sub */
            for (q = p; *q; q++) {
                if (*q == '\t') {
                    break;
                }
            }

            if (*q == '\t') {
                if (isspace((unsigned char) q[1])) {
                    /* Only a warning: the tab stays part of FileName.
                     * The first part of the line is printed with a
                     * precision; this used to be a width, and was
                     * passed a pointer difference instead of an int.
                     */
                    Error(E_WARN,
                "%s: %ld: Unexpected space after tab in match %.*s>> <<%s",
                        MatchFileName,
                        MatchCount,
                        (int) (q - Line),
                        Line,
                        q
                    );
                } else if (!q[1]) {
                    Error(E_WARN,
                    "%s: %ld: Unexpected end of line after tab in match %s",
                        MatchFileName,
                        MatchCount,
                        Line
                    );
                    ThereWereProblems = 1;
                    continue;
                } else {
                    int Delim;

                    *q = '\0'; /* zero out the tab */
                    Delim = *++q;
                        /* we've already ascertained that it's not a \0 */
                    ++q;
                    if (!*q) {
                        Error(E_WARN,
                "%s: %ld: Unexpected end of line after left Sub delim %c",
                            MatchFileName,
                            MatchCount,
                            Delim
                        );
                        ThereWereProblems = 1;
                        continue;
                    }

                    leftSub = q;
                    while (*q && *q != Delim) {
                        q++;
                    }
                    if (*q != Delim) {
                        Error(E_WARN,
                "%s: %ld: Unexpected end of line inside left Sub after %c",
                            MatchFileName,
                            MatchCount,
                            Delim
                        );
                        ThereWereProblems = 1;
                        continue;
                    }
                    *q = '\0';
                    q++;

                    if (*q) {
                        /* look for right close */
                        Delim = (*q);
                        ++q;
                        if (!*q) {
                            Error(E_WARN,
                    "%s: %ld: Unexpected end of line in right Sub after %c",
                                MatchFileName,
                                MatchCount,
                                Delim
                            );
                            ThereWereProblems = 1;
                            continue;
                        }

                        rightSub = q;
                        while (*q && *q != Delim) {
                            q++;
                        }
                        if (!*q) {
                            Error(E_WARN,
                    "%s: %ld: Unexpected end of line in Right Sub `%s...'",
                                MatchFileName,
                                MatchCount,
                                rightSub
                            );
                            ThereWereProblems = 1;
                            continue;
                        }
                        if (q[1]) {
                            Error(E_WARN,
                            "%s: %ld: Unexpected text after right sub: %s",
                                MatchFileName,
                                MatchCount,
                                q
                            );
                            ThereWereProblems = 1;
                            continue;
                        }
                        *q = '\0';
                    }
                }
            }
        }

        /* The FID can be 0 if the filename is given;
         * the filename can be omitted if the FID is given.
         *
         * The previous line's FID and name are reused when they apply.
         * This used to go through a separate static FID that was not
         * kept in step with PreviousFID and PreviousFileName, so it
         * could return the wrong name, or one that was about to be freed.
         */
        if (!*FileName || !FID) {
            if (!FID) {
                /* filename and no FID */
                if (PreviousFileName && STREQ(FileName, PreviousFileName)) {
                    FID = PreviousFID;
                } else {
                    FID = LQT_NameToFID(db, FileName);
                }
            } else {
                /* FID and no filename */
                if (FID == PreviousFID && PreviousFileName) {
                    FileName = PreviousFileName;
                } else {
                    FileInfo = LQT_FIDToFileInfo(db, FID);
                    if (!FileInfo || !FileInfo->Name) {
                        if (FileInfo) {
                            (void) efree((char *) FileInfo);
                        }
                        ThereWereProblems = 1;
                        continue;
                    }
                    /* FileInfo is freed further down, once the name
                     * has been copied.
                     */
                    FileName = FileInfo->Name;
                }
            }
        }

        /* If the FID is different from the previous value,
         * we have either (1) just read the first line, or
         * (2) just finished reading matches for the previous file.
         * In case (1) we should just carry on: ChangeList has not been
         * allocated yet (it happens below, and this is the first match).
         */
        if (FID != PreviousFID) {
            if (ChangeList) {
                if (!RangeOfFilesToPrint || LQU_NumberWithinRange(
                    FileCount,
                    RangeOfFilesToPrint
                )) {
                    if (doOneFile(
                        db,
                        MatchCount - 1,
                        ChangeList,
                        PreviousFileName,
                        PreviousFID,
                        OutputDirectory
                    ) < 0) {
                        ThereWereProblems = 1;
                    }
                }
                FreeChangeList(ChangeList);
            }
            ChangeList = (t_ChangeList *) 0;
            ChangeListNextp = &ChangeList;
            PreviousFID = FID;
            ++FileCount;
            if (PreviousFileName) {
                efree(PreviousFileName);
            }
            PreviousFileName = (char *) 0;
        }

        if (!PreviousFileName) {
            /* keep our own copy: FileName points into Line or FileInfo */
            PreviousFileName = emalloc("file name", strlen(FileName) + 1);
            (void) strcpy(PreviousFileName, FileName);
        }

        if (FileInfo) {
            /* NOTE(review): if FileInfo->Name is allocated separately
             * from the structure it is not freed here (nor was it
             * before) -- confirm against liblqtext.
             */
            (void) efree((char *) FileInfo);
        }

        {
            t_ChangeList *PreviousNext;

            /* NOTE(review): PreviousBlockInFile and PreviousWordInBlock
             * are never updated, and ChangeListNextp only becomes null
             * after an insertion in the middle of the list, so as the
             * code stands this search is never entered and every match
             * is appended in input order.  doOneFile() sorts the
             * insertions by position anyway.  The `<=' below looks
             * inverted for an ascending list; left alone because fixing
             * the ordering would change the match numbers used by -o.
             */
            if (!ChangeListNextp ||
                BlockInFile < PreviousBlockInFile ||
                (BlockInFile == PreviousBlockInFile &&
                    WordInBlock < (unsigned long) PreviousWordInBlock)
            ) {
                /* Insert in the right place */
                for (ChangeListNextp = &ChangeList;
                    *ChangeListNextp;
                    ChangeListNextp = &(*ChangeListNextp)->Next
                ) {
                    if ((*ChangeListNextp)->BlockInFile >
                        (unsigned long) BlockInFile
                    ) {
                        break;
                    }
                    if ((*ChangeListNextp)->BlockInFile ==
                        (unsigned long) BlockInFile
                    ) {
                        if ((*ChangeListNextp)->WordInBlock <= WordInBlock) {
                            break;
                        }
                    }
                }
            }

            PreviousNext = (*ChangeListNextp);

            *ChangeListNextp = (t_ChangeList *) emalloc(
                "Match list item",
                sizeof(t_ChangeList)
            );
            (*ChangeListNextp)->WordCount = NumberOfWords;
            (*ChangeListNextp)->BlockInFile = BlockInFile;
            (*ChangeListNextp)->WordInBlock = WordInBlock;

            if (leftSub) {
                (*ChangeListNextp)->OpenString = emalloc(
                    "leftSub",
                    strlen(leftSub) + 1
                );
                (void) strcpy((*ChangeListNextp)->OpenString, leftSub);
            } else {
                (*ChangeListNextp)->OpenString = DefaultLeftString;
            }

            /* This used to test leftSub, so a match with a left
             * substitution but no right one called strlen() on a null
             * pointer.
             */
            if (rightSub) {
                (*ChangeListNextp)->CloseString = emalloc(
                    "rightSub",
                    strlen(rightSub) + 1
                );
                (void) strcpy((*ChangeListNextp)->CloseString, rightSub);
            } else {
                (*ChangeListNextp)->CloseString = DefaultRightString;
            }

            (*ChangeListNextp)->Next = (t_ChangeList *) PreviousNext;

            if (PreviousNext) {
                ChangeListNextp = (t_ChangeList **) 0;
            } else {
                ChangeListNextp = &(*ChangeListNextp)->Next;
            }
        }
    }

    /* Deal with the last file.  Use our copy of its name: FileName
     * points into the last line read, which may have been one that was
     * rejected.
     */
    if (ChangeList) {
        if (!RangeOfFilesToPrint || LQU_NumberWithinRange(
            FileCount,
            RangeOfFilesToPrint
        )) {
            if (doOneFile(
                db,
                MatchCount - 1,
                ChangeList,
                PreviousFileName,
                PreviousFID,
                OutputDirectory
            ) < 0) {
                ThereWereProblems = 1;
            }
        }
        FreeChangeList(ChangeList);
    }

    if (PreviousFileName) {
        efree(PreviousFileName);
    }

    return ThereWereProblems ? -1 : MatchCount;
}

/* the text of the document being processed by doOneFile() */
static char *theText = 0;

/* qsort() comparison for t_OneChangeList entries: ascending Position,
 * and for equal positions an e_After entry sorts before an e_Before one.
 */
PRIVATE int
CompareOneChangeEntries(e1, e2)
    t_OneChangeList *e1, *e2;
{
    /* Compare the pointers directly (both point into theText);
     * subtracting them and storing the result in an int could truncate.
     */
    if (e1->Position < e2->Position) {
        return -1;
    }
    if (e1->Position > e2->Position) {
        return 1;
    }

    if (e1->Where == e_After) {
        return (e2->Where == e_After) ? 0 : -1;
    }
    return (e2->Where == e_After) ? 1 : 0;
}

/* Copy the document DocumentName to the output, inserting each match's
 * OpenString and CloseString at the positions LQT_FindMatchEnds()
 * reports.  The output goes to OutputDirectory/DocumentName, or to
 * standard output if OutputDirectory is "-".  LineNumber is used only
 * in messages; FID is not used.
 *
 * Returns 0 on success, or -1 if the document could not be read, none
 * of the matches were found, or the output could not be written.
 */
PRIVATE int
doOneFile(db, LineNumber, ChangeList, DocumentName, FID, OutputDirectory)
    t_LQTEXT_Database *db;
    int LineNumber;
    t_ChangeList *ChangeList;
    char *DocumentName;
    t_FID FID;
    char *OutputDirectory;
{
    char *FileName;
    long theFileSizeInBytes;
    int fd;
    FILE *outputFile = (FILE *) 0;
    char *outputFileName;
    register char *p;
    t_ChangeList *NextMatch = 0;
    long NumberOfMatches = 0L;
    t_OneChangeList *ChangeArray = (t_OneChangeList *) 0;
    long theMatch;
    int Result = (-1);

    FileName = LQT_FindFile(db, DocumentName);
    if (!FileName) {
        Error(E_WARN|E_SYS, "can't find file \"%s\"", DocumentName);
        return -1;
    }

    if ((fd = LQT_UnpackAndOpen(db, FileName)) < 0) {
        return -1;
    }

    /* how big is the file? */
    {
        struct stat statBuf;

        if (fstat(fd, &statBuf) < 0) {
            /* the %s argument used to be statBuf itself */
            Error(E_SYS|E_WARN,
                "Couldn't fstat(%d, %s) to get the file size",
                fd,
                FileName
            );
            (void) close(fd);
            return -1;
        }
        theFileSizeInBytes = statBuf.st_size;
    }

    /* read the file into memory */
    theText = emalloc(
        FileName,
        theFileSizeInBytes + 1
    );

    {
        long tmp = read(fd, theText, theFileSizeInBytes);

        if (tmp != theFileSizeInBytes) {
            Error(E_SYS|E_WARN,
                "Expected %ld bytes from \"%s\" but got %ld instead",
                theFileSizeInBytes,
                FileName,
                tmp
            );
            if (tmp <= 0) {
                (void) close(fd);
                goto done;
            }
            theFileSizeInBytes = tmp;
        }
    }
    (void) close(fd);

    /* the extra byte allocated above holds a terminator */
    theText[theFileSizeInBytes] = '\0';

    /* count the matches */
    for (NextMatch = ChangeList; NextMatch; NextMatch = NextMatch->Next) {
        ++NumberOfMatches;
    }

    if (NumberOfMatches == 0) {
        Error(E_WARN|E_INTERNAL,
            "File %s: no matches for this file were counted -- skipped",
            DocumentName
        );
        goto done;
    }

    /* allocate an array of Changes */
    ChangeArray = (t_OneChangeList *) emalloc(
        "Change Array",
        sizeof(t_OneChangeList) * (NumberOfMatches + 1) * 2
    );
    theMatch = 0;

    /* find all the matches */
    for (NextMatch = ChangeList; NextMatch; NextMatch = NextMatch->Next) {
        t_OffsetPair *OffsetPair = 0;
        unsigned long BlockStart =
            NextMatch->BlockInFile * (unsigned long) LQT_FileBlockSize(db);

        /* A block that starts beyond the end of the text (the file has
         * shrunk since it was indexed, say) cannot contain the match,
         * and a pointer to it must not be formed.
         */
        if (BlockStart < (unsigned long) theFileSizeInBytes) {
            OffsetPair = LQT_FindMatchEnds(
                db,
                theText,
                theFileSizeInBytes,
                &theText[BlockStart],
                NextMatch->BlockInFile,
                NextMatch->WordInBlock,
                NextMatch->WordCount
            );
        }

        if (!OffsetPair) {
            Error(E_WARN,
                "Match %d (block %lu, word %lu, in %s) not found",
                LineNumber,
                NextMatch->BlockInFile,
                NextMatch->WordInBlock,
                DocumentName
            );
            --NumberOfMatches;
        } else {
            /* save the values!
             * NOTE(review): the close string is also inserted e_Before
             * its position, which presumes OffsetPair->End points just
             * past the match -- confirm against LQT_FindMatchEnds().
             * Whether OffsetPair must be freed is not visible here; it
             * is left alone, as before.
             */
            ChangeArray[theMatch].Where = e_Before;
            ChangeArray[theMatch].Position = OffsetPair->Start;
            ChangeArray[theMatch].TextToInsert = NextMatch->OpenString;
            ChangeArray[theMatch].MatchNumber = (theMatch + 2) / 2;
            theMatch++;

            ChangeArray[theMatch].Where = e_Before;
            ChangeArray[theMatch].Position = OffsetPair->End;
            ChangeArray[theMatch].TextToInsert = NextMatch->CloseString;
            ChangeArray[theMatch].MatchNumber = (theMatch + 1) / 2;
            theMatch++;
        }
    }

    if (!NumberOfMatches) {
        Error(E_WARN,
            "File %s had no matches that could be found (files changed?)",
            DocumentName
        );
        goto done;
    }

    /* Now deal with half-matches: */
    NumberOfMatches *= 2;

    /* sort the matches */
    qsort(
        (char *) ChangeArray,
        NumberOfMatches,
        sizeof(t_OneChangeList),
        CompareOneChangeEntries
    );

    /* create the output file */
    if (STREQ(OutputDirectory, "-")) {
        outputFileName = "standard output";
        outputFile = stdout;
    } else {
        /* NOTE(review): if LQU_joinstr3() returns allocated storage it
         * is never freed (nor was it before) -- confirm against lqutil.
         */
        if (*DocumentName == '/') {
            outputFileName = LQU_joinstr3(OutputDirectory, "", DocumentName);
        } else {
            outputFileName = LQU_joinstr3(OutputDirectory, "/", DocumentName);
        }
        outputFile = LQU_fEopen(E_WARN,
            outputFileName,
            "with matches marked",
            "w"
        );
    }

    if (!outputFile) {
        Error(E_WARN,
            "%s: couldn't create output file `%s' -- skipped",
            DocumentName,
            outputFileName
        );
        goto done;
    }

    LQT_Trace(LQTRACE_VERBOSE|LQTRACE_DEBUG,
        "processing %s to %s",
        DocumentName,
        outputFileName
    );

    /* copy the output file, doing the substitutions */
    theMatch = 0;
    for (p = theText; p - theText < theFileSizeInBytes; p++) {
        if (theMatch >= NumberOfMatches ||
            p != ChangeArray[theMatch].Position
        ) {
            putc(*p, outputFile);
        } else {
            int DonePutc = 0;

            /* emit every insertion that belongs at this byte */
            while (p == ChangeArray[theMatch].Position) {
                if (!DonePutc && ChangeArray[theMatch].Where == e_After) {
                    /* append the string after the character */
                    putc(*p, outputFile);
                    DonePutc = 1;
                }

                if (!RangeOfMatchesToPrint || LQU_NumberWithinRange(
                    ChangeArray[theMatch].MatchNumber,
                    RangeOfMatchesToPrint
                )) {
                    if (ChangeArray[theMatch].TextToInsert) {
                        (void) fputs(
                            ChangeArray[theMatch].TextToInsert,
                            outputFile
                        );
                    }
                }

                if (!DonePutc && ChangeArray[theMatch].Where == e_Before) {
                    /* The second test is always false inside this
                     * loop, so the byte is written here only after the
                     * last insertion of all; otherwise it is written
                     * after the loop.
                     */
                    if (theMatch >= NumberOfMatches - 1 ||
                        ChangeArray[theMatch].Position != p
                    ) {
                        putc(*p, outputFile);
                        DonePutc = 1;
                    }
                }

                ++theMatch;
                if (theMatch >= NumberOfMatches) {
                    break;
                }
            }

            if (!DonePutc) {
                if (putc(*p, outputFile) == EOF) {
                    goto writeError;
                }
            }
        }
    }

    /* Insertions positioned exactly at the end of the text (a match
     * that ends at the last byte of the file) are never reached by the
     * loop above, so emit them here.
     */
    while (theMatch < NumberOfMatches &&
        ChangeArray[theMatch].Position == theText + theFileSizeInBytes
    ) {
        if (!RangeOfMatchesToPrint || LQU_NumberWithinRange(
            ChangeArray[theMatch].MatchNumber,
            RangeOfMatchesToPrint
        )) {
            if (ChangeArray[theMatch].TextToInsert) {
                (void) fputs(ChangeArray[theMatch].TextToInsert, outputFile);
            }
        }
        ++theMatch;
    }

    /* catch any write error that the unchecked calls above ran into */
    if (ferror(outputFile)) {
        goto writeError;
    }

    Result = 0;
    goto done;

writeError:
    Error(E_WARN|E_SYS,
        "%s: problem writing to output file \"%s\"",
        DocumentName,
        outputFileName
    );
    /* the output file used to be left open on this path */
    if (outputFile != stdout) {
        (void) fclose(outputFile);
    }
    outputFile = (FILE *) 0;
    Result = (-1);

done:
    if (ChangeArray) {
        efree((char *) ChangeArray);
    }

    if (outputFile && outputFile != stdout) {
        if (fclose(outputFile) != 0) {
            Error(E_WARN|E_SYS,
                "Problem closing output file \"%s\"",
                outputFileName
            );
            /* the output may be incomplete, so report the failure */
            Result = (-1);
        }
    }

    if (theText) {
        (void) efree(theText);
        theText = (char *) 0;
    }

    return Result;
}