/* matchend.c -- Copyright 1993, 1994 Liam R. Quin. All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. */ /* matchend -- find the start & end bytes of a match * Liam Quin, September 1993 and later... * * $Id: matchend.c,v 1.15 2001/05/31 03:50:13 liam Exp $ */ #include "error.h" #include #include #include #include #include "globals.h" /* defines and declarations for database filenames */ #ifdef HAVE_STRING_H # include #else # include #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #include "fileinfo.h" #include "wordinfo.h" #include "wordrules.h" #include "emalloc.h" #include "addfile.h" #include "liblqtext.h" #include "lqtrace.h" #include "lqutil.h" #include "wmoffset.h" /** System calls and library routines used in this file: **/ /** System calls: **/ /** Library Functions: **/ /** Functions within this file that need declaring: **/ /** **/ /**/ /* * LQT_FindMatchEnds * Database/Retrieval, Database/Documents * *

Returns pointers to the start and end of the matched text in the * given buffer. LQT_FindMatchEnds must be called with at least one * block of data (FILEBLOCKSIZE in globals.h, usually 64 bytes) * either side of the block containing the match. * Providing more blocks before the * matched block is more likely to result in a correct return value, * as there are some special cases involving words spanning block * boundaries that are best dealt with by looking a block further * back until a block boundary is found that has a space to one side * of it, and LQT_FindMatchEnds does this.

The Buffer argument is the text from the file, with StartBlock * being a pointer to the first character in the block containing * the match. The BIF and WIB arguments are the Block In File and * Word In Block fields from the match, and the NumberOfWords argument * determines the number of words in the match, for setting the * match end pointer. * * *

a pointer to a statically allocated t_OffsetPair on success, containing * pointers to the first matched character and the last matched character; * the structure is overwritten by the next call. *

zero if the match wasn't found * * * LQT_ReadWordFromStringPointer * */ API t_OffsetPair * LQT_FindMatchEnds(db, Buffer, Length, StartBlock, BIF, WIB, NumberOfWords) t_LQTEXT_Database *db; char *Buffer; unsigned int Length; char *StartBlock; unsigned long BIF; unsigned long WIB; int NumberOfWords; { static t_OffsetPair Result; int WordsSeen = 0; char *End = &Buffer[Length]; /* Find the start and end of a match */ char *p; /* not register, we take its address */ (void) LQT_ReadWordFromStringPointer(db, (char **) 0, (char **) 0, 0, 0); p = StartBlock; /* This is complicated because if a word crosses a block * boundary, we would see just the end of it and think it * the first word in the block. * Hence, if it goes backwards, we'll skip over it. */ #define PossibleWordCharacter(c) \ (LQT_EndsWord(db, c) || LQT_StartsWord(db, (c)) || isdigit(c) || \ LQT_OnlyWithinWord(db, c)) if ( p > Buffer && !isspace(*p) && !isspace(p[-1]) && PossibleWordCharacter(*p) && PossibleWordCharacter(p[-1]) ) { /* There was a word spanning the block boundary. * Look at the previous block, and see if we can handle that more * easily: */ register int bs = LQT_FileBlockSize(db); if ( (BIF == 1) || ( (p - Buffer > bs) && ( isspace(p[ -bs ]) || isspace(p[-(bs - 1)]) || !PossibleWordCharacter(p[-bs]) || !PossibleWordCharacter(p[-(bs - 1)]) ) ) ) { char *q; t_WordInfo *WP = 0; char *Start; /* Yes, there's a word boundary at the start of the * previous block */ Start = q = &p[-bs]; /* find the start of the block */ do { if (Start >= End) { break; } WP = LQT_ReadWordFromStringPointer(db, &q, &Start, End, 0); /* 0=don't ignore common words TODO FIXME */ if (WP && WP->WordPlace.BlockInFile > 0) { break; } } while (WP && Start < p); if (WP) { p = Start; /* or q? 
*/ } #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_FINDMATCH)) { (void) fflush(stderr); (void) fflush(stdout); printf("<<1>>"); } #endif } else { char *q; t_WordInfo *WP; char *Start = p; for (q = p; q > Buffer; q--) { if (isspace(*q) /* || !PossibleWordCharacter(*q)*/) break; } do { WP = LQT_ReadWordFromStringPointer(db, &q, &Start, End, 0); } while (WP && Start < p); if (WP) { p = Start; } #ifdef ASCIITRACE if (LQT_TraceFlagsSet(LQTRACE_FINDMATCH)) { (void) fflush(stderr); (void) fflush(stdout); printf("<<1>>"); } #endif } /* initialise again */ (void) LQT_ReadWordFromStringPointer(db, (char **) 0, 0, 0, 0); } Result.Start = Result.End = (char *) 0; for (;;) { t_WordInfo *WP; if (!p || p - Buffer >= Length) { return (t_OffsetPair *) 0; } WP = LQT_ReadWordFromStringPointer( db, &p, &Result.Start, End, LQT_READWORD_IGNORE_COMMON ); if (!WP) { return (t_OffsetPair *) 0; } if (WP->WordPlace.WordInBlock >= WIB) { break; } } /* We have found the first word in the match; * now find the end of the last word: */ WordsSeen = 1; while (p && *p && WordsSeen < NumberOfWords) { t_WordInfo *WP; WP = LQT_ReadWordFromStringPointer(db, &p, (char **) 0, End, 0); if (!WP) break; ++WordsSeen; } Result.End = --p; return &Result; }