/* lqwordlist.c -- Copyright 1989, 1994, 1995 Liam R. Quin. * All Rights Reserved. * This code is NOT in the public domain. * See the file COPYRIGHT for full details. * * lqwordlist -- print sorted wordlist * * $Id: lqwordlist.c,v 1.11 2001/05/31 03:50:13 liam Exp $ */ /* If you have problems compiling this file, uncomment the following line: */ /* #define NO_REGEXP 1 */ #include "globals.h" /* defines and declarations for database filenames */ #include "error.h" #include #include #include #ifdef HAVE_STRING_H # include #else # include #endif #include /* for SEEK_SET */ #ifdef BSD # define USI_MAX ((unsigned int) -1) #else # include /* for USI_MAX, the largest unsigned integer. * 4.3 BSD doesn't seem to have this. I don't know how to get this * on BSD systems. */ #endif #ifdef HAVE_STDLIB_H # include #else # include #endif #ifndef O_RDONLY # ifdef HAVE_FCNTL_H # include /* required on some systems */ # endif # include #endif #include "fileinfo.h" #include "wordinfo.h" #include "smalldb.h" #include "pblock.h" #include "wordrules.h" #include "numbers.h" #include "emalloc.h" #include "lqutil.h" #include "liblqtext.h" /*** Declarations: ***/ /** System calls and library routines: **/ extern void exit(); /** Unix Library Functions: **/ /** functions defined within this file: */ PRIVATE void DumpMyCache( #ifdef HAVE_PROTO /* empty */ #endif ); PRIVATE void AddSort( #ifdef HAVE_PROTO int Length, char *Word, t_WID WID, unsigned long TotalCount #endif ); PRIVATE void indexmarch( #ifdef HAVE_PROTO t_LQTEXT_Database *db #endif ); #ifndef NO_REGEXP PRIVATE int WantEgrep( #ifdef HAVE_PROTO int Length, char *Word, t_WID TotalCount #endif ); #endif PRIVATE int WantSubString( #ifdef HAVE_PROTO int Length, char *Word, t_WID TotalCount #endif ); PRIVATE int WantPrefix( #ifdef HAVE_PROTO int Length, char *Word, t_WID TotalCount #endif ); PRIVATE int WantSuffix( #ifdef HAVE_PROTO int Length, char *Word, t_WID TotalCount #endif ); PRIVATE int InitCache( #ifdef HAVE_PROTO long MaxWords #endif ); static int (* WantWord)( #ifdef HAVE_PROTO int Length, char *Word, t_WID TotalCount #endif ) = WantSubString; /** Macros and variable definitions **/ /* for more speed: */ static int CanOptimise = 0; static unsigned char Opt_First = 0; char *progname = 0; /* Used for error messages */ char *Prefix = NULL; int PrefixLength = 0; static int DoingSort = 1; /* sort on by default */ static int PrintFrequency = 0; static int WantWIDS = 0; static char *Revision = "$Revision: 1.11 $"; /** end of declarations... **/ int main(argc, argv) int argc; char *argv[]; { extern int optind, getopt(); /* For getopt(3) */ extern char *optarg; /* For getopt(3) */ int ch; /* For getopt(3) */ int ErrorFlag = 0; /* For getopt(3) */ t_LQTEXT_Database *db; /* The database we'll read from */ t_lqdbOptions *Options; /* The user's configuration & preferences */ progname = argv[0]; /* I see this as a library program, so I am leaving the full * path. lqaddfile(1L) and lqphrase(1L) set progname to be * the filename of the command, rather than the full pathname. */ Options = LQT_InitFromArgv(argc, argv); /* Deal with any arguments that are understood by all lqtext * programs. */ while ((ch = getopt(argc, argv, "egnpsuVWvwxZz:")) != EOF) { switch (ch) { case 'e': WantWord = WantSuffix; break; case 'g': #ifdef NO_REGEXP Error(E_FATAL|E_USAGE, "-g not supported on this system, sorry" ); #else WantWord = WantEgrep; #endif break; case 'n': PrintFrequency = 1; break; case 'p': WantWord = WantPrefix; break; case 's': DoingSort = 1; /* the default */ break; case 'u': DoingSort = 0; break; case 'V': fprintf(stderr, "%s version %s\n", progname, Revision); break; case 'W': WantWIDS = 1; break; case 'w': Prefix = optarg; PrefixLength = strlen(Prefix); break; case 'x': ErrorFlag++; break; case '?': ErrorFlag++; break; case 'z': case 'Z': break; /* done by LQT_InitFromArgv(); */ } } if (ErrorFlag) { fprintf(stderr, "%s: options are:\n", progname); fprintf(stderr, "%s\n%s\n%s\n%s\n%s\n%s\n", "-e suffix -- only print words ending with \"suffix\"", "-p prefix -- only print words starting with \"prefix\"", "-g pat -- only print words matching egrep pattern \"pat\"", "-n -- print the number of times each word occurs", "-s -- print the words in sorted order", "-u -- print the words in unsorted order [the default]" ); LQT_PrintDefaultUsage(Options); /* LQT_PrintDefaultUsage() prints the list of the standard options. */ exit(1); } db = LQT_OpenDatabase(Options, O_RDONLY, 0); InitCache(LQT_GetMaxWID(db)); if (Prefix) { indexmarch(db); DumpMyCache(); } while (optind < argc) { Prefix = argv[optind]; PrefixLength = strlen(argv[optind]); indexmarch(db); DumpMyCache(); ++optind; } LQT_CloseDatabase(db); exit(0); /*NOTREACHED*/ return(0); /* for lint and gcc -Wall*/ } static char *DefaultCache[10]; static long MaxInCache = 10; static char **MyCache = DefaultCache; static long CacheCount = 0; PRIVATE int InitCache(MaxWords) long MaxWords; { MaxInCache = sizeof(DefaultCache[0]) / sizeof(DefaultCache); if (MaxWords < MaxInCache) { MyCache = DefaultCache; return 0; } MyCache = (char **) malloc((MaxWords + 2) * sizeof(char *)); if (!MyCache) { Error(E_FATAL|E_MEMORY, "Couldn't callocate %ld bytes of memory for wordlist", (MaxWords + 2) * sizeof(char *)); exit(1); } MaxInCache = MaxWords + 1; /* * fprintf(stderr, "Init cache %ld Max set to %ld\n", MaxWords, MaxInCache); */ return 0; } PRIVATE unsigned char * GetEntry(Name, f, n) char *Name; int f; /* Unix file descriptor */ t_WID n; { static unsigned char Buf[WIDBLOCKSIZE * 1024]; static unsigned long StartPos = 0L; static unsigned long EndPos = 0L; unsigned long Where = n * WIDBLOCKSIZE; if (Where + WIDBLOCKSIZE > EndPos + 1 || Where < StartPos) { int i; (void) LQU_Elseek(E_FATAL, Name, "Word Index File", f, Where, SEEK_SET); StartPos = Where; if ((i = read(f, (char *) Buf, sizeof Buf)) < 0) { Error(E_FATAL|E_SYS, "WIDINDEX read(fd=%d, buf, n=%d) failed", f, sizeof Buf ); } EndPos = Where + i; } return &Buf[Where - StartPos]; } PRIVATE void indexmarch(db) t_LQTEXT_Database *db; { int fd; t_WID WID; t_WID MaxWid = LQT_GetMaxWID(db); unsigned long Offset; fd = LQU_Eopen(E_FATAL, db->WidIndexFile, "Word Index File", O_RDONLY, 0); for (WID = 0; WID < MaxWid; WID++) { unsigned char *Block = GetEntry(db->WidIndexFile, fd, WID); unsigned long Len; unsigned long TotalCount; unsigned char *p; unsigned char *Word; Word = Block; /* first number is word length: */ (void) LQT_sReadNumber(&Word, &Len, Block, WIDBLOCKSIZE); p = &Word[Len]; if (CanOptimise && Opt_First != *Word) { continue; } (void) LQT_sReadNumber(&p, &Offset, Block, WIDBLOCKSIZE); /* total places */ #ifdef LQ_1_12_COMPAT (void) LQT_sReadNumber(&p, &TotalCount, Block, WIDBLOCKSIZE); #else /* p[0] is the least significant byte. What happened to PUT4/GET4? */ if (Offset != 0L) { unsigned long L; #define ui(x) ((unsigned int)(x)) L = ui(p[3] & 255); L <<= 8; L |= ui(p[2] & 255); L <<= 8; L |= ui(p[1] & 255); L <<= 8; L |= ui(p[0] & 255); TotalCount = L; p += 4; } else { (void) LQT_sReadNumber(&p, &TotalCount, Block, WIDBLOCKSIZE); } #endif if (TotalCount == 0L) { continue; /* the word was deleted */ } /* TODO: generate all occurring word forms */ if ((* WantWord)(Len, Word, TotalCount)) { AddSort((int) Len, Word, WID, TotalCount); } } if (close(fd) < 0) { Error(E_WARN, "error whilst closing file %d=\"%s\"", fd, db->WidIndexFile ); } DumpMyCache(); } PRIVATE int WantSubString(Length, Word, TotalCount) int Length; char *Word; t_WID TotalCount; { if (Length >= PrefixLength) { register char *p; for ( p = Word; p - Word <= Length && (Length - (p - Word)) >= PrefixLength; p++ ) { if (STRNCMP(p, Prefix, PrefixLength) == 0) return 1; } } return 0; /* not wanted */ } PRIVATE int WantPrefix(Length, Word, TotalCount) int Length; char *Word; t_WID TotalCount; { if (Prefix) { return (Length >= PrefixLength) && (STRNCMP(Prefix, Word, PrefixLength) == 0); } return 0; /* not wanted */ } PRIVATE int WantSuffix(Length, Word, TotalCount) int Length; char *Word; t_WID TotalCount; { /* Note: Word is not nul terminated */ if (Length >= PrefixLength) { return STRNCMP(Prefix, &Word[Length - PrefixLength], PrefixLength) == 0; } return 0; /* not wanted */ } PRIVATE void AddSort(Length, Word, WID, TotalCount) int Length; char *Word; t_WID WID; unsigned long TotalCount; { if (PrintFrequency && TotalCount == 0L) { Error(E_WARN|E_INTERNAL, "Word %*s occurs zero times, I think", Length, Word ); return; } if (!DoingSort) { fwrite(Word, Length, 1, stdout); if (WantWIDS) { printf("\t%ld", WID); } if (PrintFrequency) { printf("\t%ld", TotalCount); } putchar('\n'); return; } if (CacheCount >= MaxInCache) { DumpMyCache(); CacheCount = 1; /* including this word... */ } if ((MyCache[CacheCount] = (char *) malloc(Length + 12)) == (char *) 0) { Error(E_FATAL|E_MEMORY, "malloc for %d bytes failed", Length + 12); exit(1); } (void) strncpy(MyCache[CacheCount], Word, Length); MyCache[CacheCount][Length] = '\0'; if (PrintFrequency) { char buf[20]; (void) sprintf(buf, "\t%ld", TotalCount); (void) strcat(MyCache[CacheCount], buf); } ++CacheCount; } int CompareStringsByPointersForQsort(s1p, s2p) void *s1p; void *s2p; { return STRCMP(*(char **)s1p, *(char **)s2p); } PRIVATE void DumpMyCache() { extern void qsort(); register int i; if (!CacheCount) return; qsort( &MyCache[0], CacheCount, (int) sizeof(char *), CompareStringsByPointersForQsort ); for (i = 0; i < CacheCount; i++) { /** printf("%d\t%s\n", i, MyCache[i]); **/ (void) puts(MyCache[i]); (void) free(MyCache[i]); MyCache[i] = (char *) 0; } CacheCount = 0; } /* Porting: if this causes problems, compile with -DNO_REGEXP... */ #ifndef NO_REGEXP static char * Message(val) int val; { switch (val) { case 11: return "Range endpoint too large [???]"; case 16: return "Bad number [????]"; case 25: return "``\\digit'' out of range - max is probably 9"; case 36: return "Illegal or missing delimiter"; case 41: return "No remembered search string"; case 42: return "\\( \\) imbalance"; case 43: return "Too many \\( -- max is 9"; case 44: return "More than 2 numbers given in \\{min,max\\}"; case 45: return "} expected after \\ in \\{min,max\\}"; case 46: return "First number exceeds second in \\{min,max\\}"; case 49: return "[] imbalance; use \\[ and \\] to match brackets"; case 50: return "Regular expression too long"; default: return "[no pre-defined error message, see man 3 regexp]"; } } #include /* Henry Spenser's regex.h, renamed to avoid conflicts */ static regex_t *Expression = 0; static void e(regErrorNumber) int regErrorNumber; { Error(E_FATAL, "Regexp error %d [%s] in `\"%s\"", regErrorNumber, Message(regErrorNumber), Prefix ); } static regex_t * Compile(Pattern) char *Pattern; { extern char *compile(); register char *pp = Pattern; int i; regex_t *Result; Result = (regex_t *) emalloc(Pattern, sizeof(regex_t)); i = regcomp( Result, Pattern, REG_EXTENDED|REG_NOSUB ); if (i != 0) { Error(E_FATAL|E_MULTILINE, "pattern compilation into internal form failed:" ); Error(E_FATAL|E_LASTLINE, "\"%s\": %s", Pattern, Message(i) ); } /* precompute some optimisations */ if (*pp == '^') { ++pp; if (isalnum(*pp)) { Opt_First = *pp; pp++; if (!*pp || isalnum(*pp)) { CanOptimise = 1; } } } return Result; } PRIVATE int WantEgrep(Length, Word, TotalCount) register int Length; char *Word; t_WID TotalCount; { regmatch_t pmatch[2]; int i; if (!Expression) { Expression = Compile(Prefix); } pmatch[0].rm_so = 0; pmatch[0].rm_eo = Length; i = regexec( Expression, Word, (size_t) 0, pmatch, REG_STARTEND ); switch(i) { case REG_NOMATCH: return 0; /* not wanted */ case 0: return 1; /* yes please */ default: Error(E_FATAL, "Runtime regular expression error: %s", Message(i) ); } /*NOTREACHED*/ return 0; } #endif /* !NO_REGEXP */