#! /bin/sh

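# Outline of the steps below, in the order in which they are carried out:
#
# [1] where are the data files you want to index
#
# [2] is there a pattern to use for document titles?
#
# [3] where do you want to put the index
#
# [4] do you want to run some tests afterwards?
#
# [5] do you want to edit the readme file?
#
# [6] run lqaddfile
#
# [7] optional: run tests (they are appended to the generated do-index script)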


# determine how to do echo with this shell:
# N and C are wrapped round prompt strings, as in  echo $N "text: $C"
# so that no newline follows the prompt: a shell whose echo understands
# -n gets N=-n, any other gets the \c escape at the end of the string.
N=;C='\c'; if test x"`echo -n hi`" = x"hi"; then N='-n'; C=; fi; export N C

CMDNAME=`basename "$0"`
export CMDNAME

# Scratch files.  Use mktemp where there is one, so that the names can
# not be guessed (and created in advance) by another user of /tmp;
# fall back to names based on the process ID where mktemp is missing.
tmp=`mktemp /tmp/lqmi.XXXXXX 2>/dev/null` || tmp=/tmp/lqmi.$$
tmp2=`mktemp /tmp/lqmi2.XXXXXX 2>/dev/null` || tmp2=/tmp/lqmi2.$$

# Clean up and say goodbye once, however the script ends: the signal
# trap only calls exit, and exit in turn runs the trap for 0.
# Signal 15 (SIGTERM) is included so that a plain kill also removes
# the scratch files.
trap '/bin/rm -f "$tmp" "$tmp2"; echo bye!' 0
trap 'exit 1' 1 2 3 12 15

# Introductory text shown before the first question:
cat << boy
      We will make a text retrieval index to all the files
      in a directory that you will choose.  Files in subdirectories
      will also be indexed.

      You will be asked for the full path of the directory to index --
      files in that directory and any subdirectories will be indexed too.

      You'll then be asked where to put the index.

      You will be given a chance to edit the configuration file, and
      also to say whether tests should be run or not, and then you
      can leave the program running.

boy

# [1] where are the data files you want to index
#
# Sets and exports:
#   dataDir - absolute path of the directory whose files get indexed
#   Pattern - find(1) predicates selecting those files.  The value is
#             expanded through eval (and pasted into the generated
#             do-index script), which is why the user's file name
#             pattern is stored inside single quotes.

dataDir=;
export dataDir
Pattern="-type f"
export Pattern

# sed script that rewrites each ' as '\'' so that the text can be put
# between single quotes and still survive a pass through eval:
quoteFix="s/'/'\\\\''/g"

OK=no
while test $OK = no
do
    echo "[1] Enter the full name of a directory you want to index,"
    echo $N ": $C"
    read ans || {
	# End of input: no answer will ever arrive, so stop here
	# instead of repeating the question for ever.
	echo "$CMDNAME: unexpected end of input" 1>&2
	exit 1
    }

    # check the answer
    case "$ans" in
    .)  dataDir=`pwd`
	;;
    /*) dataDir="$ans"
	;;
    *)  echo "** Please enter an absolute path, starting with a /"
	continue
	;;
    esac

    if test ! -d "$dataDir/."
    then
	echo "** Please enter the name of a directory that already exists!"
	echo "** Not a directory: $dataDir"
	continue
    fi

    Pattern="-type f"
    echo "If you like, you can enter a pattern such as *.txt to restrict"
    echo "the index to files that match the pattern"
    echo $N "Enter a pattern, or press return to index all files: $C"
    read ans
    case "$ans" in
    ?*) echo "** (index files matching $ans)"
	# a here-document is used rather than echo, so that a pattern
	# containing \ or starting with - reaches sed unchanged:
	quoted=`sed -e "$quoteFix" << boy
$ans
boy
`
	Pattern="-type f -name '$quoted'"
	;;
    *)  echo "** (index all files beneath $dataDir)"
	;;
    esac
    echo $N "   OK? [yes/no]: $C"
    read ans
    case "$ans" in
    y*) OK=yes
	;;
    *)  OK=no
	;;
    esac

    echo

done

# [2] is there a pattern to use for document titles?
#
# Sets and exports TitlePattern: a sed regular expression, with every /
# escaped and with a \(...\) group round the part to be used as the
# title, or the empty string when file names are to serve as titles.
# The user may try the pattern out on some of the files chosen in
# step [1] before accepting it.

TitlePattern=
export TitlePattern

OK=no
while test $OK = no
do
    # forget any pattern that was entered, and then rejected, on an
    # earlier trip round this loop:
    TitlePattern=

    echo "If there is a pattern we can use with Unix sed to find"
    echo "a title for each file, we can do that.  The titles will be"
    echo "displayed automatically when documents are retrieved."
    echo
    echo 'You can surround part of the pattern with \(...\) to use only'
    echo "that part as the title of each document."
    echo "The first line that matches the pattern will be used."
    echo
    echo "Or, just press return to use the filename as the title."
    echo "(you can edit the config files later to change this)"
    echo 'Example (mail, news): ^Subject: *\(.*\)$'
    echo 'Example (HTML): ^<[tT][iI][tT][lL][eE]>\(.*\)</'
    echo $N "   pattern: $C"
    # use awk rather than read, because read throws away \ characters:
    awk '{print; exit}' > "$tmp"
    ans=`cat "$tmp"`
    case "$ans" in
    "")
	echo "** Use file names as document titles"
	;;
    *?)
	echo "** look for $ans in each file"
	# put \(...\) round the pattern if not there, and also
	# escape any / characters.  The last two commands move a
	# leading ^ and a trailing $ back outside the added group.
	cat > "$tmp2" << 'sedscript'
s@/@\\&@g
/\\[(].*\\[)]/!s/^.*$/\\(&\\)/
s/\\[(]^/^\\(/
s/[$]\\)/\\)$/
sedscript
	TitlePattern=`sed -f "$tmp2" "$tmp"`

	echo "  (using sed -e '/${TitlePattern}/s//\\1/p')"
	echo "If you would like to try out the pattern, enter the number"
	echo "of titles to be generated.  Press return for none."
	echo $N "   Number of samples? [all,1,2,12,etc.]: $C"
	read ans
	if test x"$ans" = x"all"
	then
	    ans=9999999
	fi
	case "$ans" in
	""|0|*[!0-9]*)
	    # nothing, zero, or something that is not a plain number
	    echo "(no samples generated)"
	    ;;
	*)
	    # subshell, so that the cd does not affect the rest of
	    # the script
	    (
		cd "$dataDir" || exit 1
		# $Pattern holds quoted find arguments, hence the eval;
		# sed "Nq" passes on only the first N file names.
		eval "find . $Pattern -print" | sed "${ans}q" | while read f
		do
		    echo $N "$f$C"
		    T=`sed -n "/$TitlePattern/{
			s//\\1/
			p
			q
		    }" "$f"`
		    case "$T" in
		    "")  echo "[pattern not matched]" ;;
		    *)   echo " [$T]" ;;
		    esac
		done
	    )
	    ;;
	esac
	;;
    esac

    echo $N "   OK? [yes/no]: $C"
    read ans || {
	# End of input: no answer will ever arrive, so stop here
	# instead of repeating the questions for ever.
	echo "$CMDNAME: unexpected end of input" 1>&2
	exit 1
    }
    case "$ans" in
    y*) OK=yes
	;;
    *)  OK=no
	;;
    esac

    echo

done

# [3] where do you want to put the index
#
# Sets and exports LQTEXTDIR, the directory that will hold the index:
#   - an existing directory whose README mentions docpath is used as
#     it is (it looks like an lq-text index already);
#   - for any other existing directory, a subdirectory of it called
#     LQTEXTDIR is used;
#   - a directory that does not exist yet is accepted as long as its
#     parent directory exists.
# The answers du, ls and df run the corresponding command and then
# ask the question again.

LQTEXTDIR=;
export LQTEXTDIR

OK=no
while test $OK = no
do
    echo "You should now choose a directory to contain the index."
    echo "If the directory you give already exists, LQTEXTDIR will be"
    echo "created in that directory."
    echo "You can also enter ls to see all the files that will be"
    echo "indexed, or du to run the du command in the data directory,"
    echo "or df to run the df command."
    echo $N "   [Full path/du/ls/df] $C"
    read ans || {
	# End of input: no answer will ever arrive, so stop here
	# instead of repeating the question for ever.
	echo "$CMDNAME: unexpected end of input" 1>&2
	exit 1
    }
    case "$ans" in
    du)
	(
	    cd "$dataDir" || exit 1
	    du
	)
	continue
	;;
    ls)
	# this would be better if it could use xargs, but I want portability.
	# ls -s | rs would be good, too.
	# find runs one ls per file itself, so that file names never
	# pass through a shell; the sed strips the leading ./ from
	# each name.  $Pattern holds quoted find arguments, hence eval.
	(
	    cd "$dataDir" || exit 1
	    eval "find . $Pattern -exec ls -l {} \\;" |
		sed -e 's@ \./@ @'
	)
	continue
	;;
    df)
	df
	continue
	;;
    /*)
	LQTEXTDIR="$ans"
	if test -d "$LQTEXTDIR/."
	then
	    # only the exit status of grep is wanted; the output is
	    # thrown away because -s does not silence matched lines
	    # in every version of grep
	    if test -f "$LQTEXTDIR/README" &&
		grep -i docpath "$LQTEXTDIR/README" > /dev/null 2>&1
	    then # it is already an lq-text dir
		echo "** Note: $LQTEXTDIR may already contain an index..."
	    else
		echo "** (directory is already there; adding LQTEXTDIR)"
		LQTEXTDIR="$LQTEXTDIR/LQTEXTDIR"
		if test -d "$LQTEXTDIR/."
		then
		    echo "** Note: $LQTEXTDIR may already contain an index..."
		fi
	    fi
	else
	    # not there
	    # avoid non-portable dirname command:
	    parent=`echo "$LQTEXTDIR" | sed -e 's@/[^/]*$@@'`
	    if test ! -d "$parent/."
	    then
		echo "** Not a directory: $parent"
		continue
	    fi
	fi

	echo "   Store the index in directory: $LQTEXTDIR"
	echo $N "   OK? [yes/no]: $C"
	read ans
	case "$ans" in
	y*) OK=yes
	    ;;
	*)  OK=no
	    ;;
	esac
	;;
    *)
	echo "** Please enter a full path, starting with /, or a command"
	;;
    esac

    echo

done


# [4] do you want to run some tests afterwards?
#
# Sets and exports doTests: "yes" when the answer starts with a y,
# "no" for every other reply (including just pressing return).
# Whatever the answer is, it is accepted, so the question is put
# only once.
doTests=no
export doTests
OK=no
until test $OK = yes
do
    echo "After making the index, you can have some tests run"
    echo "to check that it was OK.  This may take some time."
    echo $N "   Run tests? [yes/no] $C"
    read ans
    doTests=no
    case "$ans" in
    y*) doTests=yes ;;
    esac
    OK=yes
done

# [5] do you want to edit the readme file?
#
# Sets and exports editConfig: a command line, meant to be run through
# eval, that opens README and do-index from $LQTEXTDIR in the user's
# editor ($VISUAL, else $EDITOR, else vi).  It is left empty when the
# user does not ask to edit the files.
editConfig=
export editConfig
OK=no
while test $OK = no
do
    echo "If you want, you can edit the two database configuration files"
    echo "before the index is made.  You do not have to change anything"
    echo "unless you want to."
    echo $N "   edit config files? [yes/no] $C"
    read ans
    case "$ans" in
    y*) OK=yes
	# :- rather than - so that an empty VISUAL or EDITOR falls
	# through to the next choice.  The file names are stored as
	# quoted ${LQTEXTDIR} references for the eval to expand, so
	# that a directory name containing spaces stays one argument.
	editConfig="${VISUAL:-${EDITOR:-vi}} \"\${LQTEXTDIR}/README\" \"\${LQTEXTDIR}/do-index\""
	;;
    *)  OK=yes
	editConfig=;
	;;
    esac
done

# Create the index directory if need be, then write the two files
# README (the database configuration) and stoplist.txt into it.
# The here-documents are unquoted on purpose: $dataDir, $LQTEXTDIR,
# $CMDNAME and `date` are filled in now, as the files are written.
# The directory name is quoted wherever it is used, so that a name
# containing spaces works.

echo "** generating configuration file, please wait"

if test ! -d "$LQTEXTDIR"
then
    mkdir "$LQTEXTDIR" || {
	echo "$CMDNAME: sorry, unable to make directory $LQTEXTDIR" 1>&2
	exit 1
    }
fi

cat > "$LQTEXTDIR/README" << boy
# Configuration file for an lq-text database
# generated automatically by $CMDNAME on `date`
# Data directory: $dataDir

# If you want to add more directories later, put them on the
# next line with a : between them, e.g.
# $dataDir:${HOME-/usr/doc}/etexts
docpath $dataDir

# By default, every word in the input is indexed; if you
# decide you don't want to index and, the, if, but, etc.,
# you can put the words you want ignored into the file
# $LQTEXTDIR/stoplist.txt
# one per line.  See the comments in that file.
common stoplist.txt

# Words of only one letter (I, a) are not indexed by default:
minwordlength 2

# Words are truncated if they are longer than this.  This saves a small
# amount of space in the index, and gives a minor speedup.  The maximum
# value depends on how lq-text was compiled, but is normally 20.
maxwordlength 18

# Words consisting entirely of numbers, e.g 1995, can be ignored, or
# you can include them in the index: set indexnumbers to "on" or "off":
indexnumbers off

# The wordlist contains one copy of each distinct word in the index.
# If you are really tight on space, you can set this to off.
# The default is to set it to on.
wordlist on

end
# anything after here is ignored.

boy

cat > "$LQTEXTDIR/stoplist.txt" << boy
# This file can contain words, one per line, that should not be
# put into the index.
# The words must be in lower case.
boy

# Write the do-index script, which builds the index when it is run.
#
# The here-documents are unquoted, so $LQTEXTDIR, $dataDir, $Pattern
# and $TitlePattern are filled in now, while the script is generated.
# Anything that must be evaluated later, when do-index itself runs,
# is written with a backslash in front: \$0, \`date\`, \$filename,
# \$FID and \$T.
#
# The title generation part is written out commented out (PREFIX is
# "## ") when no title pattern was given.  The test part is added
# only when $doTests is yes.

cat > "$LQTEXTDIR/do-index" << boy
#! /bin/sh

LQTEXTDIR="$LQTEXTDIR"
export LQTEXTDIR

cd "$dataDir" || {
    echo "\$0: could not find data directory!" 1>&2
    exit 1
}

# remove existing index, if any:
lqclean -f

find * $Pattern -print | lqaddfile -f - -w300000

# generate the titles

# The titles file should have
# a number, a tab, then a title, for every document, and must
# be sorted in increasing numerical order.

boy

PREFIX=""

if test -z "$TitlePattern"
then
    echo "# If you want to generate document titles,"
    echo "# remove the ## from the following lines, and edit the pattern,"
    echo "# or write a replacement script."

    PREFIX="## "
fi >> "$LQTEXTDIR/do-index"


# In the generated loop, the title found by sed is written next to the
# document number; the file name is used only when sed found nothing.
cat >> "$LQTEXTDIR/do-index" << boy
${PREFIX}echo "# titles generated automatically on \`date\`" > "$LQTEXTDIR/titles"
${PREFIX}
${PREFIX}lqfile -al |
${PREFIX}while read FID type month day time year filename
${PREFIX}do
${PREFIX}    T=\`sed -n "/$TitlePattern/{
${PREFIX}	s//\\1/
${PREFIX}	p
${PREFIX}	q
${PREFIX}    }" "\$filename"\`
${PREFIX}    case "\$T" in
${PREFIX}    "") echo "\$FID	\$filename" ;;
${PREFIX}    *)  echo "\$FID	\$T" ;;
${PREFIX}    esac
${PREFIX}done >> "$LQTEXTDIR/titles"

# end of (optional) title generation.
boy

if test "$doTests" = yes
then
    cat >> "$LQTEXTDIR/do-index" << boy


# some simple tests

echo "**** Running tests"

# the following fetches every word, to see if there was any problem
# writing the index:
lqword -Al > /dev/null

boy

fi # endif $doTests
chmod +x "$LQTEXTDIR/do-index"


# edit the config files:
# $editConfig is either empty or a complete editor command line.  It
# is handed to eval as one quoted string, so that any quoting inside
# it is interpreted once, by the eval.

if test -n "$editConfig"
then
    echo $N "** press return to edit the configuration files...$C"
    read ans
    eval "$editConfig"
fi


# [6] run lqaddfile
#
# An answer starting with y runs do-index and waits for it, bg starts
# it in the background, and anything else only says how to run it
# later.  do-index is not started unless the cd into the index
# directory worked.

echo $N "Go ahead and run the index? [yes/no/bg] $C"
read ans
case "$ans" in
y*) cd "$LQTEXTDIR" || exit 1
    ./do-index
    ;;
bg) cd "$LQTEXTDIR" || exit 1
    ./do-index &
    ;;
*)
    echo "You can run the index at any time later by"
    echo "cd $LQTEXTDIR"
    echo "./do-index"
    ;;
esac

