/** HTML.c
 ** $Id$
 ** Copyright Liam Quin 1996; All Rights Reserved.
 **
 ** Read and index an HTML file.
 **
 ** Ignore elements,
 ** ignore attributes,
 ** and ignore comments.
 **   TODO: index ALT attributes
 **
 ** It is _not_ a goal to parse the HTML file in any useful sense,
 ** but only to identify HTML elements in order to ignore them.
 **
 ** In addition, the contents of the following elements are ignored:
 ** SCRIPT
 ** NOFRAMES
 ** STYLE
 **/

/*
 * States of the HTML-skipping scanner.
 *
 * Every state named in StateTable has to be declared here.  The first two
 * enumerators keep their original values (0 and 1); the remaining ones were
 * referenced by the table but never declared.
 */
typedef enum {
    ps_pcdata,            /* ordinary character data */
    ps_tagstart,          /* a '<' has just been read */
    ps_element,           /* inside a start or end tag */
    ps_markup,            /* inside a "<!" declaration */
    ps_markup_string,     /* inside a "..." literal */
    ps_markup_string2,    /* inside a '...' literal */
    ps_maybecomment,      /* one '-' read while in markup */
    ps_comment,           /* inside a "--" comment */
    ps_maybeendcomment,   /* one '-' read while in a comment */
    ps_entity             /* only used by the disabled (#if 0) entity rows */
} t_ParserState;

/*
 * One row of the scanner's transition table.
 *
 * The members are filled in positionally by StateTable, so their order
 * must not change.
 */
typedef struct {
    /* State in which this row applies. */
    t_ParserState currentState;

    /*
     * Input that selects this row: in StateTable this is a character
     * literal or ANY, in some rows OR'd with REJECT.
     */
    int ch;

    /* State named as the destination of this row. */
    t_ParserState newState;
} t_HTMLState;

/*
 * Transition table for the HTML-skipping scanner.
 *
 * Each row is { current state, input, new state }.  The input is either a
 * character literal or ANY, and may be OR'd with REJECT.
 *
 * NOTE(review): ANY and REJECT are not defined in the visible part of this
 * file.  Presumably ANY is a wildcard and REJECT means the character is
 * not consumed by the transition -- confirm against the table driver.
 * NOTE(review): the ANY rows are listed after the specific rows for the
 * same state, which suggests a first-match lookup -- confirm.
 */
t_HTMLState StateTable[] = {
    /* character data */
    { ps_pcdata, '<', ps_tagstart },
    /* { ps_pcdata, '&', ps_entity }, NOTDONE */

    /* immediately after '<' */
    { ps_tagstart, '!', ps_markup },
    { ps_tagstart, ' ', ps_pcdata },
    { ps_tagstart, '/', ps_element }, /* end tag */
    { ps_tagstart, ANY, ps_element },

    /* inside a "<!" declaration */
    /*
     * Without this row nothing ever leaves ps_markup, so a declaration,
     * a closed comment, or an element that contained a quoted string
     * would never get back to character data.
     */
    { ps_markup, '>', ps_pcdata },
    { ps_markup, '"', ps_markup_string },
        { ps_markup_string, '"', ps_markup },
    { ps_markup, '\'', ps_markup_string2 },
        { ps_markup_string2, '\'', ps_markup },
    { ps_markup, '-', ps_maybecomment },
        { ps_maybecomment, '-', ps_comment },
        { ps_maybecomment, ANY|REJECT, ps_markup },
    { ps_markup, ANY, ps_markup },

    /* inside a start or end tag */
    { ps_element, '>', ps_pcdata },
    { ps_element, '"', ps_markup_string },
    { ps_element, '\'', ps_markup_string2 },
        /* this means that at the end of the string we're in
         * markup mode, but that seems to be OK.
         */

#if 0
    /* entity references -- disabled together with the '&' row above */
    { ps_entity, ';', ps_pcdata },
    { ps_entity, ' ', ps_pcdata },
    { ps_entity, '<'|REJECT, ps_pcdata },
    { ps_entity, ANY, ps_entity },
#endif

    /* inside a "--" comment; the closing "--" returns to ps_markup */
    { ps_comment, '-', ps_maybeendcomment },
        { ps_maybeendcomment, '-', ps_markup },
        { ps_maybeendcomment, ANY|REJECT, ps_comment },
    { ps_comment, ANY, ps_comment },

};
