package com.mindprod.compactor;

import static java.lang.System.err;
import java.util.regex.Pattern;

/**
 * <p/>
 * Collapses multiple spaces in HTML text, tags and comments to one.
 * <p/>
 * Trims space from start and end of line.
 * <p/>
 * Removes whitespace after <dt...><h?...><li..><td...>
 * <p/>
 * Removes whitespace before </dt></h?></li></td>
 * <p/>
 * Leaves whitespace alone in <pre>...</pre>
 * <p/>
 * Leaves whitespace alone inside "..." in tags.
 * <p/>
 * Normalises newlines to \n.
 * <p/>
 * If there is whitespace before or after a comment, or between multiple comments, it will be collapsed to a single
 * space or NL. Whitespace before or after a macro comment is never removed entirely: macro comments expand to text,
 * so that whitespace is significant.
 * <p/>
 * We emit NLs when we first see one, and avoid emitting subsequent NLs. However, we procrastinate emitting space until
 * we find the end of the space string. That way we can often eliminate the spaces altogether, replacing it with an NL.
 *
 * @author Roedy Green
 * @version 2.6 2008-02-28 optionally allow comments to be stripped out entirely. Preserve some space around
 *          configurable magic macro comments that expand into text such as <!# SSI or <!-- macro.
 *          <p/>
 *          <p/>
 *          Created by IntelliJ IDEA.
 */
enum HTMLState
    {
        IN_COMMENT/* inside <!-- ... --> we jump ahead when hit <!-- and --> .  Treat much like PRE */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case BEGIN:
                        case END:
                        case QUOTE:
                        case TEXT:
                            emit.append( nextChar );
                            return IN_COMMENT;
                        case DASH:
                            if ( lookAhead( 2 ).equals( "->" ) )
                                {
                                // cheat, process 2 extra chars without using state machine
                                charIndex += 2;
                                emit.append( "-->" );
                                // pick up where we left off as if the comment never happened.
                                return previousTextState;
                                }
                            else
                                {
                                emit.append( '-' );
                                return IN_COMMENT;
                                }
                        case IGNORE:
                            return IN_COMMENT;
                        case NL:
                            lineNumber++;
                            emit.append( '\n' );
                            return IN_COMMENT_DEAD_SPACE;
                        case SPACE:
                            return IN_COMMENT_COMPACTIBLE_SPACE;
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }},

        IN_COMMENT_COMPACTIBLE_SPACE/* inside spaces in a comment */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case BEGIN:
                        case END:
                        case QUOTE:
                        case TEXT:
                            emit.append( ' ' );
                            emit.append( nextChar );
                            return IN_COMMENT;
                        case IGNORE:
                        case SPACE:
                            return IN_COMMENT_COMPACTIBLE_SPACE;
                        case NL:
                            // we don't suppress NLs inside comments.
                            lineNumber++;
                            emit.append( '\n' );
                            return IN_COMMENT_DEAD_SPACE;
                        case DASH:
                            if ( lookAhead( 2 ).equals( "->" ) )
                                {
                                // cheat, process 2 extra chars without using state machine
                                charIndex += 2;
                                emit.append( " -->" );
                                // pick up where we left off as if the comment never happened.
                                return previousTextState;
                                }
                            else
                                {
                                emit.append( " -" );
                                return IN_COMMENT;
                                }
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }},

        IN_COMMENT_DEAD_SPACE/* inside spaces in a comment, after a newline leading on a line, will be totally deleted. */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case BEGIN:
                        case END:
                        case QUOTE:
                        case TEXT:
                            emit.append( nextChar );
                            return IN_COMMENT;
                        case DASH:
                            if ( lookAhead( 2 ).equals( "->" ) )
                                {
                                // cheat, process 2 extra chars without using state machine
                                charIndex += 2;
                                emit.append( "-->" );
                                // pick up where we left off as if the comment never happened.
                                return previousTextState;
                                }
                            else
                                {
                                emit.append( '-' );
                                return IN_COMMENT;
                                }
                        case IGNORE:
                        case NL:
                        case SPACE:
                            return IN_COMMENT_DEAD_SPACE;
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }},

        IN_DEAD_SPACE/* inside lead spaces on line of ordinary text, or after <td>....
         Whitespace that will disappear entirely */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case BEGIN:
                            tagCategory =
                                    // allow for / and >, < already parsed.
                                    TagCategory.categorise( parsePartialTag( lookAhead( LONGEST_COMPRESSIBLE_TAG +
                                                                                        2 ) ) );
                            switch ( tagCategory )
                                {
                                case COMMENT:
                                    charIndex += "!--".length();
                                    // Will be further later incremented by 1 by charIndexLoop
                                    // Record what we were doing so we can pick up where we left off after comment.

                                    if ( isMacroComment() )
                                        {
                                        // we always keep comment,  preserving any trailing whitespace.
                                        // is no lead whitespace
                                        emit.append( "<!--" );
                                        previousTextState = IN_TEXT;
                                        return IN_COMMENT;
                                        }
                                    else if ( keepComments )
                                        {
                                        emit.append( "<!--" );
                                        previousTextState = IN_DEAD_SPACE;
                                        return IN_COMMENT;
                                        }
                                    else
                                        {
                                        previousTextState = IN_DEAD_SPACE;
                                        return STRIPPING_COMMENT;
                                        }

                                case LEFT_TRIM:
                                    undoRecentWhiteSpace();// remove any space or NL emitted earlier
                                    emit.append( '<' );
                                    return IN_TAG;
                                case INVALID:
                                    err.println( "Warning: < corrected to &lt;" + where() );
                                    emit.append( "&lt;" );
                                    return IN_TEXT;
                                case PLAIN:
                                case PRE:/* don't go into IN_PRE until end of <pre> */
                                case RIGHT_TRIM:
                                    emit.append( '<' );
                                    return IN_TAG;
                                case SLASH_PRE:
                                    err.println( "Error: </pre> unbalanced" + where() );
                                    emit.append( '<' );
                                    return IN_TAG;
                                default:
                                    throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                }
                        case END:
                            // should have been entity, treat as text
                            err.println( "Warning: > corrected to &gt;" + where() );
                            emit.append( "&gt;" );
                            return IN_TEXT;
                        case DASH:
                        case TEXT:
                            emit.append( nextChar );
                            return IN_TEXT;
                        case IGNORE:
                        case NL:
                        case SPACE:
                            // we remove empty lines.
                            return IN_DEAD_SPACE;
                        case QUOTE:
                            err.println( "Warning: \" corrected to &quot;" + where() );
                            // ignore lead space
                            emit.append( "&quot;" );
                            return IN_TEXT;
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }},
        IN_PRE
                /* inside <pre>...</pre>, not counting the two tags.
          While processing lead/trail tags will be IN_TAG.
         However inside <pre>...</pre>, e.g. <i> will be treated an IN_PRE, not IN_TAG. */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case BEGIN:
                            tagCategory =
                                    // allow for / and >, < already parsed.
                                    TagCategory.categorise( parsePartialTag( lookAhead( LONGEST_COMPRESSIBLE_TAG +
                                                                                        2 ) ) );
                            emit.append( '<' );
                            switch ( tagCategory )
                                {
                                // we treat tags in side <pre>...</pre> as pre text.
                                case COMMENT:/* treat like pre */
                                case INVALID:
                                case LEFT_TRIM:
                                case PLAIN:
                                case RIGHT_TRIM:
                                    return IN_PRE;// count tag inside <pre>... </pre> an IN_PRE.
                                case PRE:
                                    err.println( "Error: <pre> unbalanced" + where() );
                                    return IN_PRE;
                                case SLASH_PRE:
                                    return IN_TAG;
                                default:
                                    throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                }
                        case DASH:
                        case END:
                        case QUOTE:
                        case SPACE:
                        case TEXT:
                            emit.append( nextChar );
                            return IN_PRE;
                        case IGNORE:
                            return IN_PRE;
                        case NL:
                            // keep empty lines in <pre
                            lineNumber++;
                            emit.append( '\n' );
                            return IN_PRE;
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }},
        IN_TAG/* inside <xxx  ...> or </xxxx ...> */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case BEGIN:
                            if ( lookAhead( 3 ).equals( "!--" ) )
                                {
                                err.println( "Error: Can't have <!-- comments inside tags" + where() );
                                // leave it alone. we can't fix it.
                                emit.append( "<" );
                                }
                            else
                                {
                                err.println( "Warning: < corrected to &lt;" + where() );
                                emit.append( "&lt;" );
                                }
                            return IN_TAG;
                        case END:
                            emit.append( '>' );
                            // tag we encountered back after previous <
                            switch ( tagCategory )
                                {
                                case COMMENT:
                                    // we already complained.
                                case LEFT_TRIM:// can't be preceding space or NL
                                case PLAIN:
                                case SLASH_PRE:
                                    return IN_TEXT;
                                case INVALID:
                                    throw new IllegalArgumentException(
                                            "program bug: Invalid tag encountered in IN_TAG state." );
                                case PRE:
                                    return IN_PRE;
                                case RIGHT_TRIM:
                                    return IN_DEAD_SPACE;

                                default:
                                    throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                }
                        case DASH:
                        case TEXT:
                            emit.append( nextChar );
                            return IN_TAG;
                        case IGNORE:
                            return IN_TAG;
                        case NL:
                            // leave NLs inside tags.
                            lineNumber++;
                            emit.append( '\n' );// ignore preceding white space.
                            return IN_TAG_DEAD_SPACE;
                        case QUOTE:
                            emit.append( '\"' );
                            return IN_TAG_QUOTE;
                        case SPACE:
                            return IN_TAG_COMPACTIBLE_SPACE;
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }},
        IN_TAG_COMPACTIBLE_SPACE/* inside spaces inside a tag, multiple spaces to be collapsed to one */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case BEGIN:
                            if ( lookAhead( 3 ).equals( "!--" ) )
                                {
                                err.println( "Error: Can't have <!-- comments inside tags" + where() );
                                // leave it alone. we can't fix it.
                                emit.append( " <" );
                                }
                            else
                                {
                                err.println( "Warning: < corrected to &lt;" + where() );
                                emit.append( " &lt;" );
                                }
                            return IN_TAG;
                        case DASH:
                        case TEXT:
                            emit.append( ' ' );// collapse all previous spaces down to one
                            emit.append( nextChar );
                            return IN_TAG;
                        case END:
                            // trailing space before > is not only collapsible, it can be discarded altogether.
                            emit.append( ">" );
                            // tag we encountered back after previous <
                            switch ( tagCategory )
                                {
                                case COMMENT:
                                case LEFT_TRIM:
                                case PLAIN:
                                case SLASH_PRE:
                                    return IN_TEXT;
                                case INVALID:
                                    throw new IllegalArgumentException(
                                            "program bug: Invalid tag encountered in IN_TAG_COMPACTABLE_SPACE state." );
                                case PRE:
                                    return IN_PRE;
                                case RIGHT_TRIM:
                                    return IN_DEAD_SPACE;
                                default:
                                    throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                }
                        case IGNORE:
                        case SPACE:
                            return IN_TAG_COMPACTIBLE_SPACE;
                        case NL:
                            // keep NLs inside tags
                            lineNumber++;
                            emit.append( '\n' );// ignore preceding white space.
                            return IN_TAG_DEAD_SPACE;
                        case QUOTE:
                            emit.append( " \"" );
                            return IN_TAG_QUOTE;
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }},
        IN_TAG_DEAD_SPACE/* inside lead spaces on line tag split over lines,
         or immediately after first <.  Any spaces following will be totally discarded. */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case BEGIN:
                            err.println( " Warning:< corrected to &lt;" + where() );
                            emit.append( "&lt;" );
                            return IN_TAG;
                        case DASH:
                        case TEXT:
                            // ignore lead space
                            emit.append( nextChar );
                            return IN_TAG;
                        case END:
                            emit.append( '>' );
                            // tag we encountered after previous <
                            switch ( tagCategory )
                                {
                                case COMMENT:
                                case LEFT_TRIM:
                                case PLAIN:
                                case SLASH_PRE:
                                    return IN_TEXT;
                                case INVALID:
                                    throw new IllegalArgumentException(
                                            "program bug: Invalid tag encountered in IN_TAG state." );
                                case PRE:
                                    return IN_PRE;
                                case RIGHT_TRIM:
                                    return IN_DEAD_SPACE;
                                default:
                                    throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                }
                        case IGNORE:
                        case NL:
                        case SPACE:
                            // remove empty lines inside tags
                            return IN_TAG_DEAD_SPACE;
                        case QUOTE:
                            emit.append( '\"' );
                            return IN_TAG_QUOTE;
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }},
        IN_TAG_QUOTE/* inside "..." */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case BEGIN:
                        case DASH:
                        case END:
                        case SPACE:// don't change at all
                        case TEXT:
                            emit.append( nextChar );
                            return IN_TAG_QUOTE;
                        case IGNORE:
                            return IN_TAG_QUOTE;
                        case NL:
                            err.println( "Warning: Quoted string spanning lines. Left as is." + where() );
                            lineNumber++;
                            emit.append( '\n' );
                            return IN_TAG_QUOTE;
                        case QUOTE:
                            emit.append( '\"' );
                            return IN_TAG;
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }},
        IN_TEXT/* inside ordinary HTML text, possibly an entity */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case BEGIN:
                            tagCategory =
                                    // allow for / and >, < already parsed.
                                    TagCategory.categorise( parsePartialTag( lookAhead( LONGEST_COMPRESSIBLE_TAG +
                                                                                        2 ) ) );
                            switch ( tagCategory )
                                {
                                /* was no white space on left */
                                case COMMENT:
                                    charIndex += "!--".length();
                                    // Will be further later incremented by 1 by charIndexLoop.
                                    // Record what we were doing so we can pick up where we left off after comment
                                    previousTextState = IN_TEXT;
                                    if ( keepComments || isMacroComment() )
                                        {
                                        emit.append( "<!--" );
                                        // for macros, preserve trailing whitespace. Is no lead whitespace.
                                        return IN_COMMENT;
                                        }
                                    else
                                        {
                                        return STRIPPING_COMMENT;
                                        }

                                case INVALID:
                                    err.println( "Warning: < corrected to &lt;" + where() );
                                    emit.append( "&lt;" );
                                    return IN_TEXT;
                                case LEFT_TRIM:// can't be previous space or NL
                                case PLAIN:
                                case PRE:
                                case RIGHT_TRIM:
                                    emit.append( '<' );
                                    return IN_TAG;
                                case SLASH_PRE:
                                    err.println( "Error: </pre> unbalanced" + where() );
                                    emit.append( '<' );
                                    return IN_TAG;
                                default:
                                    throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                }
                        case DASH:
                        case TEXT:
                            emit.append( nextChar );
                            return IN_TEXT;
                        case END:
                            err.println( "Warning: > corrected to &gt;" + where() );
                            emit.append( "&gt;" );
                            return IN_TEXT;
                        case IGNORE:
                            return IN_TEXT;
                        case NL:
                            // keep NLs in text
                            lineNumber++;
                            emit.append( '\n' );// was no preceding white space.
                            return IN_DEAD_SPACE;
                        case QUOTE:
                            err.println( "Warning: \" corrected to &quot;" + where() );
                            emit.append( "&quot;" );
                            return IN_TEXT;
                        case SPACE:
                            return IN_TEXT_COMPACTIBLE_SPACE;
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }},
        IN_TEXT_COMPACTIBLE_SPACE/* inside spaces in ordinary text */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case BEGIN:
                            tagCategory =
                                    // allow for / and >, < already parsed.
                                    TagCategory.categorise( parsePartialTag( lookAhead( LONGEST_COMPRESSIBLE_TAG +
                                                                                        2 ) ) );
                            switch ( tagCategory )
                                {
                                case COMMENT:
                                    charIndex += "!--".length();
                                    // Will be further later incremented by 1 by charIndexLoop.
                                    // Record what we were doing so we can pick up where we left off after comment.

                                    if ( isMacroComment() )
                                        {
                                        // always keep, with lead space.
                                        emit.append( " <!--" );
                                        // no pending trailing space, but preserve any trailing space.
                                        previousTextState = IN_TEXT;
                                        return IN_COMMENT;
                                        }
                                    else if ( keepComments )
                                        {
                                        // no lead space, defer
                                        emit.append( "<!--" );
                                        // for macros, defer trailing whitespace
                                        previousTextState = IN_TEXT_COMPACTIBLE_SPACE;
                                        return IN_COMMENT;
                                        }
                                    else
                                        {
                                        // strip
                                        // will force whitespace after the comment.
                                        previousTextState = IN_TEXT_COMPACTIBLE_SPACE;
                                        return STRIPPING_COMMENT;
                                        }

                                case LEFT_TRIM:
                                    undoRecentWhiteSpace();// remove any space or NL emitted earlier
                                    emit.append( '<' );
                                    return IN_TAG;
                                case INVALID:
                                    err.println( "Warning: < corrected to &lt;" + where() );
                                    emit.append( " &lt;" );
                                    return IN_TEXT;
                                case PLAIN:
                                case PRE:/* don't go into IN_PRE until end of <pre> */
                                case RIGHT_TRIM:
                                    emit.append( " <" );
                                    return IN_TAG;
                                case SLASH_PRE:
                                    err.println( "Error: </pre> unbalanced" + where() );
                                    emit.append( " <" );
                                    return IN_TAG;
                                default:
                                    throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                }
                        case DASH:
                        case TEXT:
                            emit.append( ' ' );// collapse all previous spaces down to one
                            emit.append( nextChar );
                            return IN_TEXT;
                        case END:
                            err.println( "Warning: > corrected to &gt;" + where() );
                            emit.append( " &gt;" );
                            return IN_TEXT;
                        case IGNORE:
                        case SPACE:
                            return IN_TEXT_COMPACTIBLE_SPACE;
                        case NL:
                            // keep NL in text
                            lineNumber++;
                            emit.append( '\n' );// ignore preceding white space.
                            return IN_DEAD_SPACE;
                        case QUOTE:
                            err.println( "Warning: \" corrected to &quot;" + where() );
                            emit.append( " &quot;" );
                            return IN_TEXT;
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }
                },

        STRIPPING_COMMENT/* inside <!-- ... --> of a comment that is being removed entirely; nothing is emitted until the closing --> */
                {
                HTMLState next( HTMLCharCategory category, char nextChar )
                    {
                    switch ( category )
                        {
                        case DASH:
                            // this dash plus a following "->" forms the comment terminator "-->"
                            if ( !"->".equals( lookAhead( 2 ) ) )
                                {
                                // just a dash inside the comment body, discard it like everything else
                                return STRIPPING_COMMENT;
                                }
                            // swallow the "->" directly, bypassing the state machine
                            charIndex += 2;
                            // resume in the state remembered before the comment, as if it had never been there
                            return previousTextState;
                        case BEGIN:
                        case END:
                        case IGNORE:
                        case NL:
                        case QUOTE:
                        case SPACE:
                        case TEXT:
                            // comment content: emit nothing, stay put
                            return STRIPPING_COMMENT;
                        default:
                            throw new IllegalArgumentException( "program bug: invalid category" );
                        }
                    }};

    // ------------------------------ FIELDS ------------------------------

    /**
     * true if want debugging output. When true, compactString prints one trace line per input character and main
     * runs its built-in test.
     */
    private static final boolean DEBUGGING = false;

    /**
     * normally true to keep comments.  false to strip them out. Set from the compactString argument of the same name.
     */
    private static boolean keepComments;

    /**
     * offset in big of the character being processed. It is the loop variable of compactString; a state may advance
     * it to consume extra characters without going through the state machine.
     */
    private static int charIndex;

    /**
     * line number we are processing in the output file. 1-based. Reset to 1 by compactString, incremented when a
     * newline is emitted and decremented when undoRecentWhiteSpace removes one. Reported by where().
     */
    private static int lineNumber;

    /**
     * length of the longest tag name that will compress spaces either side of it ("blockquote"). Used to size the
     * buffer in parsePartialTag.
     */
    private static final int LONGEST_COMPRESSIBLE_TAG = "blockquote".length();

    /**
     * lets us remember what we were doing before the comment so we can pick up where we left off. STRIPPING_COMMENT
     * returns this state when it reaches the closing --> of the comment.
     */
    private static HTMLState previousTextState;

    /**
     * list of lookingAt patterns of macro-style comments that expand to text, e.g. SSI, JSP or html static macro
     * comments. Matching starts just after the opening <!-- of the comment (see isMacroComment). Set from the
     * compactString varargs parameter of the same name.
     */
    private static Pattern[] macroCommentPatterns;

    /**
     * where we accumulate the compacted output. Allocated afresh by each compactString call.
     */
    private static StringBuilder emit;

    /**
     * big input string we are parsing. Set from the compactString argument of the same name.
     */
    private static String big;

    /**
     * used in error messages to indicate where the error occurred, usually the name of the file being compacted.
     * Shares its name with the where() method, which appends it to the location text.
     */
    private static String where;

    /**
     * category of the most recently encountered tag. Not reset by compactString, so it carries over between calls.
     */
    private static TagCategory tagCategory;
    // -------------------------- PUBLIC STATIC METHODS --------------------------

    /**
     * Remove excess whitespace from HTML represented by string, by feeding every character of it through the state
     * machine formed by the constants of this enum.
     * <p/>
     * The working state is held in static fields, so this method must not be used by more than one thread at a time.
     * The references to the input and to the output buffer are released before returning, even when a state throws,
     * so that neither is pinned in memory until the next call.
     *
     * @param big                  the String to compact. Must not be null.
     * @param where                used in error messages to indicate where the error occurred, usually the name of the
     *                             file being compacted.
     * @param keepComments         usually true to keep comments.  false to strip them out.
     * @param macroCommentPatterns when keepComments=false, these are a list of regex lookingAt (startsWith style)
     *                             patterns for the exceptions, i.e. comments you want to keep anyway e.g.
     *                             Pattern.compile ( "\\s*macro\\s+" ) to keep html static macro comments of the form
     *                             {@code <!-- macro ...} or Pattern.compile ("#") to keep SSI comments of the form
     *                             {@code <!--#}. These style of comments are macros, that expand into text, either
     *                             through JSP, static macros, SSI etc. Further one char of whitespace will be
     *                             preserved both before and after such a comment. With normal comments, whitespace
     *                             before or after or both collapses down to a single whitespace char, possibly before
     *                             or after the comment. Patterns are case-sensitive unless you use (?i) to turn on
     *                             case insensitivity in your pattern. A null array is treated as no patterns.
     * @return the compacted String, big itself (the same reference) if nothing changed.
     */
    @SuppressWarnings( { "SameParameterValue" } )
    public static String compactString( final String big,
                                        final String where,
                                        boolean keepComments,
                                        Pattern... macroCommentPatterns )
        {
        HTMLState.big = big;
        HTMLState.where = where;
        HTMLState.keepComments = keepComments;
        // an explicit null varargs array means "no exceptions"; normalise it so later length checks cannot NPE.
        HTMLState.macroCommentPatterns = macroCommentPatterns == null ? new Pattern[0] : macroCommentPatterns;
        final int length = big.length();
        // output can never be longer than the input by much; input length is a good initial capacity.
        emit = new StringBuilder( length );
        lineNumber = 1;
        HTMLState state = IN_DEAD_SPACE;
        previousTextState = IN_DEAD_SPACE;
        try
            {
            // charIndex is static, states may bump it to consume look-ahead characters.
            for ( charIndex = 0; charIndex < length; charIndex++ )
                {
                final char c = big.charAt( charIndex );
                final HTMLCharCategory category = HTMLCharCategory.categorise( c );
                if ( DEBUGGING )
                    {
                    System.out
                            .println( ">>>" +
                                      state.toString() +
                                      " " +
                                      tagCategory +
                                      " [" +
                                      c +
                                      "] " +
                                      category.toString() );
                    }
                state = state.next( category, c );
                }
            // we don't append a final NL.
            final String result = emit.toString();
            // returning big itself signals to caller nothing changed, and lets the duplicate be collected.
            return result.equals( big ) ? big : result;
            }
        finally
            {
            // drop the static references so the buffer and the input are not kept alive until the next call.
            emit = null;
            HTMLState.big = null;
            }
        }

    // -------------------------- STATIC METHODS --------------------------

    /**
     * Check configuration and regex patterns to decide if this is a macro style comment that must be preserved even if
     * other comments are stripped and whose lead and trail whitespace cannot be totally deleted.
     * <p/>
     * Each pattern is tried with lookingAt, i.e. it must match starting at offset charIndex + 1 of the input, but
     * need not match all the way to the end. The match is done on a region of the input rather than on a substring
     * copy, so no copy of the remaining input is made per comment.
     *
     * @return true if we are pointing at a macro-style comment that expands into  text rather than being totally
     *         ignored. false when no patterns are configured or none of them match.
     */
    private static boolean isMacroComment()
        {
        if ( macroCommentPatterns == null || macroCommentPatterns.length == 0 )
            {
            // nothing configured, every comment is an ordinary one.
            return false;
            }
        // we have only incremented past 3 of the 4 lead chars <!-- at this point,
        // so charIndex + 1 is the first char after the <!--. We have not scanned for the --> yet.
        final int start = charIndex + 1;
        // potentially look ahead all the way to the end of big.
        final int end = big.length();
        for ( Pattern pattern : macroCommentPatterns )
            {
            // default anchoring, non-transparent region bounds make this behave exactly like matching
            // against big.substring( start ), without creating that substring.
            if ( pattern.matcher( big ).region( start, end ).lookingAt() )
                {
                // matched the pattern, it was a comment we keep
                return true;
                }
            }
        // just an ordinary comment and we are stripping them.
        return false;
        }

    /**
     * Peek at input characters that have not been processed yet, without consuming them. The peek begins at the
     * character following the current one, i.e. at charIndex+1, and is clipped at the end of the input.
     *
     * @param howFar maximum number of chars wanted
     * @return between 0 and howFar chars; the empty string when nothing is available.
     */
    private static String lookAhead( int howFar )
        {
        final int from = charIndex + 1;
        final int to = Math.min( from + howFar, big.length() );
        return from < to ? big.substring( from, to ) : "";
        }

    /**
     * Extract the lower-cased tag name from the start of a candidate tag.
     * <p/>
     * Characters are taken while they are a slash, an exclamation mark, a digit or an unaccented letter a-z/A-Z.
     * The first other character (space, &gt;, tab, newline, # ...) ends the name.
     *
     * @param partialTag leading chars of the tag without the lead &lt;, possibly including trailing space or &gt;
     *                   and other junk.
     * @return the tag name in lower case, without &lt; &gt; or trailing junk e.g. dt, /dt, !--, /blockquote.
     */
    private static String parsePartialTag( final String partialTag )
        {
        // A comment opener needs no delimiter after it; whatever follows, the tag is "!--".
        if ( partialTag.startsWith( "!--" ) )
            {
            return "!--";
            }
        final int limit = partialTag.length();
        // room for the longest tag name plus a lead slash
        final StringBuilder tag = new StringBuilder( LONGEST_COMPRESSIBLE_TAG + 1 );
        int pos = 0;
        while ( pos < limit )
            {
            final char ch = partialTag.charAt( pos++ );
            final boolean isUpper = 'A' <= ch && ch <= 'Z';
            final boolean isPlain = ch == '/'
                                    || ch == '!'
                                    || ( 'a' <= ch && ch <= 'z' )
                                    || ( '0' <= ch && ch <= '9' );
            if ( !isUpper && !isPlain )
                {
                // any other char terminates the tag name
                break;
                }
            tag.append( isUpper ? Character.toLowerCase( ch ) : ch );
            }
        return tag.toString();
        }

    /**
     * Strip the run of spaces and NLs, if any, from the tail of what has been emitted so far. The line counter is
     * wound back by one for every NL taken off.
     */
    private static void undoRecentWhiteSpace()
        {
        // number of leading chars of emit that survive
        int keep = emit.length();
        while ( keep > 0 )
            {
            final char last = emit.charAt( keep - 1 );
            if ( last == '\n' )
                {
                lineNumber--;
                }
            else if ( last != ' ' )
                {
                // reached something that is not white space, stop trimming
                break;
                }
            keep--;
            }
        emit.setLength( keep );
        }

    /**
     * Describe the current processing position for use at the end of warning and error messages.
     *
     * @return text giving the current line number, the number of chars emitted so far and the caller-supplied
     *         location name, i.e. the position relative to the output file.
     */
    private static String where()
        {
        final StringBuilder location = new StringBuilder();
        location.append( " at line number: " ).append( lineNumber );
        location.append( " file offset: " ).append( emit.length() );
        location.append( " in: " ).append( where );
        return location.toString();
        }

    // -------------------------- OTHER METHODS --------------------------

    /**
     * Implemented by each enum state to find next state given character. This method is the core of the finite state
     * automaton.
     * <p/>
     * compactString calls it once for each character of the input, on whatever state the previous call returned.
     * An implementation may append to the output buffer and may advance charIndex to consume characters it has
     * looked ahead at.
     *
     * @param category Category of the next character to process, as computed by HTMLCharCategory.categorise
     * @param nextChar next character to process
     * @return next state of the automaton, i.e. the state that will be handed the following character.
     */
    abstract HTMLState next( HTMLCharCategory category,
                             char nextChar );

    // --------------------------- main() method ---------------------------

    public static void main( String[] args )
        {
        if ( DEBUGGING )
            {
            String test =
                    "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n" +
                    " <!--  leave contents -  -   -  be    --> <!-- macro foot -->  <Td class=\"brown\">  this stuff \n" +
                    "</td>  \n" +
                    "<td   \n" +
                    "    class=\"brown\">    sit <i>still   </i> <!--tight--> <!>  very still  </td> <table\n" +
                    "    class=simple > " +
                    "\n" +
                    "<div> <dt> \n" +
                    "  contents  [an error occurred while processing this directive]