Snippet : HTMLCharCategory.java

/*
 * [HTMLCharCategory.java]
 *
 * Summary: categorises HTML chars for the finite state automaton parse in order to remove excess whitespace.
 *
 * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  2.8 2009-04-04 no longer corrects missing entities. Just issues warning messages.
 */
package com.mindprod.compactor;

/**
 * Sorts each HTML character into one of a handful of categories. The categories feed the finite state
 * automaton parse whose purpose is to remove excess whitespace.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 2.8 2009-04-04 no longer corrects missing entities. Just issues warning messages.
 * @since 2009
 */
enum HTMLCharCategory
    {
        DASH,
        IGNORE,
        BEGIN_TAG,
        END_TAG,
        NL,
        QUOTE,
        SPACE,
        TEXT;

    /**
     * true when the platform line separator is CrLf. In that case a \r is expected to be followed by
     * a \n, so the \r is categorised as IGNORE; otherwise a \r is categorised as a newline.
     * Evaluated once, when the enum is initialised.
     */
    private static final boolean ignoreCr =
            System.getProperty( "line.separator" ).equals( "\r\n" );

    /**
     * Work out the category of a single character.
     * <p>
     * \n is NL. \r is IGNORE or NL depending on the platform line separator. Every other char
     * in the range 0..32 (the remaining control chars, tab and ordinary space) is SPACE.
     * &lt; &gt; " and - have categories of their own. Anything else is TEXT.
     *
     * @param c character to categorise
     *
     * @return the category of the character
     */
    static HTMLCharCategory categorise( char c )
        {
        // The two line terminators must be picked off first, because they sit
        // inside the control-char range tested below.
        if ( c == '\n' )
            {
            return NL;
            }
        if ( c == '\r' )
            {
            if ( ignoreCr )
                {
                return IGNORE;
                }
            return NL;
            }
        // char is unsigned, so this covers 0..32: control chars, tab and space.
        // Non-breaking space (160) is deliberately not included.
        if ( c <= ' ' )
            {
            return SPACE;
            }
        if ( c == '<' )
            {
            return BEGIN_TAG;
            }
        if ( c == '>' )
            {
            return END_TAG;
            }
        if ( c == '\"' )
            {
            return QUOTE;
            }
        if ( c == '-' )
            {
            return DASH;
            }
        // everything else, including punctuation, accented chars...
        return TEXT;
        }
    }