package com.mindprod.compactor;
/**
* categorises HTML chars for the finite state automaton parse in order to remove excess whitespace.
*
* @author Roedy Green, Canadian Mind Products
* @version 2.8 2009-04-04 no longer correct missing entities. Just issue warning messages.
* @since 2009
*/
enum HTMLCharCategory
{
    DASH,
    IGNORE,
    BEGIN_TAG,
    END_TAG,
    NL,
    QUOTE,
    SPACE,
    TEXT;

    /**
     * Whether a carriage return is dropped instead of being treated as a line end.
     * True when the platform line separator is \r\n (Windows CrLf), where the \n
     * of each pair already marks the end of the line. Per the original author's
     * note, generated output always uses a lone \n since it is the most compact form.
     */
    private static final boolean CR_IS_IGNORED =
            System.getProperty( "line.separator" ).equals( "\r\n" );

    /**
     * Classify a single character for the whitespace-compacting state machine.
     *
     * @param c character to categorise
     *
     * @return NL for \n; IGNORE or NL for \r depending on the platform line separator;
     *         SPACE for a blank, a tab and every other control char below 32;
     *         BEGIN_TAG, END_TAG, QUOTE, DASH for &lt; &gt; " - respectively;
     *         TEXT for anything else.
     */
    static HTMLCharCategory categorise( char c )
    {
        // Line terminators come first so the range test below never sees them.
        if ( c == '\n' )
        {
            return NL;
        }
        if ( c == '\r' )
        {
            if ( CR_IS_IGNORED )
            {
                return IGNORE;
            }
            return NL;
        }
        // char is unsigned, so this covers 0..32: the remaining control chars,
        // tab included, plus the space character itself.
        if ( c <= ' ' )
        {
            return SPACE;
        }
        if ( c == '<' )
        {
            return BEGIN_TAG;
        }
        if ( c == '>' )
        {
            return END_TAG;
        }
        if ( c == '\"' )
        {
            return QUOTE;
        }
        if ( c == '-' )
        {
            return DASH;
        }
        return TEXT;
    }
}