package com.mindprod.example;
import com.mindprod.common18.EIO;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import static java.lang.System.*;
/**
* Decodes UTF-8 encoded bytes into a String of 16-bit chars, without using Java's built-in decoders.
*
* @author Roedy Green, Canadian Mind Products
* @version 1.0 2006-02-24
* @since 2006-02-24
*/
public final class UTF8Decoder
{
    /**
     * true if you want the TEST harness to ensure this code works.
     */
    private static final boolean DEBUGGING = false;

    /**
     * byte order mark as a character.
     */
    private static final char BOM = ( char ) 0xfeff;

    /**
     * Decode a String from UTF-8 bytes.
     * <p>
     * UTF-8 is normally decoded simply with new String( byte[], "UTF-8" ) or with an InputStreamReader but this is
     * roughly what goes on under the hood, if you ever need to write your own decoder for some non-Java platform, or
     * you are just curious how it works.
     * <p>
     * One-, two- and three-byte sequences each produce a single 16-bit char. Four-byte sequences (code points
     * U+10000 and above) produce a high/low surrogate pair. A leading byte order mark is not stripped; it is decoded
     * like any other character.
     * <p>
     * Structural errors are rejected: a continuation byte where a lead byte is expected, a lead byte in the range
     * 0xf8..0xff, a trailing byte that is not of the form 10xxxxxx, input that ends in the middle of a sequence, and
     * a four-byte sequence above U+10FFFF. Overlong encodings and three-byte encoded surrogates are not rejected;
     * they are decoded to whatever value their payload bits spell.
     *
     * @param input bytes encoded with UTF-8. Must not be null.
     *
     * @return decoded string
     *
     * @throws IllegalArgumentException if the input is structurally malformed as described above.
     */
    private static String decode( byte[] input )
    {
        // A k-byte sequence yields at most k chars (4 bytes give 2 surrogates), so input.length chars always suffice.
        final char[] output = new char[ input.length ];
        int i = 0;
        int j = 0;
        while ( i < input.length )
        {
            final int start = i;
            final int lead = input[ i++ ] & 0xff;
            if ( lead < 0x80 )
            {
                // 0xxxxxxx : plain 7-bit ASCII
                output[ j++ ] = ( char ) lead;
            }
            else if ( lead < 0xc0 )
            {
                // 10xxxxxx : a continuation byte cannot start a sequence
                throw new IllegalArgumentException( "Malformed UTF-8: unexpected continuation byte 0x"
                                                    + Integer.toHexString( lead )
                                                    + " at offset "
                                                    + start );
            }
            else if ( lead < 0xe0 )
            {
                // 110yyyyy 10xxxxxx
                final int y = lead & 0x1f;
                final int x = continuationBits( input, i++ );
                output[ j++ ] = ( char ) ( y << 6 | x );
            }
            else if ( lead < 0xf0 )
            {
                // 1110zzzz 10yyyyyy 10xxxxxx
                final int z = lead & 0x0f;
                final int y = continuationBits( input, i++ );
                final int x = continuationBits( input, i++ );
                output[ j++ ] = ( char ) ( z << 12 | y << 6 | x );
            }
            else if ( lead < 0xf8 )
            {
                // 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
                final int u = lead & 0x07;
                final int z = continuationBits( input, i++ );
                final int y = continuationBits( input, i++ );
                final int x = continuationBits( input, i++ );
                final int codePoint = u << 18 | z << 12 | y << 6 | x;
                if ( codePoint > Character.MAX_CODE_POINT )
                {
                    throw new IllegalArgumentException( "Malformed UTF-8: code point 0x"
                                                        + Integer.toHexString( codePoint )
                                                        + " at offset "
                                                        + start
                                                        + " is beyond U+10FFFF" );
                }
                // writes one char for a BMP value, a surrogate pair otherwise, and reports how many it wrote
                j += Character.toChars( codePoint, output, j );
            }
            else
            {
                // 11111xxx : never a legal lead byte
                throw new IllegalArgumentException( "Malformed UTF-8: invalid lead byte 0x"
                                                    + Integer.toHexString( lead )
                                                    + " at offset "
                                                    + start );
            }
        }
        return new String( output, 0, j );
    }

    /**
     * Fetch the six payload bits of the continuation byte at the given offset.
     *
     * @param input bytes being decoded.
     * @param index offset where a continuation byte (10xxxxxx) is required.
     *
     * @return the low six bits of that byte.
     *
     * @throws IllegalArgumentException if the input ends before index or the byte is not a continuation byte.
     */
    private static int continuationBits( byte[] input, int index )
    {
        if ( index >= input.length )
        {
            throw new IllegalArgumentException(
                    "Malformed UTF-8: input ends in the middle of a multi-byte sequence" );
        }
        final int b = input[ index ] & 0xff;
        if ( ( b & 0xc0 ) != 0x80 )
        {
            throw new IllegalArgumentException( "Malformed UTF-8: expected a continuation byte at offset "
                                                + index
                                                + " but found 0x"
                                                + Integer.toHexString( b ) );
        }
        return b & 0x3f;
    }

    /**
     * TEST harness to ensure UTF8Decoder works as advertised. It does nothing unless DEBUGGING is true.
     * <p>
     * It encodes a sample string with the JDK's UTF-8 encoder, decodes it with this class, and compares the two.
     * On a mismatch it prints the differing characters and exits with status 1.
     *
     * @param args not used
     *
     * @throws java.io.UnsupportedEncodingException declared for backward compatibility only; encoding with a
     *                                              Charset constant does not throw it.
     */
    public static void main( String[] args ) throws UnsupportedEncodingException
    {
        if ( DEBUGGING )
        {
            String test =
                    BOM
                    + "Hello World"
                    + "\u0080\u007f\u0080\u0100\u0921\u30b0\u4e70\uffff"
                    // one supplementary character, to exercise the four-byte path
                    + "\ud83d\ude00";
            char[] oneOfAlmostEverything = new char[ 0xffff + 1 ];
            for ( int i = 0; i <= 0xffff; i++ )
            {
                oneOfAlmostEverything[ i ] = ( char ) i;
            }
            // Blank out both surrogate bands so the sample contains no unpaired surrogates.
            for ( int i = 0xdc00; i <= 0xdfff; i++ )
            {
                oneOfAlmostEverything[ i ] = 0;
            }
            for ( int i = 0xd800; i <= 0xdbff; i++ )
            {
                oneOfAlmostEverything[ i ] = 0;
            }
            test += new String( oneOfAlmostEverything );
            byte[] encoded = test.getBytes( StandardCharsets.UTF_8 );
            String reconstituted = UTF8Decoder.decode( encoded );
            if ( test.equals( reconstituted ) )
            {
                out.println( "UTF8Decoder worked" );
            }
            else
            {
                out.println( "UTF8Decoder failed" );
                out.println( test );
                out.println( reconstituted );
                if ( test.length() != reconstituted.length() )
                {
                    out.println( "length mismatch: expected "
                                 + test.length()
                                 + " chars but got "
                                 + reconstituted.length() );
                }
                // Compare only the common prefix so a shorter result cannot index out of range.
                final int common = Math.min( test.length(), reconstituted.length() );
                for ( int i = 0; i < common; i++ )
                {
                    if ( reconstituted.charAt( i ) != test.charAt( i ) )
                    {
                        out.println( "oops "
                                     + test.charAt( i )
                                     + "["
                                     + Integer.toHexString( test.charAt( i ) )
                                     + "] "
                                     + reconstituted.charAt( i )
                                     + "["
                                     + Integer.toHexString( reconstituted.charAt( i ) )
                                     + "]" );
                    }
                }
                System.exit( 1 );
            }
        }
    }
}