package com.mindprod.example;
import com.mindprod.common18.EIO;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import static java.lang.System.*;
/**
 * Write/encode a String into UTF-8 encoded bytes, without using Java's built-in encoders.
 * <p>
 * Gives an 8-bit byte array. Characters in the Basic Multilingual Plane take 1 to 3 bytes; a supplementary
 * character, stored in the String as a high/low surrogate pair, takes 4 bytes.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.0 2006-02-24
 * @since 2006-02-24
 */
public final class UTF8Encoder
{
    /**
     * true if you want the TEST harness to ensure this code works.
     */
    private static final boolean DEBUGGING = false;

    /**
     * byte order mark as a character.
     */
    private static final char BOM = ( char ) 0xfeff;

    /**
     * Encode a String into UTF-8 bytes.
     * <p>
     * UTF-8 is normally encoded simply with String.getBytes or with an OutputStreamWriter, but this is roughly
     * what goes on under the hood, if you ever need to write your own encoder for some non-Java platform, or you
     * are just curious how it works.
     * <p>
     * A high surrogate (0xd800..0xdbff) immediately followed by a low surrogate (0xdc00..0xdfff) is combined into
     * one code point and written as a single 4-byte sequence. A surrogate that is not part of such a pair cannot be
     * represented in UTF-8; it is written as a single '?' byte, the same substitution the JDK's own UTF-8 encoder
     * makes.
     *
     * @param input string to be encoded with UTF-8. Must not be null.
     *
     * @return the UTF-8 encoding of input, in a byte array of exactly the encoded length.
     */
    private static byte[] encode( String input )
    {
        final int length = input.length();
        // 3 bytes per char is always enough: a BMP char needs at most 3,
        // and a surrogate pair (2 chars) needs only 4.
        final byte[] output = new byte[ length * 3 ];
        int j = 0;
        for ( int i = 0; i < length; i++ )
        {
            final char c = input.charAt( i );
            if ( c < 0x80 )
            {
                // 7 bits: 0xxxxxxx
                output[ j++ ] = ( byte ) c;
            }
            else if ( c < 0x800 )
            {
                // 11 bits: 110xxxxx 10xxxxxx
                output[ j++ ] = ( byte ) ( 0xC0 | ( c >> 6 ) );
                output[ j++ ] = ( byte ) ( 0x80 | ( c & 0x3F ) );
            }
            else if ( Character.isSurrogate( c ) )
            {
                if ( Character.isHighSurrogate( c )
                     && i + 1 < length
                     && Character.isLowSurrogate( input.charAt( i + 1 ) ) )
                {
                    // well-formed pair: consume the low surrogate too.
                    // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    final int codePoint = Character.toCodePoint( c, input.charAt( ++i ) );
                    output[ j++ ] = ( byte ) ( 0xF0 | ( codePoint >> 18 ) );
                    output[ j++ ] = ( byte ) ( 0x80 | ( ( codePoint >> 12 ) & 0x3F ) );
                    output[ j++ ] = ( byte ) ( 0x80 | ( ( codePoint >> 6 ) & 0x3F ) );
                    output[ j++ ] = ( byte ) ( 0x80 | ( codePoint & 0x3F ) );
                }
                else
                {
                    // lone surrogate: not encodable, substitute as the JDK encoder does.
                    output[ j++ ] = ( byte ) '?';
                }
            }
            else
            {
                // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                output[ j++ ] = ( byte ) ( 0xE0 | ( c >> 12 ) );
                output[ j++ ] = ( byte ) ( 0x80 | ( ( c >> 6 ) & 0x3F ) );
                output[ j++ ] = ( byte ) ( 0x80 | ( c & 0x3F ) );
            }
        }
        // trim the worst-case buffer down to the bytes actually written.
        final byte[] chopped = new byte[ j ];
        System.arraycopy( output, 0, chopped, 0, j );
        return chopped;
    }

    /**
     * TEST harness to ensure UTF8Encoder works as advertised. It compares our encoding with the JDK's, byte by
     * byte. Does nothing unless DEBUGGING is true.
     *
     * @param args not used
     *
     * @throws java.io.UnsupportedEncodingException declared for signature compatibility; the current body does
     *                                              not throw it.
     */
    public static void main( String[] args ) throws UnsupportedEncodingException
    {
        if ( DEBUGGING )
        {
            String test =
                    BOM
                    + "Hello World"
                    + "\u0080\u007f\u0080\u0100\u0921\u30b0\u4e70\uffff";
            // Every 16-bit char value once, in ascending order. The surrogate range contributes
            // lone high surrogates, exactly one well-formed pair (0xdbff followed by 0xdc00),
            // and lone low surrogates, so all three surrogate cases get exercised.
            char[] oneOfEverything = new char[ 0xffff + 1 ];
            for ( int i = 0; i <= 0xffff; i++ )
            {
                oneOfEverything[ i ] = ( char ) i;
            }
            test += new String( oneOfEverything );
            byte[] encodedByJava = test.getBytes( StandardCharsets.UTF_8 );
            byte[] encodedByUs = UTF8Encoder.encode( test );
            boolean allOk = true;
            if ( encodedByUs.length != encodedByJava.length )
            {
                out.println( "oops, different lengths" );
                allOk = false;
            }
            // compare only the common prefix so a length mismatch cannot run off the end.
            int safe = Math.min( encodedByJava.length, encodedByUs.length );
            for ( int i = 0; i < safe; i++ )
            {
                if ( encodedByUs[ i ] != encodedByJava[ i ] )
                {
                    // mask with 0xff so a negative byte shows as two hex digits, not sign-extended.
                    out.println( "oops "
                                 + encodedByJava[ i ]
                                 + "["
                                 + Integer.toHexString( encodedByJava[ i ] & 0xff )
                                 + "] "
                                 + encodedByUs[ i ]
                                 + "["
                                 + Integer.toHexString( encodedByUs[ i ] & 0xff )
                                 + "]" );
                    allOk = false;
                }
            }
            out.println( "UTF8Encoder " + ( allOk ? "worked" : "failed" ) );
        }
    }
}