Snippet : UTF8Encoder.java

/*
 * [UTF8Encoder.java]
 *
 * Summary: write/encode String into UTF-8 encoded bytes, without using Java's built-in encoders.
 *
 * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2006-02-24
 */
// UTF8Encoder
package com.mindprod.example;

import com.mindprod.common18.EIO;

import java.io.UnsupportedEncodingException;

import static java.lang.System.*;

/**
 * write/encode String into UTF-8 encoded bytes, without using Java's built-in encoders.
 * <p/>
 * Gives an 8-bit byte array.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.0 2006-02-24
 * @since 2006-02-24
 */
public final class UTF8Encoder
    {
    /**
     * true if you want the TEST harness in main to check this code against Java's built-in encoder.
     */
    private static final boolean DEBUGGING = false;

    /**
     * byte order mark as a character.
     */
    private static final char BOM = ( char ) 0xfeff;

    /**
     * encode a String into UTF-8 bytes.
     * <p>
     * UTF-8 is normally encoded simply with String.getBytes( "UTF-8") or with an OutputStreamWriter but this is roughly
     * what goes on under the hood, if you ever need to write your own encoder for some non-Java platform, or you are
     * just curious how it works.
     * <p>
     * Encoding rules, by code point:
     * <ul>
     * <li>0x0000..0x007f : 1 byte</li>
     * <li>0x0080..0x07ff : 2 bytes</li>
     * <li>0x0800..0xffff (excluding surrogates) : 3 bytes</li>
     * <li>0x10000..0x10ffff, stored in the String as a high (0xd800..0xdbff) surrogate followed by a
     * low (0xdc00..0xdfff) surrogate : 4 bytes</li>
     * </ul>
     * A surrogate that is not part of a well-formed high/low pair has no legal UTF-8 representation.
     * It is replaced by a single '?' byte, which is the same thing String.getBytes does.
     *
     * @param input string to be encoded with UTF-8. Must not be null.
     *
     * @return input encoded as UTF-8 bytes, trimmed to the exact length. No byte order mark is added.
     */
    private static byte[] encode( String input )
        {
        final int length = input.length();
        // Worst case, every char requires a 3-byte encoding.
        // A surrogate pair consumes two chars but emits only 4 bytes,
        // and a lone surrogate emits 1 byte, so neither can exceed this bound.
        final byte[] output = new byte[ length * 3 ];
        // index output[]
        int j = 0;
        for ( int i = 0; i < length; i++ )
            {
            final char c = input.charAt( i );
            if ( c < 0x80 )
                {
                // 7-bits done in one byte.
                output[ j++ ] = ( byte ) c;
                }
            else if ( c < 0x800 )
                {
                // 8-11 bits done in 2 bytes
                output[ j++ ] = ( byte ) ( 0xC0 | c >> 6 );
                output[ j++ ] = ( byte ) ( 0x80 | c & 0x3F );
                }
            else if ( !Character.isSurrogate( c ) )
                {
                // 12-16 bits done in 3 bytes
                output[ j++ ] = ( byte ) ( 0xE0 | c >> 12 );
                output[ j++ ] = ( byte ) ( 0x80 | c >> 6 & 0x3F );
                output[ j++ ] = ( byte ) ( 0x80 | c & 0x3F );
                }
            else if ( Character.isHighSurrogate( c )
                      && i + 1 < length
                      && Character.isLowSurrogate( input.charAt( i + 1 ) ) )
                {
                // well-formed surrogate pair: rebuild the 17-21 bit code point, done in 4 bytes.
                // ++i consumes the low surrogate so the loop does not see it again.
                final int codePoint = Character.toCodePoint( c, input.charAt( ++i ) );
                output[ j++ ] = ( byte ) ( 0xF0 | codePoint >> 18 );
                output[ j++ ] = ( byte ) ( 0x80 | codePoint >> 12 & 0x3F );
                output[ j++ ] = ( byte ) ( 0x80 | codePoint >> 6 & 0x3F );
                output[ j++ ] = ( byte ) ( 0x80 | codePoint & 0x3F );
                }
            else
                {
                // lone high or low surrogate: not encodable, substitute '?' as String.getBytes does.
                output[ j++ ] = ( byte ) '?';
                }
            } // end for
        // Prune back our byte array.  For efficiency we could hand item back
        // partly filled, which is only a minor inconvenience to the caller
        // most of the time to save copying the array.
        final byte[] chopped = new byte[ j ];
        System.arraycopy( output, 0, chopped, 0, j/* length */ );
        return chopped;
        } //end encode

    /**
     * TEST harness to ensure UTF8Encoder works as advertised.
     * Does nothing unless DEBUGGING is true. When enabled, it encodes a test string both with
     * String.getBytes and with encode, reports any differing bytes, and prints an overall verdict.
     *
     * @param args not used
     *
     * @throws java.io.UnsupportedEncodingException if no support for UTF-8, not likely.
     */
    public static void main( String[] args ) throws UnsupportedEncodingException
        {
        if ( DEBUGGING )
            {
            String test =
                    BOM
                    + "Hello World"
                    + "\u0080\u007f\u0080\u0100\u0921\u30b0\u4e70\uffff"
                    // a supplementary code point (U+1F600) to exercise the 4-byte surrogate pair path
                    + "\ud83d\ude00";
            char[] oneOfAlmostEverything = new char[ 0xffff + 1 ];
            for ( int i = 0; i <= 0xffff; i++ )
                {
                oneOfAlmostEverything[ i ] = ( char ) i;
                }
            // avoid testing unpaired surrogates, both the high band (0xd800..0xdbff)
            // and the low band (0xdc00..0xdfff).
            for ( int i = 0xd800; i <= 0xdfff; i++ )
                {
                oneOfAlmostEverything[ i ] = 0;
                }
            // put one of almost every possible 16-bit Unicode character in our TEST too.
            test += new String( oneOfAlmostEverything );
            // convert to UTF-8 with built-in Java classes.
            byte[] encodedByJava = test.getBytes( "UTF-8" );
            // convert to UTF-8 with UTF8Encoder.
            byte[] encodedByUs = UTF8Encoder.encode( test );
            boolean allOk = true;
            if ( encodedByUs.length != encodedByJava.length )
                {
                out.println( "oops, different lengths" );
                allOk = false;
                }
            // compare only the part both arrays have, so a length mismatch cannot run off the end.
            int safe = Math.min( encodedByJava.length, encodedByUs.length );
            for ( int i = 0; i < safe; i++ )
                {
                if ( encodedByUs[ i ] != encodedByJava[ i ] )
                    {
                    // mask with 0xff so negative bytes display as two hex digits, not sign-extended ffffffxx.
                    out.println( "oops "
                                 + encodedByJava[ i ]
                                 + "["
                                 + Integer.toHexString( encodedByJava[ i ] & 0xff )
                                 + "] "
                                 + encodedByUs[ i ]
                                 + "["
                                 + Integer.toHexString( encodedByUs[ i ] & 0xff )
                                 + "]" );
                    allOk = false;
                    } // end if
                } // end for
            out.println( "UTF8Encoder " + ( allOk ? "worked" : "failed" ) );
            }
        } // end main
    } // end UTF8Encoder