Snippet : HashBakeOff.java

/*
 * [HashBakeOff.java]
 *
 * Summary: Test hash functions for speed and ability to avoid collisions.
 *
 * Copyright: (c) 2014-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2014-04-26 initial version
 */
package com.mindprod.example;

import com.mindprod.filter.OnlyDirectoriesFilter;
import com.mindprod.filter.OnlyFilesFilter;
import sun.misc.CRC16;

import java.io.File;
import java.io.FilenameFilter;
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.zip.Adler32;
import java.util.zip.CRC32;

import static java.lang.System.*;
/*  prints:
Time to compute 25,000,000 iterations of 32-bit String.hashCode was 0.393 seconds.
There were 0 collisions in 50,000 filenames
------
Time to compute 25,000,000 iterations of 32-bit FNV1a32 was 5.044 seconds.
There were 1 collisions in 50,000 filenames
------
Time to compute 25,000,000 iterations of 64-bit FNV1a64 was 4.217 seconds.
There were 0 collisions in 50,000 filenames
------
Time to compute 25,000,000 iterations of 32-bit SunCRC32 was 6.133 seconds.
There were 1 collisions in 50,000 filenames
------
Time to compute 25,000,000 iterations of 32-bit Adler was 6.327 seconds.
There were 946 collisions in 50,000 filenames
------
Time to compute 25,000,000 iterations of 128-bit MD5 was 12.967 seconds.
There were 0 collisions in 50,000 filenames
------
Time to compute 25,000,000 iterations of 160-bit SHA-1 was 23.390 seconds.
There were 0 collisions in 50,000 filenames
------
Time to compute 25,000,000 iterations of 160-bit SHA was 22.690 seconds.
There were 0 collisions in 50,000 filenames
------
 */

/**
 * interface that an algorithm must provide to be benchmarked
 */
interface HashAlgorithm
    {
    /**
     * identifying info
     */
    String algorithmName();

    /**
     * how many bits in the raw computed hash
     */
    int bits();

    /**
     * It need not include code to convert to string.
     * Fast version, used for timing
     */
    void computeHash( String s );

    /**
     * compute hash, must be converted to a String for collision checking.
     * Slow version, used for collision checking.
     */
    String computeHashString( String s );
    }
/***
 * [ public | protected | private ]
 * static
 * abstract
 * synchronized
 * [ transient | volatile ]
 * final
 * native
 */

/**
 * Test hash functions for speed and ability to avoid collisions.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.0 2014-04-26 initial version
 * @since 2014-04-26
 */
public final class HashBakeOff
    {
    /**
     * true if want extra progress messages
     */
    private static final boolean DEBUGGING = false;

    /**
     * drive to look for filenames (will be unique)
     */
    private static final char DRIVE_TO_LOOK_FOR_FILENAMES = 'E';

    /**
     * how many filenames to computer hashes on
     */
    private static final int COUNT_OF_FILES_TO_COLLECT = 50_00050_000;

    /**
     * our collection of real world filenames to use in testing hashes
     */
    private static final ArrayList<String> COLLECTION = new ArrayList<>( COUNT_OF_FILES_TO_COLLECT );

    /**
     * how many times to compute the hash of each file..
     */
    private static final int HEATS = 500;

    /**
     * how many hashes computer per algorithm.
     */
    private static final int TRIALS = HEATS * COUNT_OF_FILES_TO_COLLECT;

    /**
     * format for times is seconds
     */
    private static final DecimalFormat ELAPSED_TIME_FORMAT = new DecimalFormat( "###,##0.00" );

    /**
     * format for integer counts
     */
    private static final DecimalFormat INT_FORMAT = new DecimalFormat( "###,##0" );

    /**
     * filter to get just dirs
     */
    private static final FilenameFilter SAFE_DIR_FILTER = new OnlyDirectoriesFilter();

    /**
     * filter to get just files
     */
    private static final FilenameFilter SAFE_FILE_FILTER = new OnlyFilesFilter();

    /**
     * how many filenames we have collected so far.
     */
    private static int countOfFilesCollected = 0;

    /**
     * which algorith me are computing
     */
    private static HashAlgorithm delegate = null;

    /**
     * collect some real world filename to test
     */
    private static void collectFilenames()
        {
        out.println( "Collecting "
                     + INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT )
                     + " filenames from drive "
                     + DRIVE_TO_LOOK_FOR_FILENAMES
                     + "..." );
        if ( !collectFilesFromDirTree( new File( DRIVE_TO_LOOK_FOR_FILENAMES + ":/" ) ) )
            {
            err.println( "failed to collect sufficient filenames from drive "
                         + DRIVE_TO_LOOK_FOR_FILENAMES
                         + ":. wanted: "
                         + INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT )
                         + " collected: "
                         + INT_FORMAT.format( countOfFilesCollected )
            );
            System.exit( 1 );
            }
        assert countOfFilesCollected == COUNT_OF_FILES_TO_COLLECT :
                "bug: wrong number of filenames collected";
        } // /method

    /**
     * recursively scan directory tree collecting filenames.
     *
     * @param theDir dir to scan
     *
     * @return true  if collected enough files
     * @noinspection SameParameterValue, WeakerAccess
     */
    private static boolean collectFilesFromDirTree( File theDir )
        {
        if ( theDir == null )
            {
            return true;
            }
        // clean out child dirs
        // all immediate child subdirs of this dir, excluding files in this immediate dir
        String[] allDirs = theDir.list( SAFE_DIR_FILTER );
        if ( allDirs != null )
            {
            for ( String subDir : allDirs )
                {
                if ( collectFilesFromDirTree( new File( theDir, subDir ) ) )
                    {
                    return true;
                    }
                }
            }
        // now collect files in this immediate dir
        final String[] allFilesInDirToCollect = theDir.list( SAFE_FILE_FILTER );
        if ( allFilesInDirToCollect != null )
            {
            for ( String aFile : allFilesInDirToCollect )
                {
                if ( collectOneFilename( new File( theDir, aFile ) ) )
                    {
                    return true;
                    }
                }
            }
        return false;
        } // /method

    /**
     * returns true when we gave collected enough files
     *
     * @param c filanem to collect
     *
     * @return true if we are done
     */
    private static boolean collectOneFilename( File c )
        {
        if ( ++countOfFilesCollected >= COUNT_OF_FILES_TO_COLLECT )
            {
            return true;
            }
        COLLECTION.add( c.getAbsolutePath() );
        return false;
        } // /method

    /**
     * compute and time the hashcodes for this algorithm.
     */
    private static void computeHashes()
        {
        if ( DEBUGGING )
            {
            out.println( "Computing "
                         + INT_FORMAT.format( TRIALS )
                         + " hashes..." );
            }
        long start = System.nanoTime();
        for ( int heat = 0; heat < HEATS; heat++ )
            {
            for ( String filename : COLLECTION )
                {
                delegate.computeHash( filename );
                }
            }
        long stop = System.nanoTime();
        // nanoseconds are billionths, not millionths of seconds.
        // TimeUnit.NANSECONDS.toSeconds would not give us any decimal places.
        final double elapsedSeconds = ( double ) ( stop - start ) / 1_000_000_0001_000_000_000d;
        out.println( "Time to compute "
                     + INT_FORMAT.format( TRIALS )
                     + " iterations of "
                     + delegate.bits()
                     + "-bit "
                     + delegate.algorithmName()
                     + " was "
                     + ELAPSED_TIME_FORMAT.format( elapsedSeconds )
                     + " seconds." );
        } // /method

    /**
     * count numbmer of collision swith this algothithm.
     */
    private static void countCollisions()
        {
        if ( DEBUGGING )
            {
            out.println( "Checking "
                         + INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT )
                         + " filenames for collisions..." );
            }
        // be generous. This is a one-shot program.
        HashSet<String> collider = new HashSet<>( COUNT_OF_FILES_TO_COLLECT * 2 );
        int collisionCount = 0;
        for ( String filename : COLLECTION )
            {
            String hash = delegate.computeHashString( filename );
            if ( !collider.add( hash ) )
                {
                collisionCount++;
                }
            }
        out.println( "There were "
                     + INT_FORMAT.format( collisionCount )
                     + " collisions in "
                     + INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT )
                     + " filenames" );
        } // /method

    /**
     * Collect a set of 20,000 real world filenames.
     * Compute hash of each one of the filenames, timing it.
     * In a second pass, record results in a bit map/HashMap looking for collisions.
     *
     * @param args not used
     */
    public static void main( String[] args )
        {
        // we collect a set of real filenames.
        collectFilenames();
        HashAlgorithm[] algorithms = {
                /* best first, fastest, fewest collisions */
                new SunHashCode(),
                new FNV1a32()  /* has a collision */,
                new FNV1a64(),
                new CmpCRC16(), /* has many collisions */
                new SunCRC32() /* has a collision */,
                new AdlerHash() /* has 946 collisions */,
                new MessageDigestHash( "MD5" ),
                new MessageDigestHash( "SHA-1" ),
                new MessageDigestHash( "SHA" ),
                new MessageDigestHash( "SHA-256" ),
                new MessageDigestHash( "SHA-384" ),
                new MessageDigestHash( "SHA-512" ),
                new SunCRC16() /* has collisions */,
                new MessageDigestHash( "MD2" ),
        };
        out.println( "The best have the lowest elpased times and the fewest collisions." );
        for ( HashAlgorithm algorithm : algorithms )
            {
            delegate = algorithm; // make globally known
            if ( DEBUGGING )
                {
                out.println( "\nBenchmarking " + delegate.bits() + "-bit " + delegate.algorithmName() + "..." );
                }
            computeHashes();
            countCollisions();
            out.println( "------" );
            }
        } // /method
    } // end class

class SunHashCode implements HashAlgorithm
    {
    /**
     * identifying info
     */
    public String algorithmName()
        {
        return "String.hashCode";
        } // /method

    /**
     * how many bits in the raw computed hash
     */
    public int bits()
        {
        return 32;
        } // /method

    /**
     * it need not include code to convert to string.  USed for timing
     */
    public void computeHash( String s )
        {
        // sun has the big advantage of working directly with a string.
        // it does not the the conversion to bytes first.
        @SuppressWarnings( "UnusedAssignment" ) int dummy = s.hashCode();
        } // /method

    /**
     * compute hash, must be converted to a String for collision checking
     */
    public String computeHashString( String s )
        {
        return Integer.toString( s.hashCode() );
        } // /method
    } // end class

class AdlerHash implements HashAlgorithm
    {
    /**
     * encoding for UTF-8
     */
    private static final Charset UTF8 = Charset.forName( "UTF-8" );

    /**
     * don't allocate an new Adler32 for every calculation.
     */
    private final Adler32 digester = new Adler32();

    /**
     * identifying info
     */
    public String algorithmName()
        {
        return "Adler";
        } // /method

    /**
     * how many bits in the raw computed hash
     */
    public int bits()
        {
        return 32;
        } // /method

    /**
     * it need not include code to convert to string.  USed for timing
     */
    public void computeHash( String s )
        {
        digester.reset();
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        digester.update( theTextToDigestAsBytes );
        digester.getValue();
        } // /method

    /**
     * compute hash, must be converted to a String for collision checking
     */
    public String computeHashString( String s )
        {
        digester.reset();
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        digester.update( theTextToDigestAsBytes );
        return Long.toString( digester.getValue() );
        } // /method
    }

class SunCRC32 implements HashAlgorithm
    {
    /**
     * encoding for UTF-8
     */
    private static final Charset UTF8 = Charset.forName( "UTF-8" );

    /**
     * don't allocate an new Adler32 for every calculation.
     */
    private final CRC32 digester = new CRC32();

    /**
     * identifying info
     */
    public String algorithmName()
        {
        return "SunCRC32";
        } // /method

    /**
     * how many bits in the raw computed hash
     */
    public int bits()
        {
        return 32;
        } // /method

    /**
     * it need not include code to convert to string.  USed for timing
     */
    public void computeHash( String s )
        {
        digester.reset();
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        digester.update( theTextToDigestAsBytes );
        digester.getValue();
        } // /method

    /**
     * compute hash, must be converted to a String for collision checking
     */
    public String computeHashString( String s )
        {
        digester.reset();
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        digester.update( theTextToDigestAsBytes );
        return Long.toString( digester.getValue() );
        } // /method
    }

class SunCRC16 implements HashAlgorithm
    {
    /**
     * encoding for UTF-8
     */
    private static final Charset UTF8 = Charset.forName( "UTF-8" );

    /**
     * don't allocate an new Adler32 for every calculation.
     */
    private final CRC16 digester = new CRC16();

    /**
     * identifying info
     */
    public String algorithmName()
        {
        return "SunCRC16";
        } // /method

    /**
     * how many bits in the raw computed hash
     */
    public int bits()
        {
        return 16;
        } // /method

    /**
     * it need not include code to convert to string.  USed for timing
     */
    public void computeHash( String s )
        {
        digester.reset();
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        for ( byte b : theTextToDigestAsBytes )
            {
            digester.update( b );
            }
        @SuppressWarnings( "UnusedAssignment" ) final int dummy = digester.value;
        } // /method

    /**
     * compute hash, must be converted to a String for collision checking
     */
    public String computeHashString( String s )
        {
        // Note. There is no update byte[] method.
        // We get result with value, not getValue.
        digester.reset();
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        for ( byte b : theTextToDigestAsBytes )
            {
            digester.update( b );
            }
        return Integer.toString( digester.value );  // actually a short
        } // /method
    }

class FNV1a32 implements HashAlgorithm
    {
    /**
     * prime specified to use in 32 bit hashes
     */
    private static final int MAGIC_PRIME = 16_777_61916_777_619; /* magic 32 bit FNV_prime = 224 + 28 + 0x93 = 16,777,619 */

    /**
     * encoding for UTF-8
     */
    private static final Charset UTF8 = Charset.forName( "UTF-8" );

    /**
     * magic 32 bit initial value for the hash = 2,166,136,261
     */
    private static final int MAGIC_OFFSET = ( int ) ( 2_166_136_2612_166_136_261L & 0xffffffffL ); /* fake unsigned 32 bit */

    /**
     * Calc 34 bit FNV1a digest of an array of bytes.
     *
     * @param theTextToDigestAsBytes byte array to compute checksum on
     *
     * @return 64-bit signed fnv1a checksum
     */
    private static long fnv1a64( byte[] theTextToDigestAsBytes )
        {
        // you would think a 32-bit hash would work an int at a time, not a byte at a time.
        long hash = MAGIC_OFFSET;
        for ( byte b : theTextToDigestAsBytes )
            {
            hash ^= b & 0xff;  // account for strange signedness of byte
            hash *= MAGIC_PRIME;
            }
        return hash;
        } // /method

    /**
     * identifying info
     */
    public String algorithmName()
        {
        return "FNV1a32";
        } // /method

    /**
     * how many bits in the raw computed hash
     */
    public int bits()
        {
        return 64;
        } // /method

    /**
     * it need not include code to convert to string.  USed for timing
     */
    public void computeHash( String s )
        {
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        fnv1a64( theTextToDigestAsBytes );
        } // /method

    /**
     * compute hash, must be converted to a String for collision checking
     */
    public String computeHashString( String s )
        {
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        return Long.toString( fnv1a64( theTextToDigestAsBytes ) );
        } // /method
    }

class FNV1a64 implements HashAlgorithm
    {
    /**
     * encoding for UTF-8
     */
    private static final Charset UTF8 = Charset.forName( "UTF-8" );

    /**
     * magic 64 bit initial value for the hash = 14,695,981,039,346,656,037, needs to be unsigned to fit in long.
     */
    @SuppressWarnings( "NumericOverflow" )
    private static final long MAGIC_OFFSET = 14_695_981_039_346_656_0314_695_981_039_346_656_03L * 10 + 7; /* 14_695_981_039_346_656_037L faked unsigned */

    /**
     * prime specified to use in 64 bit hashes
     * magic 64 bit FNV_prime = 240 + 28 + 0xb3 = 1,099,511,628,211
     */
    private static final long MAGIC_PRIME = 1_099_511_628_2111_099_511_628_211L;

    /**
     * Calc 64 bit FNV1a digest of an array of bytes.
     *
     * @param theTextToDigestAsBytes byte array to compute checksum on
     *
     * @return 64-bit signed fnv1a checksum
     */
    private static long fnv1a64( byte[] theTextToDigestAsBytes )
        {
        // you would think a 64 bit has work a long at a time, not a byte at a time.
        long hash = MAGIC_OFFSET;
        for ( byte b : theTextToDigestAsBytes )
            {
            hash ^= b & 0xff;  // account for strange signedness of byte
            hash *= MAGIC_PRIME;
            }
        return hash;
        } // /method

    /**
     * identifying info
     */
    public String algorithmName()
        {
        return "FNV1a64";
        } // /method

    /**
     * how many bits in the raw computed hash
     */
    public int bits()
        {
        return 64;
        } // /method

    /**
     * it need not include code to convert to string.  USed for timing
     */
    public void computeHash( String s )
        {
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        fnv1a64( theTextToDigestAsBytes );
        } // /method

    /**
     * compute hash, must be converted to a String for collision checking
     */
    public String computeHashString( String s )
        {
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        return Long.toString( fnv1a64( theTextToDigestAsBytes ) );
        } // /method
    }

class MessageDigestHash implements HashAlgorithm
    {
    /**
     * encoding for UTF-8
     */
    private static final Charset UTF8 = Charset.forName( "UTF-8" );

    private final String algorithmName;

    private final MessageDigest digester;

    /**
     * constructor
     *
     * @param algorithmName name of algorithm to use  MD2, MD5, SHA-1, SHA-256, SHA-384, SHA-512
     */
    MessageDigestHash( String algorithmName )
        {
        this.algorithmName = algorithmName;
        // dodge to init final avoidng a missing/double assignment.
        MessageDigest init = null;
        try
            {
            init = MessageDigest.getInstance( algorithmName );
            }
        catch ( NoSuchAlgorithmException e )
            {
            err.println( "Missing " + algorithmName + "support " );
            System.exit( 2 );
            }
        digester = init;
        } // end constructor

    /**
     * identifying info
     */
    public String algorithmName()
        {
        return algorithmName;
        } // /method

    /**
     * how many bits in the raw computed hash
     */
    public int bits()
        {
        return digester.getDigestLength() * 8;
        } // /method

    /**
     * it need not include code to convert to string.  USed for timing
     */
    public void computeHash( String s )
        {
        digester.reset();
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        digester.update( theTextToDigestAsBytes );
        @SuppressWarnings( "UnusedAssignment" ) byte[] dummy = digester.digest();
        // ignore 16-byte result
        } // /method

    /**
     * compute hash, must be converted to a String for collision checking
     */
    public String computeHashString( String s )
        {
        digester.reset();
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        digester.update( theTextToDigestAsBytes );
        // 16-byte result, for MD5, 20 bytes for SHA
        byte[] digest = digester.digest();
        return new String( digest, UTF8 ); //  bytes are gibberish, not UTF-8
        } // /method
    } // end class

class CmpCRC16 implements HashAlgorithm
    {
    /**
     * generator polynomial
     */
    private static final int POLY = 0x1021;/* x16 + x12 + x5 + 1 generator polynomial */
    /* 0x8408 used in European X.25 */

    /**
     * encoding for UTF-8
     */
    private static final Charset UTF8 = Charset.forName( "UTF-8" );

    /**
     * scrambler lookup table for fast computation.
     */
    private static final int[] CRC_TABLE = new int[ 256 ];

    static
        {
        // initialise scrambler table
        for ( int i = 0; i < 256; i++ )
            {
            int fcs = 0;
            int d = i << 8;
            for ( int k = 0; k < 8; k++ )
                {
                if ( ( ( fcs ^ d ) & 0x8000 ) != 0 )
                    {
                    fcs = ( fcs << 1 ) ^ POLY;
                    }
                else
                    {
                    fcs = ( fcs << 1 );
                    }
                d <<= 1;
                fcs &= 0xffff;
                }
            CRC_TABLE[ i ] = fcs;
            }
        }

    /**
     * Calc 16-bit CRC with CCITT method.
     *
     * @param ba byte array to compute CRC on
     *
     * @return 16-bit CRC, unsigned
     */
    private static int crc16( byte[] ba )
        {
        // loop, calculating CRC for each byte of the string
        int work = 0xffff;
        for ( byte b : ba )
            {
            // xor the next data byte with the high byte of what we have so far to
            // look up the scrambler.
            // xor that with the low byte of what we have so far.
            // Mask back to 16 bits.
            work = ( CRC_TABLE[ ( b ^ ( work >>> 8 ) ) & 0xff ] ^ ( work << 8 ) ) &
                   0xffff;
            }
        return work;
        } // /method

    /**
     * identifying info
     */
    public String algorithmName()
        {
        return "CMP CRC16";
        } // /method

    /**
     * how many bits in the raw computed hash
     */
    public int bits()
        {
        return 16;
        } // /method

    /**
     * it need not include code to convert to string.  Used for timing
     */
    public void computeHash( String s )
        {
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        int dummy = crc16( theTextToDigestAsBytes );
        // ignore 16-byte result
        } // /method

    /**
     * compute hash, must be converted to a String for collision checking
     */
    public String computeHashString( String s )
        {
        final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
        return Integer.toString( crc16( theTextToDigestAsBytes ) );
        } // /method
    }