package com.mindprod.example;
import com.mindprod.filter.OnlyDirectoriesFilter;
import com.mindprod.filter.OnlyFilesFilter;
import sun.misc.CRC16;
import java.io.File;
import java.io.FilenameFilter;
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.zip.Adler32;
import java.util.zip.CRC32;
import static java.lang.System.*;
/**
* interface that an algorithm must provide to be benchmarked
*/
interface HashAlgorithm
{
/**
* identifying info
*/
String algorithmName();
/**
* how many bits in the raw computed hash
*/
int bits();
/**
* It need not include code to convert to string.
* Fast version, used for timing
*/
void computeHash( String s );
/**
* compute hash, must be converted to a String for collision checking.
* Slow version, used for collision checking.
*/
String computeHashString( String s );
}
/***
* [ public | protected | private ]
* static
* abstract
* synchronized
* [ transient | volatile ]
* final
* native
*/
/**
* Test hash functions for speed and ability to avoid collisions.
*
* @author Roedy Green, Canadian Mind Products
* @version 1.0 2014-04-26 initial version
* @since 2014-04-26
*/
public final class HashBakeOff
{
/**
* true if want extra progress messages
*/
private static final boolean DEBUGGING = false;
/**
* drive to look for filenames (will be unique)
*/
private static final char DRIVE_TO_LOOK_FOR_FILENAMES = 'E';
/**
* how many filenames to computer hashes on
*/
private static final int COUNT_OF_FILES_TO_COLLECT = 50_00050_000;
/**
* our collection of real world filenames to use in testing hashes
*/
private static final ArrayList<String> COLLECTION = new ArrayList<>( COUNT_OF_FILES_TO_COLLECT );
/**
* how many times to compute the hash of each file..
*/
private static final int HEATS = 500;
/**
* how many hashes computer per algorithm.
*/
private static final int TRIALS = HEATS * COUNT_OF_FILES_TO_COLLECT;
/**
* format for times is seconds
*/
private static final DecimalFormat ELAPSED_TIME_FORMAT = new DecimalFormat( "###,##0.00" );
/**
* format for integer counts
*/
private static final DecimalFormat INT_FORMAT = new DecimalFormat( "###,##0" );
/**
* filter to get just dirs
*/
private static final FilenameFilter SAFE_DIR_FILTER = new OnlyDirectoriesFilter();
/**
* filter to get just files
*/
private static final FilenameFilter SAFE_FILE_FILTER = new OnlyFilesFilter();
/**
* how many filenames we have collected so far.
*/
private static int countOfFilesCollected = 0;
/**
* which algorith me are computing
*/
private static HashAlgorithm delegate = null;
/**
* collect some real world filename to test
*/
private static void collectFilenames()
{
out.println( "Collecting "
+ INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT )
+ " filenames from drive "
+ DRIVE_TO_LOOK_FOR_FILENAMES
+ "..." );
if ( !collectFilesFromDirTree( new File( DRIVE_TO_LOOK_FOR_FILENAMES + ":/" ) ) )
{
err.println( "failed to collect sufficient filenames from drive "
+ DRIVE_TO_LOOK_FOR_FILENAMES
+ ":. wanted: "
+ INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT )
+ " collected: "
+ INT_FORMAT.format( countOfFilesCollected )
);
System.exit( 1 );
}
assert countOfFilesCollected == COUNT_OF_FILES_TO_COLLECT :
"bug: wrong number of filenames collected";
}
/**
* recursively scan directory tree collecting filenames.
*
* @param theDir dir to scan
*
* @return true if collected enough files
* @noinspection SameParameterValue, WeakerAccess
*/
private static boolean collectFilesFromDirTree( File theDir )
{
if ( theDir == null )
{
return true;
}
String[] allDirs = theDir.list( SAFE_DIR_FILTER );
if ( allDirs != null )
{
for ( String subDir : allDirs )
{
if ( collectFilesFromDirTree( new File( theDir, subDir ) ) )
{
return true;
}
}
}
final String[] allFilesInDirToCollect = theDir.list( SAFE_FILE_FILTER );
if ( allFilesInDirToCollect != null )
{
for ( String aFile : allFilesInDirToCollect )
{
if ( collectOneFilename( new File( theDir, aFile ) ) )
{
return true;
}
}
}
return false;
}
/**
* returns true when we gave collected enough files
*
* @param c filanem to collect
*
* @return true if we are done
*/
private static boolean collectOneFilename( File c )
{
if ( ++countOfFilesCollected >= COUNT_OF_FILES_TO_COLLECT )
{
return true;
}
COLLECTION.add( c.getAbsolutePath() );
return false;
}
/**
* compute and time the hashcodes for this algorithm.
*/
private static void computeHashes()
{
if ( DEBUGGING )
{
out.println( "Computing "
+ INT_FORMAT.format( TRIALS )
+ " hashes..." );
}
long start = System.nanoTime();
for ( int heat = 0; heat < HEATS; heat++ )
{
for ( String filename : COLLECTION )
{
delegate.computeHash( filename );
}
}
long stop = System.nanoTime();
final double elapsedSeconds = ( double ) ( stop - start ) / 1_000_000_0001_000_000_000d;
out.println( "Time to compute "
+ INT_FORMAT.format( TRIALS )
+ " iterations of "
+ delegate.bits()
+ "-bit "
+ delegate.algorithmName()
+ " was "
+ ELAPSED_TIME_FORMAT.format( elapsedSeconds )
+ " seconds." );
}
/**
* count numbmer of collision swith this algothithm.
*/
private static void countCollisions()
{
if ( DEBUGGING )
{
out.println( "Checking "
+ INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT )
+ " filenames for collisions..." );
}
HashSet<String> collider = new HashSet<>( COUNT_OF_FILES_TO_COLLECT * 2 );
int collisionCount = 0;
for ( String filename : COLLECTION )
{
String hash = delegate.computeHashString( filename );
if ( !collider.add( hash ) )
{
collisionCount++;
}
}
out.println( "There were "
+ INT_FORMAT.format( collisionCount )
+ " collisions in "
+ INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT )
+ " filenames" );
}
/**
* Collect a set of 20,000 real world filenames.
* Compute hash of each one of the filenames, timing it.
* In a second pass, record results in a bit map/HashMap looking for collisions.
*
* @param args not used
*/
public static void main( String[] args )
{
collectFilenames();
HashAlgorithm[] algorithms = {
new SunHashCode(),
new FNV1a32() ,
new FNV1a64(),
new CmpCRC16(),
new SunCRC32() ,
new AdlerHash() ,
new MessageDigestHash( "MD5" ),
new MessageDigestHash( "SHA-1" ),
new MessageDigestHash( "SHA" ),
new MessageDigestHash( "SHA-256" ),
new MessageDigestHash( "SHA-384" ),
new MessageDigestHash( "SHA-512" ),
new SunCRC16() ,
new MessageDigestHash( "MD2" ),
};
out.println( "The best have the lowest elpased times and the fewest collisions." );
for ( HashAlgorithm algorithm : algorithms )
{
delegate = algorithm;
if ( DEBUGGING )
{
out.println( "\nBenchmarking " + delegate.bits() + "-bit " + delegate.algorithmName() + "..." );
}
computeHashes();
countCollisions();
out.println( "------" );
}
}
}
class SunHashCode implements HashAlgorithm
{
/**
* identifying info
*/
public String algorithmName()
{
return "String.hashCode";
}
/**
* how many bits in the raw computed hash
*/
public int bits()
{
return 32;
}
/**
* it need not include code to convert to string. USed for timing
*/
public void computeHash( String s )
{
@SuppressWarnings( "UnusedAssignment" ) int dummy = s.hashCode();
}
/**
* compute hash, must be converted to a String for collision checking
*/
public String computeHashString( String s )
{
return Integer.toString( s.hashCode() );
}
}
class AdlerHash implements HashAlgorithm
{
/**
* encoding for UTF-8
*/
private static final Charset UTF8 = Charset.forName( "UTF-8" );
/**
* don't allocate an new Adler32 for every calculation.
*/
private final Adler32 digester = new Adler32();
/**
* identifying info
*/
public String algorithmName()
{
return "Adler";
}
/**
* how many bits in the raw computed hash
*/
public int bits()
{
return 32;
}
/**
* it need not include code to convert to string. USed for timing
*/
public void computeHash( String s )
{
digester.reset();
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
digester.update( theTextToDigestAsBytes );
digester.getValue();
}
/**
* compute hash, must be converted to a String for collision checking
*/
public String computeHashString( String s )
{
digester.reset();
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
digester.update( theTextToDigestAsBytes );
return Long.toString( digester.getValue() );
}
}
class SunCRC32 implements HashAlgorithm
{
/**
* encoding for UTF-8
*/
private static final Charset UTF8 = Charset.forName( "UTF-8" );
/**
* don't allocate an new Adler32 for every calculation.
*/
private final CRC32 digester = new CRC32();
/**
* identifying info
*/
public String algorithmName()
{
return "SunCRC32";
}
/**
* how many bits in the raw computed hash
*/
public int bits()
{
return 32;
}
/**
* it need not include code to convert to string. USed for timing
*/
public void computeHash( String s )
{
digester.reset();
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
digester.update( theTextToDigestAsBytes );
digester.getValue();
}
/**
* compute hash, must be converted to a String for collision checking
*/
public String computeHashString( String s )
{
digester.reset();
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
digester.update( theTextToDigestAsBytes );
return Long.toString( digester.getValue() );
}
}
class SunCRC16 implements HashAlgorithm
{
/**
* encoding for UTF-8
*/
private static final Charset UTF8 = Charset.forName( "UTF-8" );
/**
* don't allocate an new Adler32 for every calculation.
*/
private final CRC16 digester = new CRC16();
/**
* identifying info
*/
public String algorithmName()
{
return "SunCRC16";
}
/**
* how many bits in the raw computed hash
*/
public int bits()
{
return 16;
}
/**
* it need not include code to convert to string. USed for timing
*/
public void computeHash( String s )
{
digester.reset();
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
for ( byte b : theTextToDigestAsBytes )
{
digester.update( b );
}
@SuppressWarnings( "UnusedAssignment" ) final int dummy = digester.value;
}
/**
* compute hash, must be converted to a String for collision checking
*/
public String computeHashString( String s )
{
digester.reset();
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
for ( byte b : theTextToDigestAsBytes )
{
digester.update( b );
}
return Integer.toString( digester.value );
}
}
class FNV1a32 implements HashAlgorithm
{
/**
* prime specified to use in 32 bit hashes
*/
private static final int MAGIC_PRIME = 16_777_61916_777_619;
/**
* encoding for UTF-8
*/
private static final Charset UTF8 = Charset.forName( "UTF-8" );
/**
* magic 32 bit initial value for the hash = 2,166,136,261
*/
private static final int MAGIC_OFFSET = ( int ) ( 2_166_136_2612_166_136_261L & 0xffffffffL );
/**
* Calc 34 bit FNV1a digest of an array of bytes.
*
* @param theTextToDigestAsBytes byte array to compute checksum on
*
* @return 64-bit signed fnv1a checksum
*/
private static long fnv1a64( byte[] theTextToDigestAsBytes )
{
long hash = MAGIC_OFFSET;
for ( byte b : theTextToDigestAsBytes )
{
hash ^= b & 0xff;
hash *= MAGIC_PRIME;
}
return hash;
}
/**
* identifying info
*/
public String algorithmName()
{
return "FNV1a32";
}
/**
* how many bits in the raw computed hash
*/
public int bits()
{
return 64;
}
/**
* it need not include code to convert to string. USed for timing
*/
public void computeHash( String s )
{
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
fnv1a64( theTextToDigestAsBytes );
}
/**
* compute hash, must be converted to a String for collision checking
*/
public String computeHashString( String s )
{
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
return Long.toString( fnv1a64( theTextToDigestAsBytes ) );
}
}
class FNV1a64 implements HashAlgorithm
{
/**
* encoding for UTF-8
*/
private static final Charset UTF8 = Charset.forName( "UTF-8" );
/**
* magic 64 bit initial value for the hash = 14,695,981,039,346,656,037, needs to be unsigned to fit in long.
*/
@SuppressWarnings( "NumericOverflow" )
private static final long MAGIC_OFFSET = 14_695_981_039_346_656_0314_695_981_039_346_656_03L * 10 + 7;
/**
* prime specified to use in 64 bit hashes
* magic 64 bit FNV_prime = 240 + 28 + 0xb3 = 1,099,511,628,211
*/
private static final long MAGIC_PRIME = 1_099_511_628_2111_099_511_628_211L;
/**
* Calc 64 bit FNV1a digest of an array of bytes.
*
* @param theTextToDigestAsBytes byte array to compute checksum on
*
* @return 64-bit signed fnv1a checksum
*/
private static long fnv1a64( byte[] theTextToDigestAsBytes )
{
long hash = MAGIC_OFFSET;
for ( byte b : theTextToDigestAsBytes )
{
hash ^= b & 0xff;
hash *= MAGIC_PRIME;
}
return hash;
}
/**
* identifying info
*/
public String algorithmName()
{
return "FNV1a64";
}
/**
* how many bits in the raw computed hash
*/
public int bits()
{
return 64;
}
/**
* it need not include code to convert to string. USed for timing
*/
public void computeHash( String s )
{
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
fnv1a64( theTextToDigestAsBytes );
}
/**
* compute hash, must be converted to a String for collision checking
*/
public String computeHashString( String s )
{
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
return Long.toString( fnv1a64( theTextToDigestAsBytes ) );
}
}
class MessageDigestHash implements HashAlgorithm
{
/**
* encoding for UTF-8
*/
private static final Charset UTF8 = Charset.forName( "UTF-8" );
private final String algorithmName;
private final MessageDigest digester;
/**
* constructor
*
* @param algorithmName name of algorithm to use MD2, MD5, SHA-1, SHA-256, SHA-384, SHA-512
*/
MessageDigestHash( String algorithmName )
{
this.algorithmName = algorithmName;
MessageDigest init = null;
try
{
init = MessageDigest.getInstance( algorithmName );
}
catch ( NoSuchAlgorithmException e )
{
err.println( "Missing " + algorithmName + "support " );
System.exit( 2 );
}
digester = init;
}
/**
* identifying info
*/
public String algorithmName()
{
return algorithmName;
}
/**
* how many bits in the raw computed hash
*/
public int bits()
{
return digester.getDigestLength() * 8;
}
/**
* it need not include code to convert to string. USed for timing
*/
public void computeHash( String s )
{
digester.reset();
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
digester.update( theTextToDigestAsBytes );
@SuppressWarnings( "UnusedAssignment" ) byte[] dummy = digester.digest();
}
/**
* compute hash, must be converted to a String for collision checking
*/
public String computeHashString( String s )
{
digester.reset();
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
digester.update( theTextToDigestAsBytes );
byte[] digest = digester.digest();
return new String( digest, UTF8 );
}
}
class CmpCRC16 implements HashAlgorithm
{
/**
* generator polynomial
*/
private static final int POLY = 0x1021;/* x16 + x12 + x5 + 1 generator polynomial */
/**
* encoding for UTF-8
*/
private static final Charset UTF8 = Charset.forName( "UTF-8" );
/**
* scrambler lookup table for fast computation.
*/
private static final int[] CRC_TABLE = new int[ 256 ];
static
{
for ( int i = 0; i < 256; i++ )
{
int fcs = 0;
int d = i << 8;
for ( int k = 0; k < 8; k++ )
{
if ( ( ( fcs ^ d ) & 0x8000 ) != 0 )
{
fcs = ( fcs << 1 ) ^ POLY;
}
else
{
fcs = ( fcs << 1 );
}
d <<= 1;
fcs &= 0xffff;
}
CRC_TABLE[ i ] = fcs;
}
}
/**
* Calc 16-bit CRC with CCITT method.
*
* @param ba byte array to compute CRC on
*
* @return 16-bit CRC, unsigned
*/
private static int crc16( byte[] ba )
{
int work = 0xffff;
for ( byte b : ba )
{
work = ( CRC_TABLE[ ( b ^ ( work >>> 8 ) ) & 0xff ] ^ ( work << 8 ) ) &
0xffff;
}
return work;
}
/**
* identifying info
*/
public String algorithmName()
{
return "CMP CRC16";
}
/**
* how many bits in the raw computed hash
*/
public int bits()
{
return 16;
}
/**
* it need not include code to convert to string. Used for timing
*/
public void computeHash( String s )
{
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
int dummy = crc16( theTextToDigestAsBytes );
}
/**
* compute hash, must be converted to a String for collision checking
*/
public String computeHashString( String s )
{
final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 );
return Integer.toString( crc16( theTextToDigestAsBytes ) );
}
}