/*
 * [TestSignum.java]
 *
 * Summary: example use of java.Long.signum and alternative implementations.
 *
 * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.1 2006-02-01 use System.nanotime instead of System.CurrentTimeInMillis add wibble2 alogorithm
 *  1.2 2006-02-02 rename wibble2 to Kobzda, add signHalf
 *  1.3 2006-02-03 add piotr, more encapsulated conformance tests.
 *  1.4 2006-02-18 run entire competition twice to ensure all optimisation complete in for the
 *                 second "warm" heat.
 *                 add percent of cpuTimeToBeatInNanos
 *                 add commas to numeric printouts
 *                 cleaned up typos in comments.
 *  1.5 2006-02-19 redefine speed % so bigger is better. 100% = top contender so far. Unfortunately
 *                 it obsoletes old values when there is a new champ, so likely will leave value as is for while even if
 *                 someone gets over 100% on my hardware.
 *                 Remove benchmark results from program listings.
 *  1.6 2006-02-19 give optimisers a rest period to finish analysis and optimising after the warmup.
 *                 Emit HTML summary of results.
 *                 add lohi32 algorithm
 *  1.7 2006-02-20 drop sgn and Stephan from timing trials since they don't compute the correct results. The
 *                 code is still there if you want to hook it up again in trial. They are still part of the conformance
 *                 testing.
 *                 add glossary
 *                 adjust to discount timing overheadInNanos.
 *                 refactor to move code into BenchmarkMeasurement class, rename methods and variables
 *                 changed definition of iterations, so counts actual number of signum calls, makes it.
 *                 added elapsed vs cpu elapsedTime, and per-call elapsedTime to displays.
 *                 specify elapsedTime to beat is per-call elapsedTime.
 *  1.8 2006-02-22 add JET's machine language solution to piotr and lohi. JET got both perfect. Explanation
 *                 of why piotr beat lohi even though it takes theoretically one more machine cycle.
 */
package com.mindprod.example;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Random;

import static java.lang.System.*;

/**
 * example use of java.Long.signum and alternative implementations.
 * <p/>
 * Compare ten different ways of doing a signum.
 * <p/>
 * Results of benchmarks to find the fastest Long.signum implementation.
 * <p/>
 * BenchmarkMeasurement results are posted at http://mindprod.com/jgloss/benchmark.html Terminolgy used here:
 * <dl>
 * <p/>
 * <dt>benchmark</dt>
 * <dd>one running of this program on a given hardware/software combo. Obviously you can't do more
 * than one benchmark per run.</dd>
 * <p/>
 * <dt>trial</dt>
 * <dd>a cold or warm set of measurements.</dd>
 * <p/>
 * <dt>cold</dt>
 * <dd>a trial done immediately after starting the runtime.</dd>
 * <p/>
 * <dt>warm</dt>
 * <dd>a trial done after exercising the runtime
 * and letting it rest, so that it has opportunity to do on-the-fly optimisation before you clock it.</dd>
 * <p/>
 * <dt>measurement</dt>
 * <dd>one candidate algorithm tested over a period of seconds with repeated executions of a TEST
 * suite.</dd>
 * <p/>
 * <dt>elapsed time</dt>
 * <dd>wall clock time.</dd>
 * <p/>
 * <dt>cpu time</dt>
 * <dd>time crudely adjusted to remove cpu overhead of measuring.</dd>
 * <p/>
 * <dt>call ns</dt>
 * <dd>cpu time in ns, (not clock cycles) for one signum calculation, just
 * a few nanoseconds. For a very fast machine, this number could be less than 1.</dd>
 * <p/>
 * <dt>speed</dt>
 * <dd>speed index, aka SI. HowToProcess fast this condender is relative to the record. The record holder should be
 * about 100%, others
 * proportionately less. </dd>
 * </dl>
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.8 2006-02-22 add JET's machine language solution to piotr and lohi. JET got both perfect. Explanation
 * of why piotr beat lohi even though it takes theoretically one more machine cycle.
 * @since 2009
 */
@SuppressWarnings( { "UnusedDeclaration", "JavaDoc" } )
final class TestSignum
    {
    /**
     * how many times to run the conformance tests 20,000,000 = 20 million times, number of SIGNUM invocations to handle
     * conformance tests.
     */
    private static final int SIGNUM_CONFORMANCE_INVOCATIONS = 20000000;

    /**
     * how many times to run each set of 21 diff values for one candidate. Computed in static init.
     */
    private static final int suiteIterations;

    /**
     * HowToProcess long to rest after the warm up trial to give background optimisers elapsedTime to complete their
     * analysis.
     */
    private static final long REST_PERIOD_IN_MILLIS = 10000;

    /**
     * how many times to invoke signum for each candidate in a trial times. It can be bigger than Integer.MAX_VALUE, but
     * suiteIterations which depends on it, may not. suggest 2,000,000,000L for a live TEST, 20,000,000L to TEST.
     */
    private static final long SIGNUM_INVOCATIONS = 2000000000L;

    /**
     * crucial corner values to TEST for conformance.
     */
    @SuppressWarnings( { "NumericOverflow" } )
    private static final long testSuiteValues[] = {
            Long.MIN_VALUE,
            Long.MIN_VALUE + 1,
            Long.MIN_VALUE + Integer.MIN_VALUE - 2,
            Integer.MIN_VALUE - 1,
            Integer.MIN_VALUE,
            Integer.MIN_VALUE + 1,
            Integer.MIN_VALUE + 2,
            -2,
            -1,
            0,
            1,
            2,
            Integer.MAX_VALUE - 2,
            Integer.MAX_VALUE - 1,
            Integer.MAX_VALUE,
            Integer.MAX_VALUE + 1,
            Integer.MAX_VALUE + 2,
            Long.MAX_VALUE - 2,
            Long.MAX_VALUE - 1,
            Long.MAX_VALUE };

    // 0 decimal places, commas, rounded, locale-sensitive.
    private static final DecimalFormat df0 = new DecimalFormat( "#,##0" );

    // 3 digits, left-zero, 2 decimal places, rounded, locale-sensitive.
    private static final DecimalFormat df2 = new DecimalFormat( "#,##0.00" );

    /**
     * where we accumulate the HTML version of the results.
     */
    private static final StringBuilder html = new StringBuilder( 3000 );

    /**
     * best elapsedTime so far. Contestants are rated as a percent of that performance If you change the number of
     * iterations. Defined later in static init. Constast for entire run.
     */
    private static final double cpuTimeToBeatInNanos;

    /**
     * reigning champ's best elapsedTime to complete one signum call in fractional ns.
     */
    private static final double NANOS_PER_CALL_TO_BEAT = 1.22d;

    /**
     * dummy variable stop removal of code by optimisation.
     * <p/>
     * public helps discourage the optimiser a little.
     */
    private static int globalSum;

    /**
     * elapsedTime per candidate in nanoseconds spent with dummy algorithm that does nothing. It is recomputed for each
     * trial, after all other measurements done, but before we display the results. We would rather underestimate than
     * overestimate it. It represents the elapsedTime spent looping, fetching operands, toting up the results, call/ret
     * to signum, in other words all but computing the signum itself.
     */
    private static long overheadInNanos;

    /**
     * compute other run invariants now we have configurables nailed down. This
     * code must follow static finals.
     */
    static
        {
        // compute suiteIterations, number of TEST suites to run.
        long proposedSuiteIterations =
                SIGNUM_INVOCATIONS / testSuiteValues.length;
        if ( proposedSuiteIterations >= Integer.MAX_VALUE )
            {
            throw new IllegalArgumentException( "SIGNUM_INVOCATIONS is too big." );
            }
        suiteIterations = ( int ) proposedSuiteIterations;
        cpuTimeToBeatInNanos =
                ( long ) ( NANOS_PER_CALL_TO_BEAT * SIGNUM_INVOCATIONS );
        }

    /**
     * TEST all four signum-like algorithms for conformance. To pass must return a + int for a positive long, a -ve int
     * for a negative long and 0 for a zero long.
     *
     * @param diff number to be collapsed to an int preserving sign and zeroness.
     */
    private static void checkConformanceOfAllCandidates( long diff )
        {
        // overhead deliberately does not conform. We leave item out.
        checkConformanceOfOneValue( kobzda( diff ), diff, "kobzda" );
        checkConformanceOfOneValue( lohi32( diff ), diff, "lohi32" );
        checkConformanceOfOneValue( piotr( diff ), diff, "piotr" );
        checkConformanceOfOneValue( sgn( diff ), diff, "sgn" );
        checkConformanceOfOneValue( signHalf( diff ), diff, "signHalf" );
        checkConformanceOfOneValue( signOf( diff ), diff, "signOf" );
        checkConformanceOfOneValue( stephan( diff ), diff, "stephan" );
        checkConformanceOfOneValue( Long.signum( diff ),
                diff,
                "Sun Long.signum" );
        checkConformanceOfOneValue( twoifs( diff ), diff, "twoifs" );
        checkConformanceOfOneValue( wibble( diff ), diff, "Wibble" );
        }

    /**
     * check that algorithm conforms to the gold standard signum with two ifs
     *
     * @param answer    answer TEST algorithm gave
     * @param diff      TEST number
     * @param algorithm name of algorithm
     */
    private static void checkConformanceOfOneValue( int answer,
                                                    long diff,
                                                    String algorithm )
        {
        // we don't insist on exact signum answer only than signum of answer be
        // correct.
        if ( twoifs( answer ) != twoifs( diff ) )
            {
            err.println( algorithm
                         + " fails at "
                         + df0.format( diff )
                         + " giving "
                         + df0.format( answer ) );
            }
        }

    /**
     * emit HTML to head this trail
     *
     * @param platform         short name for the platform
     * @param hardwarePlatform description of the hardware
     * @param softwarePlatform description of the software, e.g. OS, JVM
     */
    private static void echoCommandLineHTML( String platform,
                                             String hardwarePlatform,
                                             String softwarePlatform )
        {
        html.append( "" );
        html.append( "\n<tr><th colspan=\"8\"></th></tr><tr>" );
        html.append( "<th colspan=\"8\" class=\"bat\" align=\"left\">" );
        html.append( "<span class=\"exe\">&quot;java.exe&quot;</span>\n" );
        html.append( "com.mindprod.example.TestSignum\n" );
        html.append( "<span class=\"string\">&quot;" );
        html.append( platform );
        html.append( "&quot;</span>\n" + "<span class=\"string\">&quot;" );
        html.append( hardwarePlatform );
        html.append( "&quot;</span>\n" + "<span class=\"string\">&quot;" );
        html.append( softwarePlatform );
        html.append( "&quot;</span></th>\n" + "</tr>\n" );
        }

    /**
     * emit HTML to head this trail
     *
     * @param platform         short name for the platform
     * @param hardwarePlatform description of the hardware
     * @param softwarePlatform description of the software, e.g. OS, JVM
     * @param temperature      usually "Warm" or "Cold" or any other descriptive String for benchmark.
     */
    private static void emitTrialHeaderHTML( String platform,
                                             String hardwarePlatform,
                                             String softwarePlatform,
                                             String temperature )
        {
        html.append( "\n<tr><th rowspan=\"2\" align=\"left\" style=\"background:" );
        html.append( temperature.equalsIgnoreCase( "cold" )
                     ? "green"
                     : "orange" );
        html.append( "\">" );
        html.append( platform );
        html.append( "</th>\n" + "<th colspan=\"3\" align=\"left\">" );
        html.append( hardwarePlatform );
        html.append( "</th>\n" + "<th colspan=\"3\" align=\"left\">" );
        html.append( softwarePlatform );
        html.append( "</th>\n" + "<th rowspan=\"2\" align=\"center\" style=\"background:" );
        html.append( temperature.equalsIgnoreCase( "cold" )
                     ? "green"
                     : "orange" );
        html.append( "\">" );
        html.append( temperature );
        html.append( "</th>\n" + "</tr>\n" );
        html.append( BenchmarkMeasurement.toHTMLHeading() );
        }

    /**
     * aux method for stephan
     *
     * @param left  first of two numbers to compare
     * @param right second of two number to compare
     *
     * @return 1 if  first is less than the second.
     */
    private static int isLessThan( final long left, final long right )
        {
        return ( int ) -( ( left - right ) >> 63 );
        }

    /**
     * alternate to signum for use in compare. Not a true signum, since it returns ints other than +-1. Where there is
     * any possibility of overflow, you should compare two longs with < rather than subtraction.
     *
     * @param diff number to be collapsed to an int preserving sign and zeroness. usually represents the difference of
     *             two long.
     *
     * @return sign of diff, some -ve int, 0 or some -ve int.
     * @author Peter Kobzda, who came up with it 17 hours before Wibble, whom I earlier gave the attribution to.
     */
    private static int kobzda( long diff )
        {
        // the beauty of this method is most of the work is in int, not long
        // which works well on 32-bit machines.
        return ( int ) ( diff >>> 32 ) | ( int ) diff >>> 1 | ( int ) diff & 1;
        }

    /**
     * Pads the string out to the given length by applying blanks on the left. Method stolen and simplified from
     * com.mindprod.common18.ST to make this class standalone for pedagogical purposes.
     *
     * @param s      String to be padded/chopped.
     * @param newLen length of new String desired.
     * @param chop   true if Strings longer than newLen should be truncated to newLen chars.
     *
     * @return String padded on left/chopped to the desired length.
     */
    @SuppressWarnings( { "SameParameterValue" } )
    private static String leftPad( String s, int newLen, boolean chop )
        {
        int grow = newLen - s.length();
        if ( grow <= 0 )
            {
            if ( chop )
                {
                return s.substring( 0, newLen );
                }
            else
                {
                return s;
                }
            }
        else
            {
            // com.mindprod.common18.ST handles problem of out of range
            // subscript here.
            return "                      ".substring( 0, grow ) + s;
            }
        } // end leftPad

    /**
     * alternate to signum for use in compare. Not a true signum, since it returns ints other than +-1. Where there is
     * any possibility of overflow, you should compare two longs with < rather than subtraction.
     *
     * @param diff number to be collapsed to an int preserving sign and zeroness. usually represents the difference of
     *             two long.
     *
     * @return sign of diff, some -ve int, 0 or some -ve int.
     * @author Roedy Green, Canadian Mind Products
     */
    private static int lohi32( long diff )
        {
        /**
         * <pre>
         *   I designed this algorithm with the 32-bit Pentium architecture in mind.
         *   It makes no sense for a 64-bit machine.
         *   I am trying to coax the optimiser into generating inline
         *   Pentium code like this that would take only 3 clock cycles:
         *   diff=edx:eax  result=eax
         *   TEST edx,edx ; TEST hi 32 bits
         *   jz isZero    ; lo is the result, already eax=lo
         *   mov eax,edx  ; result is hi
         * isZero:
         *  JET used the stack rather than edx:eax for parameter passing,
         *  but other than that it hit got my intended algorithm bang on.
         * </pre>
         */
        final int hi = ( int ) ( diff >>> 32 );
        if ( hi != 0 )
            {
            return hi;
            }
        else
            {
            return ( int ) diff;
            }
        }

    /**
     * Time the Kobzda signum implementation, formerly known as wibble2
     *
     * @return nanoseconds elapsed time to run the testSuite for this candidate.
     */
    private static long measureKobzda()
        {
        int localSum = 0;
        long start = System.nanoTime();
        for ( int i = 0; i < suiteIterations; i++ )
            {
            for ( long diff : testSuiteValues )
                {
                localSum += kobzda( diff );
                }
            }
        globalSum += localSum;
        return System.nanoTime() - start;
        }

    /**
     * Time the lohi32 signum implementation
     *
     * @return nanoseconds elapsed time to run the testSuite for this candidate.
     */
    private static long measureLoHi32()
        {
        int localSum = 0;
        long start = System.nanoTime();
        for ( int i = 0; i < suiteIterations; i++ )
            {
            for ( long diff : testSuiteValues )
                {
                localSum += lohi32( diff );
                }
            }
        globalSum += localSum;
        return System.nanoTime() - start;
        }

    /**
     * Time the how much overheadInNanos there is in measuring
     *
     * @return nanoseconds elapsed time to run the testSuite for this candidate.
     */
    private static long measureOverhead()
        {
        int localSum = 0;
        long start = System.nanoTime();
        for ( int i = 0; i < suiteIterations; i++ )
            {
            for ( long diff : testSuiteValues )
                {
                // Overhead will be underestimated if compiler
                // precomputes localSum.
                // We would prefer to underestimate that overestimate.
                // We are sightly overestimating the overheadInNanos by
                // including an
                // long to int conversion, which should be optimised away.
                // If we were to leave item out, the overheadInNanos of
                // maintaining diff
                // for the loop would be lost.
                localSum += ( int ) diff;
                }
            }
        globalSum += localSum;
        return System.nanoTime() - start;
        }

    /**
     * Time the piotr signum implementation
     *
     * @return nanoseconds elapsed time to run the testSuite for this candidate.
     */
    private static long measurePiotr()
        {
        int localSum = 0;
        long start = System.nanoTime();
        for ( int i = 0; i < suiteIterations; i++ )
            {
            for ( long diff : testSuiteValues )
                {
                localSum += piotr( diff );
                }
            }
        globalSum += localSum;
        return System.nanoTime() - start;
        }

    /**
     * Time the signOf signum implementation
     *
     * @return nanoseconds elapsed time to run the testSuite for this candidate.
     */
    @SuppressWarnings( "unused" )
    private static long measureSgn()
        {
        int localSum = 0;
        long start = System.nanoTime();
        for ( int i = 0; i < suiteIterations; i++ )
            {
            for ( long diff : testSuiteValues )
                {
                localSum += sgn( diff );
                }
            }
        globalSum += localSum;
        return System.nanoTime() - start;
        }

    /**
     * Time the signHalf signum implementation
     *
     * @return nanoseconds elapsed time to run the testSuite for this candidate.
     */
    private static long measureSignHalf()
        {
        int localSum = 0;
        long start = System.nanoTime();
        for ( int i = 0; i < suiteIterations; i++ )
            {
            for ( long diff : testSuiteValues )
                {
                localSum += signHalf( diff );
                }
            }
        globalSum += localSum;
        return System.nanoTime() - start;
        }

    /**
     * Time the signOf signum implementation
     *
     * @return nanoseconds elapsed time to run the testSuite for this candidate.
     */
    private static long measureSignOf()
        {
        int localSum = 0;
        long start = System.nanoTime();
        for ( int i = 0; i < suiteIterations; i++ )
            {
            for ( long diff : testSuiteValues )
                {
                localSum += signOf( diff );
                }
            }
        globalSum += localSum;
        return System.nanoTime() - start;
        }

    /**
     * Time the Stephan's signum implementation
     *
     * @return nanoseconds elapsed time to run the testSuite for this candidate.
     */
    @SuppressWarnings( "unused" )
    private static long measureStephan()
        {
        int localSum = 0;
        long start = System.nanoTime();
        for ( int i = 0; i < suiteIterations; i++ )
            {
            for ( long diff : testSuiteValues )
                {
                localSum += stephan( diff );
                }
            }
        globalSum += localSum;
        return System.nanoTime() - start;
        }

    /**
     * Time the sun's signum implementation
     *
     * @return nanoseconds elapsed time to run the testSuite for this candidate.
     */
    private static long measureSun()
        {
        int localSum = 0;
        long start = System.nanoTime();
        for ( int i = 0; i < suiteIterations; i++ )
            {
            for ( long diff : testSuiteValues )
                {
                // Sun's Long.signum is implemented as:
                // (int) ((i >> 63) | (-i >>> 63));
                localSum += Long.signum( diff );
                }
            }
        globalSum += localSum;
        return System.nanoTime() - start;
        }

    /**
     * Time the standard signum implementation
     *
     * @return nanoseconds elapsed time to run the testSuite for this candidate.
     */
    private static long measureTwoifs()
        {
        int localSum = 0;
        long start = System.nanoTime();
        for ( int i = 0; i < suiteIterations; i++ )
            {
            for ( long diff : testSuiteValues )
                {
                localSum += twoifs( diff );
                }
            }
        globalSum += localSum;
        return System.nanoTime() - start;
        }

    /**
     * Time the wibble signum implementation
     *
     * @return nanoseconds elapsed time to run the testSuite for this candidate.
     */
    private static long measureWibble()
        {
        int localSum = 0;
        long start = System.nanoTime();
        for ( int i = 0; i < suiteIterations; i++ )
            {
            for ( long diff : testSuiteValues )
                {
                localSum += wibble( diff );
                }
            }
        globalSum += localSum;
        return System.nanoTime() - start;
        }

    /**
     * alternate to signum for use in compare. Not a true signum, since it returns ints other than +-1. Where there is
     * any possibility of overflow, you should compare two longs with < rather than subtraction. In Pentium assembler
     * you could implement this algorthm with following code:
     * <p/>
     * <p/>
     * <p/>
     * <pre>
     *  diff = edx:eax result = eax
     *  mov ebx,eax
     *  shl eax,1
     *  or  eax,ebx
     *  slr eax,1
     *  or  eax,edx
     *  which would take 5 cycles, 2 more that lohi.  However, JET did even
     * better,
     *  with code essentially this using a clever trick to implement piotr.
     *   lea    ecx,0(eax,eax)  ; shifts lo left by doubling, keeps copy of lo
     *   or     eax,ecx
     *   shr    eax,1
     *   or     eax,edx
     *  This is 4 cycles, still one more than lohi. Why was Piotr so much
     * faster
     * on JET?
     *  Piotr has no pipeline-confounding jumps. Further, the lo then high
     * operands actually
     *  come from the ram-based stack. Piotr nicely separates the accesses
     * giving plenty of
     *  for pre-emptive fetch of hi. lohi insists on having them both upfront,
     * so it has to wait
     *  for memory access. Piotr does not have to wait.
     *  Modern CPUS hurry up and wait for RAM most of the time.
     * </pre>
     *
     * @param diff number to be collapsed to an int preserving sign and zeroness. usually represents the difference of
     *             two long.
     *
     * @return sign of diff, some -ve int, 0 or some -ve int.
     * @author Peter Kobzda
     */
    private static int piotr( long diff )
        {
        return ( int ) ( diff >>> 32 ) | ( ( int ) diff | ( int ) diff << 1 ) >>> 1;
        }

    /**
     * Thomas Hawtin's algorithm Alternate to signum for use in compare. Not a true signum, since it returns ints other
     * than +-1. Where there is any possibility of overflow, you should compare two longs with < rather than
     * subtraction.
     *
     * @param diff number to be collapsed to an int preserving sign and zeroness. usually represents the difference of
     *             two long.
     *
     * @return sign of diff, some -ve int, 0 or some -ve int.
     * @author Thomas Hawtin
     */
    private static int sgn( long diff )
        {
        return ( ( int ) ( diff >> 63 ) ) | ( ( int ) ( ( ~diff ) >>> 63 ) );
        }

    /**
     * alternate to signum for use in compare. Not a true signum, since it returns ints other than +-1. Where there is
     * any possibility of overflow, you should compare two longs with < rather than subtraction.
     *
     * @param diff number to be collapsed to an int preserving sign and zeroness. usually represents the difference of
     *             two long.
     *
     * @return sign of diff, some -ve int, 0 or some -ve int.
     * @author Peter Kobzda
     */
    private static int signHalf( long diff )
        {
        return ( diff <= 0 ) ? ( int ) ( diff >>> 32 ) : 1;
        }

    /**
     * alternate to signum for use in compare. Not a true signum, since it returns ints other than +-1. Where there is
     * any possibility of overflow, you should compare two longs with < rather than subtraction.
     *
     * @param diff number to be collapsed to an int preserving sign and zeroness. usually represents the difference of
     *             two long.
     *
     * @return sign of diff, some -ve int, 0 or some -ve int.
     * @author Roedy Green, Canadian Mind Products
     */
    private static int signOf( long diff )
        {
        return ( diff != 0 ) ? ( ( int ) ( diff >>> 32 ) | 1 ) : 0;
        }

    /**
     * stephan Ram's signum algorithm
     *
     * @param diff usually represents the difference of two long.
     *
     * @return signum of diff, +1, 0 or -1.
     * @author Stephan Ram
     */
    private static int stephan( long diff )
        {
        return isLessThan( 0, diff ) - isLessThan( diff, 0 );
        }

    /**
     * Run a suite of tests on all candidates, both conformance and speed. We run this twice.
     *
     * @param platform         short description of platform
     * @param hardwarePlatform description of the hardware
     * @param softwarePlatform description of the software, e.g. OS, JVM
     * @param temperature      usually "Warm" or "Cold" or any other descriptive String for benchmark.
     */
    private static void trial( String platform,
                               String hardwarePlatform,
                               String softwarePlatform,
                               String temperature )
        {
        out.println();
        out.println( "---- "
                     + hardwarePlatform
                     + " : "
                     + softwarePlatform
                     + " : "
                     + temperature
                     + " ----" );
        emitTrialHeaderHTML( platform,
                hardwarePlatform,
                softwarePlatform,
                temperature );
        out.println( "Checking conformance on crucial corner values..." );
        for ( long diff : testSuiteValues )
            {
            checkConformanceOfAllCandidates( diff );
            }
        out.println( "Checking conformance on "
                     + df0.format( SIGNUM_CONFORMANCE_INVOCATIONS )
                     + " random longs..." );
        Random wheel = new Random();
        for ( int i = 0; i < SIGNUM_CONFORMANCE_INVOCATIONS; i++ )
            {
            checkConformanceOfAllCandidates( wheel.nextLong() );
            }
        out.println( "Running timing tests with "
                     + df0.format( SIGNUM_INVOCATIONS )
                     + " signum invocations per candidate." );
        ArrayList<BenchmarkMeasurement> results =
                new ArrayList<>( 15 );
        results.add( new BenchmarkMeasurement( measureKobzda(), "kobzda" ) );
        results.add( new BenchmarkMeasurement( measureLoHi32(), "lohi32" ) );
        results.add( new BenchmarkMeasurement( measurePiotr(), "piotr" ) );
        // sgn gives wrong answers
        // results.add( new BenchmarkMeasurement( measureSgn(), "sgn" ) );
        results.add( new BenchmarkMeasurement( measureSignHalf(),
                "signHalf" ) );
        results.add( new BenchmarkMeasurement( measureSignOf(), "signOf" ) );
        // stephan gives wrong answers
        // results.add( new BenchmarkMeasurement( measureStephan(), "stephan" )
        // );
        results
                .add( new BenchmarkMeasurement( measureSun(),
                        "Sun Long.signum" ) );
        results.add( new BenchmarkMeasurement( measureTwoifs(), "twoifs" ) );
        results.add( new BenchmarkMeasurement( measureWibble(), "wibble" ) );
        // finish off line of dots
        out.println();
        // measure elapsedTime spent with dummy algorithm that does nothing
        // recomputed for each trial
        overheadInNanos = measureOverhead();
        out.println(
                "Timing overheadInNanos, (included below in elapsed time but not cpu time),\n"
                + "is "
                + df0.format( overheadInNanos )
                + " nanoseconds per candidate."
        );
        /* sort fastest first, in ascending order my elapsedTime. */
        Collections.sort( results );
        /* print BenchmarkMeasurement results, fastest first */
        out.println( BenchmarkMeasurement.toStringHeading() );
        for ( BenchmarkMeasurement result : results )
            {
            // display results for a contender
            out.println( result );
            // prepare html version of results for a contender
            html.append( result.toHTML() );
            }
        out.println( "finished trial" );
        }

    /**
     * Collapse number down to +1 0 or -1 depending on sign. Typically used in compare routines to collapse a difference
     * of two longs to an int.
     *
     * @param diff number to be collapsed to an int preserving sign and zeroness. usually represents the difference of
     *             two long.
     *
     * @return true signum of diff, +1, 0 or -1.
     * @author Roedy Green, Canadian Mind Products
     */
    private static int twoifs( long diff )
        {
        if ( diff > 0 )
            {
            return 1;
            }
        if ( diff < 0 )
            {
            return -1;
            }
        else
            {
            return 0;
            }
        }

    /**
     * alternate to signum for use in compare. Not a true signum, since it returns ints other than +-1. Where there is
     * any possibility of overflow, you should compare two longs with < rather than subtraction.
     *
     * @param diff number to be collapsed to an int preserving sign and zeroness. usually represents the difference of
     *             two long.
     *
     * @return sign of diff, some -ve int, 0 or some -ve int.
     * @author wibble
     */
    private static int wibble( long diff )
        {
        return ( int ) ( diff >>> 32 )
               | ( ( int ) diff >>> 1 ) & Integer.MAX_VALUE
               | ( int ) diff & 1;
        }

    /**
     * Test harness to compare the various signum methods.
     *
     * @param args * [0] = platform parameter in quotes, a short description of the platform you are testing, e.g. JET.
     *             <p/>
     *             [1] = hardware platform parameter in quotes, a description of the platform you are testing.
     *             <p/>
     *             e.g. "Athlon XP 2000+ 1.692 GHz 512MB"
     *             <p/>
     *             [2] = software platform description e.g. "Win2K Sun java.exe 1.5 -server"
     */
    public static void main( String[] args )
        {
        if ( args.length != 3 )
            {
            throw new IllegalArgumentException( "Must have "
                                                + "\"short platform desc\" "
                                                + "\"hardware platform desc\" "
                                                + "\"software platform desc\" "
                                                + "on command line." );
            }
        String platform = args[ 0 ].trim();
        String hardwarePlatform = args[ 1 ].trim();
        String softwarePlatform = args[ 2 ].trim();
        echoCommandLineHTML( platform, hardwarePlatform, softwarePlatform );
        // run a cold set of tests, HotSpot style optimisers will kick in
        // presumably some elapsedTime during this TEST, or during the rest
        // after they have gathered enough statistics.
        trial( platform, hardwarePlatform, softwarePlatform, "Cold" );
        // Rest for a short while to give background, low priority optimising
        // threads elapsedTime to catch up and optimise.
        out.println( "Resting to give optimisers a chance..." );
        try
            {
            Thread.sleep( REST_PERIOD_IN_MILLIS );
            }
        catch ( InterruptedException e )
            {
            out.println( "awakened prematurely" );
            }
        // by now, optimisations should be complete.
        trial( platform, hardwarePlatform, softwarePlatform, "Warm" );
        // display the HTML summary on the console,
        // not to a file to simplify use without configuring.
        // You can redirect item to files with append etc. as you see fit.
        err.println( html );
        out.println( "all done" );
        }

    /**
     * This is how Sun's Long.signum is implemented.
     *
     * @param diff usually represents the difference of two long.
     *
     * @return signum of diff, +1, 0 or -1.
     * @author Lee Boynton of Sun
     */
    public static int sunSignum( long diff )
        {
        return ( int ) ( ( diff >> 63 ) | ( -diff >>> 63 ) );
        }

    /**
     * nested static class for results from timing one candidate algorithm
     */
    static final class BenchmarkMeasurement implements Comparable<BenchmarkMeasurement>
        {
        /**
         * name of algorithm
         */
        final String contender;

        /**
         * raw execution elapsedTime in nanoseconds, unadjusted for overheadInNanos
         */
        final long elapsedTime;

        /**
         * constructor
         *
         * @param elapsedTime nanoseconds this benchmark took, raw elapsedTime, no overheadInNanos adjustment.
         * @param contender   name of the candidate algorithm
         */
        BenchmarkMeasurement( long elapsedTime, String contender )
            {
            // let user know something is happening
            out.print( '.' );
            this.elapsedTime = elapsedTime;
            this.contender = contender;
            }

        /**
         * Display a html header row suitable for heading up toHTML one trial.
         *
         * @return HTML for one table row header describing 5 values, elapsed, cpu, per call, speed percentage and
         * contender, with room later for notes.
         */
        public static String toHTMLHeading()
            {
            return ""
                   + "\n<tr>"
                   // blank col supplied by line above.
                   + "<th>elapsed ns</th>\n"
                   + "<th>cpu ns</th>\n"
                   + "<th>call ns</th>\n"
                   + "<th>SI</th>\n"
                   + "<th>candidate</th>\n"
                   + "<th>notes</th>\n"
                   + "</tr>\n";
            }

        /**
         * prepare a string to act as header for this.toString
         *
         * @return headings for five columns, describing one candidate's results for one trial: elapsed, cpu, per call,
         * speed percentage and contender.
         */
        public static String toStringHeading()
            {
            // _______elapsed_ns_:____________cpu_ns_:_call_ns_:_____SI_:_candidate
            // ----1,932,157,579_:_------932,157,579_:_---4.66_:_-64.37_:_piotr
            // --------14-------___-----------14----___---7--___---6---___
            return ""
                   + "       elapsed ns : "
                   + "           cpu ns : "
                   + "call ns : "
                   + "    SI : "
                   + "candidate";
            }

        /**
         * Sort by elaspedTime, fastest first.
         * Defines default the sort order for BenchmarkMeasurement Objects.
         * Compare this BenchmarkMeasurement with another BenchmarkMeasurement.
         * Compares elapsedTime.
         * Informally, returns (this-other) or +ve if this is more positive than other.
         *
         * @param other other BenchmarkMeasurement to compare with this one
         *
         * @return +ve if this&gt;other, 0 if this==other, -ve if this&lt;other
         */
        public final int compareTo( BenchmarkMeasurement other )
            {
            return Long.signum( this.elapsedTime - other.elapsedTime );
            }

        /**
         * Get speed as a percent of the current contender. usually 0..100, adjusted for overheadInNanos and
         * cpuTimeToBeatInNanos.
         *
         * @return speed
         */
        public double getSpeedIndex()
            {
            // Rate candidate as percent of nominal best performance.
            // bigger is better.
            // 100% is the current record holder.
            return ( cpuTimeToBeatInNanos / ( double ) ( this
                                                                 .elapsedTime - overheadInNanos ) ) * 100.0d;
            }

        /**
         * Display benchmark result in as a HTML table row for one contender in one trial.
         *
         * @return HTML for one table row describing 5 values, elapsed, cpu, per call, speed percentage and contender.
         */
        public String toHTML()
            {
            // display total elapsed elapsedTime in a fixed width field
            String elapsedNanosToDisplay = df0.format( this.elapsedTime );
            // display total cpu elapsedTime in a fixed width field
            String cpuNanosToDisplay =
                    df0.format( this.elapsedTime - overheadInNanos );
            // display cpu/elapsedTime per signum call in a fixed width field
            String cpuPerInvocationNanosToDisplay = df2
                    .format( ( double ) ( this.elapsedTime - overheadInNanos )
                             / ( double ) SIGNUM_INVOCATIONS );
            String speedToDisplay = df2.format( this.getSpeedIndex() );
            return ""
                   + "<tr><td></td><td>"
                    /* indent one col */
                   + elapsedNanosToDisplay
                   + "</td><td>"
                   + cpuNanosToDisplay
                   + "</td><td>"
                   + cpuPerInvocationNanosToDisplay
                   + "</td><td>"
                   + speedToDisplay
                   + "</td><td>"
                   + contender
                   + "</td></tr>\n"/* last col not used */;
            }

        /**
         * Display benchmark result in a human-comprehensible String, for one contender in one trial.
         *
         * @return comma decorated String describing elapsed, cpu, per call, speed percentage and contender. Time is
         * always 17 chars wide and speed is always 7, not counting colons and spaces to separate fields.
         */
        public String toString()
            {
            // _______elapsed_ns_:____________cpu_ns_:_call_ns_:_____SI_:_candidate
            // ----1,932,157,579_:_------932,157,579_:_---4.66_:_-64.37_:_piotr
            // --------14-------___-----------14----___---7--___---6---___
            // display total elapsed elapsedTime in a fixed width field
            String elapsedNanosToDisplay = df0.format( this.elapsedTime );
            elapsedNanosToDisplay = leftPad( elapsedNanosToDisplay, 17, false );
            // display total cpu elapsedTime in a fixed width field
            String cpuNanosToDisplay =
                    df0.format( this.elapsedTime - overheadInNanos );
            cpuNanosToDisplay = leftPad( cpuNanosToDisplay, 17, false );
            // display cpu/elapsedTime per signum call in a fixed width field
            String cpuPerInvocationNanosToDisplay = df2
                    .format( ( double ) ( this.elapsedTime - overheadInNanos )
                             / ( double ) SIGNUM_INVOCATIONS );
            cpuPerInvocationNanosToDisplay =
                    leftPad( cpuPerInvocationNanosToDisplay, 7, false );
            String speedToDisplay = df2.format( this.getSpeedIndex() );
            speedToDisplay = leftPad( speedToDisplay, 6, false );
            return elapsedNanosToDisplay
                   + " : "
                   + cpuNanosToDisplay
                   + " : "
                   + cpuPerInvocationNanosToDisplay
                   + " : "
                   + speedToDisplay
                   + " : "
                   + this
                           .contender;
            }
        }
    }