/*
 * [Compactor.java]
 *
 * Summary: Compacts HTML by removing unnecessary white space.
 *
 * Copyright: (c) 2006-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  2.2 2006-03-15 Suppress IntelliJ Code Analyse that wants to make this default scope.
 *  2.3 2008-02-15 complete rewrite, mainly to handle removing space around <dt> <li> <h?> and <td> tags.
 *  2.4 2008-02-15 add more tags that get trimmed. Charge $10
 *  2.5 2008-02-28 tighter removal of whitespace surrounding comments.
 *  2.6 2008-02-28 optionally allow comments to be stripped out
 *                 entirely. Preserve some space around configurable magic
 *                 macro comments that expand into text such as <!# SSI or
 *                 <!-- macro.
 *  2.7 2008-07-27 remove all space just before > in a tag. < space will convert to &lt; space.
 *  2.8 2009-04-04 no longer correct missing entities. Just issue warning messages.
 *  2.9 2010-01-18 refactor so you first allocate a Compactor object, permitting simultaneous compactings.
 *  3.0 2010-02-12 trim space inside <p>..</p>.
 *  3.1 2010-12-21 avoid touching JavaScript and other scripts.
 *  3.2 2010-12-24 handle <script and <?
 *  3.3 2011-11-15 add compactStringAsNeeded
 *  3.4 2012-06-18 no longer strip \n in front of <script
 *  3.5 2012-10-27 show more context where there is an error.
 *  3.6 2013-03-01 no longer complain about unescaped " in text.
 *  3.7 2014-04-21 now allow *.htmlfrag files.
 *  3.8 2014-06-11 add HTML5 tags. Simplify tag categorisation code.
 *  3.9 2014-07-26 internal simplification. Now preserves space both before and after comment which may be significant.
 *                 Now always compact space inside comments, even macro comments.
 */
package com.mindprod.compactor;

import com.mindprod.commandline.CommandLine;
import com.mindprod.common18.EIO;
import com.mindprod.filter.AllButSVNDirectoriesFilter;
import com.mindprod.filter.ExtensionListFilter;
import com.mindprod.hunkio.HunkIO;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Pattern;

import static java.lang.System.*;

/**
 * Compacts HTML by removing unnecessary white space.
 * <p/>
 * We always compact whitespace inside and outside comments.
 * <p/>
 * We don't consolidate tags. e.g.
 * &lt;span class=&quot;x&quot;&gt;this &lt;/span&gt;&lt;span class=&quot;x&quot;&gt;and that&lt;/span&gt; can be
 * collapsed
 * to &lt;span class=&quot;x&quot;&gt;this and that&lt;/span&gt;.
 * <p/>
 * We don't convert tags to lower case e.g. &lt;BR&gt; to &lt;br&gt;
 * <p/>
 * We leave all comments in place. If ever such a feature is implemented, it must
 * not strip SSI comments. It may or may not leave macro comments.
 * <p/>
 * We do not remove macro generations. You can do that with StripGenerated.
 * <p/>
 * We do not remove the macro comments.
 * <p/>
 * We remove space and NLs on the right of &lt;div&gt;&lt;dt&gt;&lt;li&gt;&lt;h?&gt;&lt;ol&gt;&lt;table&gt;&lt;
 * tbody&gt;&lt;td&gt;&lt;th&gt;&lt;thead&gt;&lt;tr&gt;&lt;ul&gt;  tags.
 * <p/>
 * We remove space and NLs on the left of &lt;/div&gt;&lt;/dt&gt;&lt;/li&gt;&lt;/h?&gt;&lt;/ol&gt;&lt;/table&gt;&lt;
 * /tbody&gt;&lt;/td&gt;&lt;/th&gt;&lt;/thead&gt;&lt;/tr&gt;&lt;/ul&gt; tags.
 * <p/>
 * We always remove lead and trailing spaces from lines.
 * <p/>
 * We compact spaces inside HTML text, tags and comments.
 * <p/>
 * We leave spaces as is inside &lt;pre&gt;...&lt;/pre&gt; and inside quoted tag parameters.
 * <p/>
 * We convert &quot; to &amp;quot; &gt; to &amp;gt; when used in raw text.
 * <p/>
 * We don't tokenize to convert to CBF, compact binary format. The catch here is web
 * browsers can't read the result without a plug-in. This would result in a major
 * compaction. Perhaps the XML folk will eventually get disgusted with their obese
 * format and XHTML can inherit a now compact form.
 * <p/>
 * We don't do any LZW compression. the catch is, browsers can't read this without a
 * special plug-in.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 3.9 2014-07-26 internal simplification. Now preserves space both before and after comment which may be significant.
 * @see com.mindprod.compactor.HTMLState
 * @since 2006
 */
public class Compactor
    {
    /**
     * keep macros and SSI and PAD comments.
     * Matches optional whitespace followed by one of macro, generated, /generated, alias, cross, PAD, /PAD,
     * or else a single #. Passed to HTMLState.compactString by compactStringKeepingMacrosStrippingComments.
     */
    static final Pattern MACRO_PATTERN = Pattern.compile( "(?:\\s*(?:macro|generated|/generated|alias|cross|PAD|/PAD))|#" );

    /**
     * <!--# SSI comment pattern, also keep <!-- PAD Program_Version for Canadian Sales Tax Calculator -->4.4<!-- /PAD -->
     * Matches optional whitespace followed by PAD or /PAD, or else a single #.
     * Passed to HTMLState.compactString by compactStringStrippingMacrosAndComments.
     */
    static final Pattern JUST_SSI_PATTERN = Pattern.compile( "(?:\\s*(?:PAD|/PAD))|#" );

    // todo: sometimes leaves \n between comments. Sometimes does not.  Should be consistent.
    private static final int FIRST_COPYRIGHT_YEAR = 2006;

    /**
     * undisplayed copyright notice
     */
    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String EMBEDDED_COPYRIGHT =
            "Copyright: (c) 2006-2017 Roedy Green, Canadian Mind Products, http://mindprod.com";

    /**
     * date this version was released.
     */
    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String RELEASE_DATE = "2014-07-26";

    /**
     * how to use the command line
     */
    private static final String USAGE = "\nCompactor needs a filename.html or a space-separated list of filenames, " +
                                        "with optional -s -q -v switches.";

    /**
     * embedded version string.
     */
    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String VERSION_STRING = "3.9";

    /**
     * constructor. The class holds no instance state; all the working methods are static.
     */
    public Compactor()
        {
        }

    /**
     * compact and tidy one file, in place.
     * <p/>
     * The file is read in its entirety and compacted keeping macros and comments. If the compactor hands back
     * the very same String instance, the file is left untouched. Otherwise the result is written to a temporary
     * file which then replaces the original via HunkIO.deleteAndRename.
     * <p/>
     * Files whose extension is not html, htm or htmlfrag are rejected with a message on System.err.
     *
     * @param quiet              true if want progress messages suppressed
     * @param fileBeingProcessed the file currently being processed.
     *
     * @throws IOException if the file cannot be read, or the compacted result cannot be written.
     * @noinspection WeakerAccess, SameParameterValue, StringEquality
     */
    public static void compactFile( boolean quiet, File fileBeingProcessed ) throws IOException
        {
        if ( !quiet )
            {
            out.print( "  compacting " + fileBeingProcessed.getName() + " " );
            }
        switch ( EIO.getExtension( fileBeingProcessed ) )
            {
            // this is just a double check on ExtensionListFilter in main
            case "html":
            case "htm":
            case "htmlfrag":
                break;
            default:
                // leading space on the last fragment keeps the file name from running into "not".
                err.println( "Cannot compact: "
                             + fileBeingProcessed.getName()
                             + " not .html .htm .htmlfrag file" );
                return;
            }
        final String big = HunkIO.readEntireFile( fileBeingProcessed );
        // we don't allow stripping macros and comments. Doing it to original is dangerous without StripGenerated
        // balance checking
        final String result = compactStringKeepingMacrosAndComments( big, fileBeingProcessed.getPath() );
        // use == not equals() because compare already done in compactStringKeepingMacrosAndComments.
        if ( result == big )
            {
            // nothing changed. No need to write results.
            if ( !quiet )
                {
                out.println( "-" );
                }
            return;
            }
        // generate output into a temporary file until we are sure all is ok.
        // create a temp file in the same directory as filename
        if ( !quiet )
            {
            out.println( "*" );
            }
        final File tempFile = HunkIO.createTempFile( "temp_", ".tmp", fileBeingProcessed );
        // try-with-resources guarantees the writer is closed even if write fails,
        // and that it is closed before the rename below.
        try ( FileWriter emit = new FileWriter( tempFile ) )
            {
            emit.write( result );
            }
        HunkIO.deleteAndRename( tempFile, fileBeingProcessed );
        } // end compactFile

    /**
     * compact a String as needed
     *
     * @param uncompacted uncompacted string
     * @param where       where this string came from, used in error messages to help you track down source
     * @param how         *=compactStringStrippingMacrosAndComments,  including <!--cseignore--><!--/cseignore-->
     *                    +=compactStringKeepingMacrosAndComments
     *                    -=does nothing
     *                    Q=Quick If first 400 chars contain a double space, compactStringKeepingMacrosAndComments,
     *                    otherwise do nothing. Lower case q is accepted too.
     *                    Any other value trips an assertion when assertions are enabled, otherwise does nothing.
     *
     * @return compacted String
     */
    public static String compactStringAsNeeded( final String uncompacted, final String where, final char how )
        {
        switch ( how )
            {
            case '*':
                return Compactor.compactStringStrippingMacrosAndComments( uncompacted, where );
            case '+':
                return Compactor.compactStringKeepingMacrosAndComments( uncompacted, where );
            case '-':
                return uncompacted;
            case 'Q':
            case 'q':
                // Cheap heuristic: only sample the head of the string.
                final String test = ( uncompacted.length() < 400 ) ? uncompacted : uncompacted.substring( 0, 400 );
                if ( test.contains( " " + " " ) )
                    {
                    // a double space means the text has not been compacted yet, so do the full job.
                    return Compactor.compactStringKeepingMacrosAndComments( uncompacted, where );
                    }
                else
                    {
                    // looks compact already, skip the work.
                    return uncompacted;
                    }
            default:
                assert false : "invalid Compactor.compactStringAsNeeded.how " + how + " It must be one of * + - Q";
                return uncompacted;
            }
        }

    /**
     * Remove excess whitespace from HTML represented by string, keeping all macros and comments.
     * Delegates to HTMLState.compactString with no pattern (null).
     *
     * @param big   the String to compact.
     * @param where used in error messages to indicate where the error occurred, usually the name of the file being
     *              compacted.
     *
     * @return the compacted String, big itself if nothing changed.
     */
    public static String compactStringKeepingMacrosAndComments( final String big, final String where )
        {
        return HTMLState.compactString( big, where, null );
        }

    /**
     * Remove excess whitespace from HTML represented by string, strip ordinary comments but keep macros.
     * Delegates to HTMLState.compactString with MACRO_PATTERN.
     *
     * @param big   the String to compact.
     * @param where used in error messages to indicate where the error occurred, usually the name of the file being
     *              compacted.
     *
     * @return the compacted String, big itself if nothing changed.
     */
    public static String compactStringKeepingMacrosStrippingComments( final String big, final String where )
        {
        return HTMLState.compactString( big, where, MACRO_PATTERN );
        }

    /**
     * Remove excess whitespace from HTML represented by string, strip all macros and comments.
     * Delegates to HTMLState.compactString with JUST_SSI_PATTERN.
     *
     * @param big   the String to compact.
     * @param where used in error messages to indicate where the error occurred, usually the name of the file being
     *              compacted.
     *
     * @return the compacted String, big itself if nothing changed.
     */
    public static String compactStringStrippingMacrosAndComments( final String big, final String where )
        {
        return HTMLState.compactString( big, where, JUST_SSI_PATTERN );
        }

    /**
     * compacts HTML files.
     * <p/>
     * A failure on one file is reported on System.err and processing continues with the next file.
     *
     * @param args names of files to process, dirs, files, -s, *.*, no wildcards.
     *
     * @throws IllegalArgumentException if the command line yields no files to process.
     */
    public static void main( String[] args )
        {
        // gather all the files mentioned on the command line.
        // either directories, files, *.*, with -s and subdirs option.
        out.println( "Gathering html files to compact..." );
        CommandLine commandLine = new CommandLine( args,
                new AllButSVNDirectoriesFilter(),
                new ExtensionListFilter( ExtensionListFilter.COMMON_HTML_EXTENSIONS ) );
        // There is a double check on ExtensionListFilter in compactFile
        final boolean quiet = commandLine.isQuiet();
        if ( commandLine.size() == 0 )
            {
            throw new IllegalArgumentException( "No files found to process\n" + USAGE );
            }
        for ( File file : commandLine )
            {
            try
                {
                compactFile( quiet, file );
                }
            catch ( FileNotFoundException e )
                {
                err.println( "Error: "
                             + EIO.getCanOrAbsPath( file )
                             + " not found." );
                }
            catch ( Exception e )
                {
                // report and carry on, so one bad file does not abort the whole batch.
                err.println();
                e.printStackTrace( err );
                err.println( " in file "
                             + EIO.getCanOrAbsPath( file ) );
                err.println();
                }
            } // end for
        } // end main
    } // end Compactor