Cherry Shoe Technologies: Java example on how to remove application invalid HTML tags

I used code similar to the example in this blog a while back, when I needed a way to strip out HTML tags that were invalid for my application. The HTML was syntactically valid HTML5, but my application could only use specific HTML tags. This came up during a data migration from an older version of Jive Software, where HTML data was being migrated over and displayed in the new application. I rolled my own Java solution here because I didn’t find a library that could do this easily at the time.

Environment:
Oracle Java 1.7.0_60

Example:

The HtmlStringInfo class holds information about the html string that needs to be processed for any bad html tags
The HTML being processed must be well-formed. Void (self-closing) elements are accepted as input.
The example states that <h1> through <h6> and <p> tags are valid tags. The following will keep all HTML that is in the <body> that is <h1> through <h6> or <p> only. It will delete any tag that is nested inside an invalid tag
This does not currently remove any possible invalid elements nested inside valid elements. i.e. <h1><bad></bad></h1>
The following is case-sensitive, so <h2></h2> is valid but <H2></H2> is not valid.
Run using @Test method removeInvalidTagLoop

Sample HTML:
<div class="embedded" id="thumbnail_0.jpeg"></div><div class="package-entry"><h1>HEADER</h1></div><h2>HEADER2</h2><p>PARAGRAPH</p>

After the call to removeInvalidHtmlTagElement finishes, the valid HTML that remains is:
<h2>HEADER2</h2><p>PARAGRAPH</p>

/**
 * Example that strips every top-level HTML element an application does not
 * support, together with everything nested inside it.
 *
 * <p>Known limitations: matching is case-sensitive ({@code <h2>} and
 * {@code <H2>} are different tags), invalid elements nested inside a valid
 * element are left untouched, and a {@code >} inside an attribute value is not
 * understood.
 */
public class ValidHtmlTest {
    private static final String OPEN_BRACKET = "<";
    private static final String CLOSE_BRACKET = ">";
    private static final String SLASH = "/";

    /**
     * Immutable result of processing a single tag element: the (possibly
     * shortened) markup, where to resume scanning, and whether anything is
     * left to scan.
     */
    protected class HtmlStringInfo {
        private final String newString; // markup after this step
        private final int nextIdxToProcess; // index to resume scanning from
        private final boolean keepLooking; // false once the resume index is
                                           // at or past the end of newString

        public HtmlStringInfo(String newString, int nextIdxToProcess,
                boolean keepLooking) {
            this.newString = newString;
            this.nextIdxToProcess = nextIdxToProcess;
            this.keepLooking = keepLooking;
        }

        public String getNewString() {
            return newString;
        }

        public int getNextIdxToProcess() {
            return nextIdxToProcess;
        }

        public boolean isKeepLooking() {
            return keepLooking;
        }

        @Override
        public String toString() {
            return "HtmlStringInfo [newString=" + newString
                    + ", nextIdxToProcess=" + nextIdxToProcess
                    + ", keepLooking=" + keepLooking + "]";
        }
    }

    /**
     * Runs the sample markup through {@link #removeInvalidHtmlTagElement} and
     * fails if anything other than the {@code <h2>} and {@code <p>} elements
     * survives.
     */
    @Test
    public void removeInvalidTagLoop() throws Exception {
        // PREREQUISITE: grab all content inside <body></body> before calling.
        List<String> validTagList = Arrays.asList("<h1", "<h2", "<h3", "<h4",
                "<h5", "<h6", "<p");

        String xmlString = "<div class=\"embedded\" id=\"thumbnail_0.jpeg\"></div><div class=\"package-entry\"><h1>HEADER</h1></div><h2>HEADER2</h2><p>PARAGRAPH</p>";

        xmlString = removeInvalidHtmlTagElement(xmlString, validTagList);

        System.out.println("Final[" + xmlString + "]");

        // A test that only prints can never fail, so check the result too.
        String expected = "<h2>HEADER2</h2><p>PARAGRAPH</p>";
        if (!expected.equals(xmlString)) {
            throw new AssertionError("expected [" + expected + "] but was ["
                    + xmlString + "]");
        }
    }

    /**
     * Removes every top-level element whose tag is not listed in
     * {@code validTagList}, including everything between its start and end
     * tag. Elements nested inside a valid element are not inspected
     * (i.e. {@code <h1><bad></bad></h1>} is returned unchanged).
     *
     * @param xmlString    markup to clean; {@code null} or empty is returned
     *                     as is
     * @param validTagList allowed tags written as {@code "<name"} (for example
     *                     {@code "<h1"}); must not be {@code null}
     * @return the markup with all invalid top-level elements removed
     */
    protected String removeInvalidHtmlTagElement(String xmlString,
            List<String> validTagList) {
        if (xmlString == null || xmlString.isEmpty()) {
            return xmlString;
        }

        // Start at the first tag rather than index 0 so leading text is safe.
        int nextIdxToProcess = xmlString.indexOf(OPEN_BRACKET);
        while (nextIdxToProcess >= 0) {
            int closeBracketIdx = xmlString.indexOf(CLOSE_BRACKET,
                    nextIdxToProcess);
            if (closeBracketIdx < 0) {
                // Unterminated tag: there is nothing well-formed left.
                break;
            }
            String insideBracket = xmlString.substring(nextIdxToProcess + 1,
                    closeBracketIdx);

            HtmlStringInfo tagElementInfo = getNextTagElement(xmlString,
                    nextIdxToProcess, insideBracket, validTagList);
            xmlString = tagElementInfo.getNewString();
            if (!tagElementInfo.isKeepLooking()) {
                break;
            }
            nextIdxToProcess = xmlString.indexOf(OPEN_BRACKET,
                    tagElementInfo.getNextIdxToProcess());
        }

        return xmlString;
    }

    /**
     * Processes the element whose start tag is
     * {@code "<" + insideBracket + ">"}, searched for from
     * {@code openBracketIdx}. A valid element is skipped; an invalid one is
     * cut out of the markup (only that single occurrence).
     *
     * <p>A tag is valid when {@code "<" + insideBracket} is listed, or when it
     * is not a closing tag and {@code "<" + elementName} is listed, so
     * attributes do not affect validity. Closing tags, self-closing tags and
     * tags without a matching end tag are treated as a single tag.
     *
     * @param xmlString      markup being processed
     * @param openBracketIdx index to start looking for the tag from
     * @param insideBracket  text between the tag's {@code <} and {@code >}
     * @param validTagList   allowed tags written as {@code "<name"}
     * @return the resulting markup and the position to continue from
     */
    protected HtmlStringInfo getNextTagElement(String xmlString,
            int openBracketIdx, String insideBracket, List<String> validTagList) {

        String tag = OPEN_BRACKET + insideBracket + CLOSE_BRACKET;
        int tagStart = xmlString.indexOf(tag, openBracketIdx);
        if (tagStart < 0) {
            // The described tag is not present; leave the markup alone.
            return new HtmlStringInfo(xmlString, xmlString.length(), false);
        }
        int tagEnd = tagStart + tag.length();

        // Only a leading/trailing slash marks a closing/self-closing tag; a
        // slash inside an attribute value (href="http://...") must not count.
        boolean closingTag = insideBracket.startsWith(SLASH);
        boolean selfClosing = insideBracket.endsWith(SLASH);
        String elementName = elementNameOf(insideBracket);

        // Exclusive end of the whole element; defaults to the tag itself.
        int elementEnd = tagEnd;
        if (!closingTag && !selfClosing && !elementName.isEmpty()) {
            int matchedEnd = findElementEnd(xmlString, tagEnd, elementName);
            if (matchedEnd >= 0) {
                elementEnd = matchedEnd;
            }
        }

        boolean isValidTag = validTagList.contains(OPEN_BRACKET + insideBracket)
                || (!closingTag && !elementName.isEmpty()
                        && validTagList.contains(OPEN_BRACKET + elementName));

        String newXml;
        int nextIdxToProcess;
        if (isValidTag) {
            // Kept: continue right after the element.
            newXml = xmlString;
            nextIdxToProcess = elementEnd;
        } else {
            // Stripped: cut exactly this occurrence and continue where it was.
            newXml = xmlString.substring(0, tagStart)
                    + xmlString.substring(elementEnd);
            nextIdxToProcess = tagStart;
        }

        boolean keepLooking = nextIdxToProcess < newXml.length();
        return new HtmlStringInfo(newXml, nextIdxToProcess, keepLooking);
    }

    /**
     * Extracts the element name from the text between {@code <} and
     * {@code >}: an optional leading slash is skipped and the name ends at the
     * first whitespace character or slash.
     */
    private static String elementNameOf(String insideBracket) {
        int start = insideBracket.startsWith(SLASH) ? SLASH.length() : 0;
        int end = start;
        while (end < insideBracket.length()) {
            char c = insideBracket.charAt(end);
            if (Character.isWhitespace(c) || c == '/') {
                break;
            }
            end++;
        }
        return insideBracket.substring(start, end);
    }

    /**
     * Finds the exclusive end index of the end tag that closes an element
     * whose start tag ends at {@code searchFrom}, counting nested elements of
     * the same name.
     *
     * @return the index just past the matching end tag, or -1 if there is none
     */
    private static int findElementEnd(String xml, int searchFrom,
            String elementName) {
        String openPrefix = OPEN_BRACKET + elementName;
        String endTag = OPEN_BRACKET + SLASH + elementName + CLOSE_BRACKET;

        int depth = 1;
        int cursor = searchFrom;
        while (depth > 0) {
            int endTagIdx = xml.indexOf(endTag, cursor);
            if (endTagIdx < 0) {
                return -1;
            }
            int nestedIdx = indexOfNestedOpener(xml, openPrefix, cursor,
                    endTagIdx);
            if (nestedIdx >= 0) {
                // A same-name element opens first; it needs its own end tag.
                depth++;
                cursor = nestedIdx + openPrefix.length();
            } else {
                depth--;
                cursor = endTagIdx + endTag.length();
            }
        }
        return cursor;
    }

    /**
     * Returns the index of the first start tag beginning with
     * {@code openPrefix} in {@code [from, limit)} that is a real opener: the
     * name must end there (so {@code <p} does not match {@code <pre}) and the
     * tag must not be self-closing. Returns -1 if there is none.
     *
     * <p>{@code limit} must be the index of an end tag for the same element,
     * which guarantees the character after the prefix exists.
     */
    private static int indexOfNestedOpener(String xml, String openPrefix,
            int from, int limit) {
        int idx = xml.indexOf(openPrefix, from);
        while (idx >= 0 && idx < limit) {
            int afterName = idx + openPrefix.length();
            char c = xml.charAt(afterName);
            if (c == '>' || Character.isWhitespace(c)) {
                int gt = xml.indexOf(CLOSE_BRACKET, afterName);
                if (gt >= 0 && gt < limit && xml.charAt(gt - 1) != '/') {
                    return idx;
                }
            }
            idx = xml.indexOf(openPrefix, idx + 1);
        }
        return -1;
    }
}

Wednesday, January 31, 2018

Java example on how to remove application invalid HTML tags

1 comment: