Environment:
Oracle Java 1.7.0_60
Example:
- The HtmlStringInfo class holds information about the html string that needs to be processed for any bad html tags
- The HTML being processed must be well-formed. Void (self-closing) elements are accepted as input.
- The example treats <h1> through <h6> and <p> as the valid tags. Of the HTML inside <body>, only the top-level <h1> through <h6> and <p> elements are kept; every other top-level element is deleted together with everything nested inside it.
- This does not currently remove any possible invalid elements nested inside valid elements. i.e. <h1><bad></bad></h1>
- The following is case-sensitive, so <h2></h2> is valid but <H2></H2> is not valid.
- Run using @Test method removeInvalidTagLoop
Sample HTML:
<div class=\"embedded\" id=\"thumbnail_0.jpeg\"></div><div class=\"package-entry\"><h1>HEADER</h1></div><h2>HEADER2</h2><p>PARAGRAPH</p>
After the call to removeInvalidHtmlTagElement finishes, the valid HTML that remains is:
<h2>HEADER2</h2><p>PARAGRAPH</p>
/**
 * Strips every top-level element whose tag name is not in a whitelist from a
 * fragment of well-formed HTML/XML, together with everything nested inside it.
 *
 * <p>Known limitations (unchanged from the original design):
 * <ul>
 * <li>Invalid elements nested inside a valid element are NOT removed
 * (e.g. {@code <h1><bad></bad></h1>} is kept as is).</li>
 * <li>Tag names are compared case-sensitively.</li>
 * <li>A literal {@code >} inside an attribute value is not supported; the
 * first {@code >} after a {@code <} is taken as the end of the tag.</li>
 * </ul>
 */
public class ValidHtmlTest {

    private static final String OPEN_BRACKET = "<";
    private static final String CLOSE_BRACKET = ">";
    private static final String SLASH = "/";

    /**
     * Result of processing a single tag element: the (possibly shortened)
     * string, where to resume scanning, and whether anything is left to scan.
     */
    protected class HtmlStringInfo {
        /** The string after the current element has been processed. */
        private final String newString;
        /** Index in {@link #newString} at which to look for the next tag. */
        private final int nextIdxToProcess;
        /** {@code true} while there may be unprocessed tags left in the string. */
        private final boolean keepLooking;

        public HtmlStringInfo(String newString, int nextIdxToProcess,
                boolean keepLooking) {
            this.newString = newString;
            this.nextIdxToProcess = nextIdxToProcess;
            this.keepLooking = keepLooking;
        }

        public String getNewString() {
            return newString;
        }

        public int getNextIdxToProcess() {
            return nextIdxToProcess;
        }

        public boolean isKeepLooking() {
            return keepLooking;
        }

        @Override
        public String toString() {
            return "HtmlStringInfo [newString=" + newString
                    + ", nextIdxToProcess=" + nextIdxToProcess
                    + ", keepLooking=" + keepLooking + "]";
        }
    }

    /**
     * Runs the sample from the description and checks that only the valid
     * top-level elements survive.
     */
    @Test
    public void removeInvalidTagLoop() throws Exception {
        // PREREQUISITE: Grab all XML content inside <body></body> tags, and start processing
        String[] validTag = new String[] { "<h1", "<h2", "<h3", "<h4", "<h5",
                "<h6", "<p" };
        List<String> validTagList = Arrays.asList(validTag);
        // Start processing XML inside <body></body> tags
        String xmlString = "<div class=\"embedded\" id=\"thumbnail_0.jpeg\"></div><div class=\"package-entry\"><h1>HEADER</h1></div><h2>HEADER2</h2><p>PARAGRAPH</p>";
        xmlString = removeInvalidHtmlTagElement(xmlString, validTagList);
        System.out.println("Final[" + xmlString + "]");
        // Plain check so the test actually fails on a regression (no extra
        // test-framework imports needed).
        String expected = "<h2>HEADER2</h2><p>PARAGRAPH</p>";
        if (!expected.equals(xmlString)) {
            throw new AssertionError("expected [" + expected + "] but was ["
                    + xmlString + "]");
        }
    }

    /**
     * Removes every top-level element that is not listed in
     * {@code validTagList}, including everything between its start and end
     * tag. Text outside of any tag is left untouched. Invalid elements nested
     * inside a valid element are not removed.
     *
     * @param xmlString
     *            the markup to clean; {@code null}, empty, or tag-free input
     *            is returned unchanged
     * @param validTagList
     *            allowed tags in the form {@code "<name"} (e.g. {@code "<h1"});
     *            must not be {@code null}
     * @return the markup with all invalid top-level elements removed
     * @throws NullPointerException
     *             if {@code validTagList} is {@code null}
     */
    protected String removeInvalidHtmlTagElement(String xmlString,
            List<String> validTagList) {
        if (validTagList == null) {
            throw new NullPointerException("validTagList");
        }
        if (xmlString == null || xmlString.isEmpty()) {
            return xmlString;
        }
        // Start at the first real tag instead of assuming the string begins
        // with '<'; a negative index means there is nothing (left) to process.
        int nextIdxToProcess = xmlString.indexOf(OPEN_BRACKET);
        while (nextIdxToProcess >= 0) {
            System.out.println("Start[" + xmlString + "]");
            System.out.println("xmlString.length[" + xmlString.length() + "]");
            // figure out entire contents inside the tag element to process
            int closeBracketIdx = xmlString.indexOf(CLOSE_BRACKET,
                    nextIdxToProcess);
            if (closeBracketIdx < 0) {
                // '<' without a closing '>' is not a tag; stop scanning
                break;
            }
            String insideBracket = xmlString.substring(nextIdxToProcess + 1,
                    closeBracketIdx);
            System.out.println(" [" + insideBracket + "]");
            HtmlStringInfo tagElementInfo = getNextTagElement(xmlString,
                    nextIdxToProcess, insideBracket, validTagList);
            xmlString = tagElementInfo.getNewString();
            if (!tagElementInfo.isKeepLooking()) {
                break;
            }
            nextIdxToProcess = xmlString.indexOf(OPEN_BRACKET,
                    tagElementInfo.getNextIdxToProcess());
        }
        return xmlString;
    }

    /**
     * Processes the single element whose start tag is
     * {@code "<" + insideBracket + ">"} at or after {@code openBracketIdx}.
     * A valid element is skipped over; an invalid one is cut out of the
     * string (only this one occurrence, by position).
     *
     * <p>Validity is decided by the element name alone, so attributes do not
     * matter ({@code <p class="x">} is valid when {@code "<p"} is listed).
     * A tag is treated as standalone (only the tag itself is skipped or
     * removed) when it is self-closing ({@code <br/>}), is a closing tag
     * ({@code </x>}), has no name, or has no matching end tag.
     *
     * @param xmlString
     *            the markup being processed
     * @param openBracketIdx
     *            index from which the tag is searched (normally the index of
     *            its {@code <})
     * @param insideBracket
     *            the text between {@code <} and {@code >} of the tag
     * @param validTagList
     *            allowed tags in the form {@code "<name"}
     * @return the updated string, the index to resume scanning from, and
     *         whether that index is still inside the string
     * @throws IllegalArgumentException
     *             if the tag cannot be found in {@code xmlString} at or after
     *             {@code openBracketIdx}
     */
    protected HtmlStringInfo getNextTagElement(String xmlString,
            int openBracketIdx, String insideBracket, List<String> validTagList) {
        String tag = OPEN_BRACKET + insideBracket + CLOSE_BRACKET;
        int idxStripStart = xmlString.indexOf(tag, openBracketIdx);
        if (idxStripStart < 0) {
            throw new IllegalArgumentException("tag " + tag
                    + " not found at or after index " + openBracketIdx);
        }
        int tagEnd = idxStripStart + tag.length(); // exclusive

        String element = extractElementName(insideBracket);
        // if tag is valid then do not need to strip it out
        boolean isValidTag = validTagList.contains(OPEN_BRACKET + element);

        // Only a leading/trailing slash marks a closing/self-closing tag; a
        // slash inside an attribute value (href="http://...") must not.
        boolean isStandalone = element.isEmpty()
                || insideBracket.startsWith(SLASH)
                || insideBracket.endsWith(SLASH);

        int stripEnd = tagEnd; // exclusive end of the span to skip or remove
        if (!isStandalone) {
            System.out.println(" [" + element + "]");
            int matchingEnd = findMatchingEndTag(xmlString, element, tagEnd);
            if (matchingEnd >= 0) {
                stripEnd = matchingEnd;
            }
            // else: no end tag (e.g. <br>, <img ...>); handle the tag alone
        }
        String strip = xmlString.substring(idxStripStart, stripEnd);
        System.out.println(" strip[" + strip + "]");
        int idxStripEnd = stripEnd - 1;
        System.out.println("idxStripStart[" + idxStripStart + "],idxStripEnd["
                + idxStripEnd + "]");
        System.out.println("xmlString.length[" + xmlString.length() + "]");

        int nextIdxToProcess;
        if (!isValidTag) {
            // The element is stripped, so scanning resumes where it used to
            // begin. Cut by position: String.replace would also delete every
            // identical copy elsewhere (e.g. inside a valid element).
            nextIdxToProcess = idxStripStart;
            xmlString = xmlString.substring(0, idxStripStart)
                    + xmlString.substring(stripEnd);
        } else {
            // The element is kept, so scanning resumes right after it.
            nextIdxToProcess = stripEnd;
        }
        // Once the resume index is past the end there is nothing left to do.
        boolean keepLooking = nextIdxToProcess < xmlString.length();
        return new HtmlStringInfo(xmlString, nextIdxToProcess, keepLooking);
    }

    /**
     * Returns the element name of a tag body: everything up to the first
     * whitespace character, without a trailing self-closing slash.
     */
    private static String extractElementName(String insideBracket) {
        int end = 0;
        while (end < insideBracket.length()
                && !Character.isWhitespace(insideBracket.charAt(end))) {
            end++;
        }
        String name = insideBracket.substring(0, end);
        if (name.length() > 1 && name.endsWith(SLASH)) {
            name = name.substring(0, name.length() - 1);
        }
        return name;
    }

    /**
     * Finds the end tag that closes an already-opened {@code element},
     * skipping over nested elements of the same name.
     *
     * @return the index just past the matching {@code </element>}, or -1 if
     *         the element is never closed
     */
    private static int findMatchingEndTag(String xml, String element,
            int fromIdx) {
        String openPrefix = OPEN_BRACKET + element;
        String endTag = OPEN_BRACKET + SLASH + element + CLOSE_BRACKET;
        int depth = 1;
        int cursor = fromIdx;
        while (true) {
            int nextEnd = xml.indexOf(endTag, cursor);
            if (nextEnd < 0) {
                return -1;
            }
            int nestedOpen = indexOfNestedOpen(xml, openPrefix, cursor, nextEnd);
            if (nestedOpen >= 0) {
                // a same-name element opens before the next end tag
                depth++;
                cursor = nestedOpen + openPrefix.length();
            } else {
                depth--;
                cursor = nextEnd + endTag.length();
                if (depth == 0) {
                    return cursor;
                }
            }
        }
    }

    /**
     * Returns the index of the first non-self-closing start tag beginning
     * with {@code openPrefix} in {@code [from, limit)}, or -1 if there is
     * none. {@code limit} must be the index of an end tag for the same
     * element, which guarantees the character look-ahead stays in bounds.
     */
    private static int indexOfNestedOpen(String xml, String openPrefix,
            int from, int limit) {
        int idx = xml.indexOf(openPrefix, from);
        while (idx >= 0 && idx < limit) {
            int after = idx + openPrefix.length();
            char next = xml.charAt(after);
            // '>' or whitespace ends the name; anything else is a longer
            // name (<divider>) or a self-closing tag (<div/>).
            if (next == '>' || Character.isWhitespace(next)) {
                int close = xml.indexOf(CLOSE_BRACKET, after);
                boolean selfClosing = close >= 0
                        && xml.charAt(close - 1) == '/';
                if (!selfClosing) {
                    return idx;
                }
            }
            idx = xml.indexOf(openPrefix, after);
        }
        return -1;
    }
}
thx!
ReplyDelete