Use jsoup in HtmlSignatureRemover

This commit is contained in:
cketti 2017-04-30 01:13:19 +02:00
parent 268189c1b0
commit 6d06b332a7
5 changed files with 332 additions and 74 deletions

View file

@ -0,0 +1,139 @@
/*
* The MIT License
*
* © 2009-2017, Jonathan Hedley <jonathan@hedley.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package com.fsck.k9.helper.jsoup;
import com.fsck.k9.helper.jsoup.NodeFilter.HeadFilterDecision;
import com.fsck.k9.helper.jsoup.NodeFilter.TailFilterDecision;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeTraversor;
/**
* Depth-first node traversor.
* <p>
* Based on {@link NodeTraversor}, but supports skipping sub trees, removing nodes, and stopping the traversal at any
* point.
* </p><p>
* This is an enhancement of the <a href="https://github.com/jhy/jsoup/pull/849">jsoup pull request 'Improved node
* traversal'</a> by <a href="https://github.com/kno10">Erich Schubert</a>.
* </p>
*/
public class AdvancedNodeTraversor {
    /**
     * Filter result.
     */
    public enum FilterResult {
        /**
         * Processing the tree was completed.
         */
        ENDED,
        /**
         * Processing was stopped.
         */
        STOPPED,
        /**
         * Processing the tree was completed and the root node was removed.
         */
        ROOT_REMOVED
    }

    private final NodeFilter filter;

    /**
     * Create a new traversor.
     *
     * @param filter
     *         a class implementing the {@link NodeFilter} interface, to be called when visiting each node.
     */
    public AdvancedNodeTraversor(NodeFilter filter) {
        this.filter = filter;
    }

    /**
     * Start a depth-first filtering of the root and all of its descendants.
     * <p>
     * For every node {@link NodeFilter#head(Node, int)} is called first. Children are only visited when it returns
     * {@link HeadFilterDecision#CONTINUE}. {@link NodeFilter#tail(Node, int)} is called once the node's subtree is
     * done, unless {@code head} returned {@link HeadFilterDecision#SKIP_ENTIRELY} or
     * {@link HeadFilterDecision#REMOVE}. A {@code REMOVE} decision from either callback removes the node via
     * {@link Node#remove()}, regardless of whether the node has a following sibling.
     * </p>
     *
     * @param root
     *         the node at which the traversal starts. Siblings of this node are never visited.
     *         NOTE(review): if a filter requests removal of a root that has no parent, the outcome is whatever
     *         {@link Node#remove()} does in that situation - confirm against the jsoup version in use.
     *
     * @return {@link FilterResult#STOPPED} if a callback returned {@code STOP}, {@link FilterResult#ROOT_REMOVED} if
     *         the root itself was removed, {@link FilterResult#ENDED} otherwise.
     */
    public FilterResult filter(Node root) {
        Node node = root;
        int depth = 0;

        while (node != null) {
            HeadFilterDecision headResult = filter.head(node, depth);
            if (headResult == HeadFilterDecision.STOP) {
                return FilterResult.STOPPED;
            }

            // Descend into the first child; its tail() is handled once we come back up.
            if (headResult == HeadFilterDecision.CONTINUE && node.childNodeSize() > 0) {
                node = node.childNode(0);
                ++depth;
                continue;
            }

            TailFilterDecision tailResult = TailFilterDecision.CONTINUE;

            // No more siblings on this level: finish the current node and climb towards the root.
            while (node.nextSibling() == null && depth > 0) {
                if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) {
                    tailResult = filter.tail(node, depth);
                    if (tailResult == TailFilterDecision.STOP) {
                        return FilterResult.STOPPED;
                    }
                }

                // Look up the parent BEFORE removing the node, removal detaches it from the tree.
                Node prev = node;
                node = node.parentNode();
                depth--;

                if (headResult == HeadFilterDecision.REMOVE || tailResult == TailFilterDecision.REMOVE) {
                    prev.remove();
                }

                // The parent's head() returned CONTINUE, otherwise we would never have visited its children.
                headResult = HeadFilterDecision.CONTINUE;
            }

            if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) {
                tailResult = filter.tail(node, depth);
                if (tailResult == TailFilterDecision.STOP) {
                    return FilterResult.STOPPED;
                }
            }

            // tailResult is either the initial CONTINUE or the value returned by the tail() call right above, so it
            // always belongs to 'node'. Honor a removal request from head() as well as from tail().
            boolean removeNode =
                    headResult == HeadFilterDecision.REMOVE || tailResult == TailFilterDecision.REMOVE;

            // Look up the next sibling BEFORE removing the node, removal detaches it from the tree.
            Node prev = node;
            node = node.nextSibling();

            if (removeNode) {
                prev.remove();
            }

            if (prev == root) {
                return removeNode ? FilterResult.ROOT_REMOVED : FilterResult.ENDED;
            }
        }

        return FilterResult.ENDED;
    }
}

View file

@ -0,0 +1,111 @@
package com.fsck.k9.helper.jsoup;
import android.support.annotation.NonNull;
import org.jsoup.nodes.Node;
/**
* Node filter interface. Provide an implementing class to {@link AdvancedNodeTraversor} to iterate through
* nodes.
* <p>
* This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first
* seen, and the tail method when all of the node's children have been visited. As an example, head can be used to
* create a start tag for a node, and tail to create the end tag.
* </p>
* <p>
* For every node, the filter has to decide in {@link NodeFilter#head(Node, int)} whether to
* <ul>
* <li>continue ({@link HeadFilterDecision#CONTINUE}),</li>
* <li>skip all children ({@link HeadFilterDecision#SKIP_CHILDREN}),</li>
* <li>skip node entirely ({@link HeadFilterDecision#SKIP_ENTIRELY}),</li>
* <li>remove the subtree ({@link HeadFilterDecision#REMOVE}),</li>
* <li>interrupt the iteration and return ({@link HeadFilterDecision#STOP}).</li>
* </ul>
* <p>
* The difference between {@link HeadFilterDecision#SKIP_CHILDREN} and {@link HeadFilterDecision#SKIP_ENTIRELY} is that
* the first will invoke {@link NodeFilter#tail(Node, int)} on the node, while the latter will not.
* </p>
* <p>
* When {@link NodeFilter#tail(Node, int)} is called the filter has to decide whether to
* <ul>
* <li>continue ({@link TailFilterDecision#CONTINUE}),</li>
* <li>remove the subtree ({@link TailFilterDecision#REMOVE}),</li>
* <li>interrupt the iteration and return ({@link TailFilterDecision#STOP}).</li>
* </ul>
* </p>
*/
public interface NodeFilter {

    /**
     * Invoked when the traversal reaches a node for the first time, before any of its children.
     *
     * @param node
     *         the node that is being entered.
     * @param depth
     *         distance of {@code node} from the traversal root; the root itself is at depth 0, its direct
     *         children at depth 1, and so on.
     *
     * @return the decision on how the traversal should proceed from this node.
     */
    @NonNull
    HeadFilterDecision head(Node node, int depth);

    /**
     * Invoked when the traversal leaves a node, i.e. once all of its descendants have been visited.
     *
     * @param node
     *         the node that is being left.
     * @param depth
     *         distance of {@code node} from the traversal root; the root itself is at depth 0, its direct
     *         children at depth 1, and so on.
     *
     * @return the decision on how the traversal should proceed after this node.
     */
    @NonNull
    TailFilterDecision tail(Node node, int depth);

    /**
     * The possible answers of {@link NodeFilter#head(Node, int)}.
     */
    enum HeadFilterDecision {
        /** Keep going: visit the children (if any) next. */
        CONTINUE,

        /** Do not visit the children, but still call {@link NodeFilter#tail(Node, int)} for this node. */
        SKIP_CHILDREN,

        /** Do not visit the children and do not call {@link NodeFilter#tail(Node, int)} for this node. */
        SKIP_ENTIRELY,

        /** Remove this node together with its children; {@link NodeFilter#tail(Node, int)} is not called. */
        REMOVE,

        /** Abort the traversal. */
        STOP
    }

    /**
     * The possible answers of {@link NodeFilter#tail(Node, int)}.
     */
    enum TailFilterDecision {
        /** Keep going with the rest of the tree. */
        CONTINUE,

        /** Remove this node together with its children. */
        REMOVE,

        /** Abort the traversal. */
        STOP
    }
}

View file

@ -30,7 +30,7 @@ public class HtmlProcessor {
HtmlConverter.cssStylePre());
}
static String toCompactString(Document document) {
public static String toCompactString(Document document) {
document.outputSettings()
.prettyPrint(false)
.indentAmount(0);

View file

@ -1,90 +1,100 @@
package com.fsck.k9.message.signature;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import timber.log.Timber;
import android.support.annotation.NonNull;
import com.fsck.k9.K9;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;
import com.fsck.k9.helper.jsoup.AdvancedNodeTraversor;
import com.fsck.k9.helper.jsoup.NodeFilter;
import com.fsck.k9.message.html.HtmlProcessor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;
public class HtmlSignatureRemover {
private static final Pattern DASH_SIGNATURE_HTML = Pattern.compile("(<br( /)?>|\r?\n)-- <br( /)?>", Pattern.CASE_INSENSITIVE);
private static final Pattern BLOCKQUOTE_START = Pattern.compile("<blockquote", Pattern.CASE_INSENSITIVE);
private static final Pattern BLOCKQUOTE_END = Pattern.compile("</blockquote>", Pattern.CASE_INSENSITIVE);
public static String stripSignature(String content) {
Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
if (dashSignatureHtml.find()) {
Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
List<Integer> start = new ArrayList<>();
List<Integer> end = new ArrayList<>();
return new HtmlSignatureRemover().stripSignatureInternal(content);
}
while (blockquoteStart.find()) {
start.add(blockquoteStart.start());
private String stripSignatureInternal(String content) {
Document document = Jsoup.parse(content);
AdvancedNodeTraversor nodeTraversor = new AdvancedNodeTraversor(new StripSignatureFilter());
nodeTraversor.filter(document.body());
return HtmlProcessor.toCompactString(document);
}
while (blockquoteEnd.find()) {
end.add(blockquoteEnd.start());
static class StripSignatureFilter implements NodeFilter {
private static final Pattern DASH_SIGNATURE_HTML = Pattern.compile("\\s*-- \\s*", Pattern.CASE_INSENSITIVE);
private static final Tag BLOCKQUOTE = Tag.valueOf("blockquote");
private static final Tag BR = Tag.valueOf("br");
private static final Tag P = Tag.valueOf("p");
private boolean signatureFound = false;
private boolean lastElementCausedLineBreak = false;
private Element brElementPrecedingDashes;
@NonNull
@Override
public HeadFilterDecision head(Node node, int depth) {
if (signatureFound) {
return HeadFilterDecision.REMOVE;
}
if (start.size() != end.size()) {
Timber.d("There are %d <blockquote> tags, but %d </blockquote> tags. Refusing to strip.",
start.size(), end.size());
} else if (start.size() > 0) {
// Ignore quoted signatures in blockquotes.
dashSignatureHtml.region(0, start.get(0));
if (dashSignatureHtml.find()) {
// before first <blockquote>.
content = content.substring(0, dashSignatureHtml.start());
} else {
for (int i = 0; i < start.size() - 1; i++) {
// within blockquotes.
if (end.get(i) < start.get(i + 1)) {
dashSignatureHtml.region(end.get(i), start.get(i + 1));
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
break;
if (node instanceof Element) {
lastElementCausedLineBreak = false;
Element element = (Element) node;
if (element.tag().equals(BLOCKQUOTE)) {
return HeadFilterDecision.SKIP_ENTIRELY;
}
} else if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
if (lastElementCausedLineBreak && DASH_SIGNATURE_HTML.matcher(textNode.getWholeText()).matches()) {
Node nextNode = node.nextSibling();
if (nextNode instanceof Element && ((Element) nextNode).tag().equals(BR)) {
signatureFound = true;
if (brElementPrecedingDashes != null) {
brElementPrecedingDashes.remove();
brElementPrecedingDashes = null;
}
return HeadFilterDecision.REMOVE;
}
if (end.get(end.size() - 1) < content.length()) {
// after last </blockquote>.
dashSignatureHtml.region(end.get(end.size() - 1), content.length());
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
}
}
}
} else {
// No blockquotes found.
content = content.substring(0, dashSignatureHtml.start());
}
}
// Fix the stripping off of closing tags if a signature was stripped,
// as well as clean up the HTML of the quoted message.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties properties = cleaner.getProperties();
return HeadFilterDecision.CONTINUE;
}
// see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
properties.setNamespacesAware(false);
properties.setAdvancedXmlEscape(false);
properties.setOmitXmlDeclaration(true);
properties.setOmitDoctypeDeclaration(false);
properties.setTranslateSpecialEntities(false);
properties.setRecognizeUnicodeChars(false);
@NonNull
@Override
public TailFilterDecision tail(Node node, int depth) {
if (signatureFound) {
return TailFilterDecision.CONTINUE;
}
TagNode node = cleaner.clean(content);
SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
content = htmlSerialized.getAsString(node, "UTF8");
return content;
if (node instanceof Element) {
Element element = (Element) node;
boolean elementIsBr = element.tag().equals(BR);
if (elementIsBr || element.tag().equals(P)) {
lastElementCausedLineBreak = true;
brElementPrecedingDashes = elementIsBr ? element : null;
return TailFilterDecision.CONTINUE;
}
}
lastElementCausedLineBreak = false;
return TailFilterDecision.CONTINUE;
}
}
}

View file

@ -3,7 +3,6 @@ package com.fsck.k9.message.signature;
import com.fsck.k9.K9RobolectricTestRunner;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.robolectric.annotation.Config;
@ -27,7 +26,6 @@ public class HtmlSignatureRemoverTest {
assertEquals("This is the body text", extractText(withoutSignature));
}
@Ignore
@Test
public void shouldStripSignatureFromThunderbirdStyleHtml() throws Exception {
String html = "<html>\r\n" +
@ -88,8 +86,8 @@ public class HtmlSignatureRemoverTest {
assertEquals("<html><head></head><body>" +
"<blockquote>" +
"This is some quoted text" +
"<br />" +
"-- <br />" +
"<br>" +
"-- <br>" +
"Inner signature" +
"</blockquote>" +
"<div>This is the body text</div>" +
@ -141,7 +139,7 @@ public class HtmlSignatureRemoverTest {
String withoutSignature = HtmlSignatureRemover.stripSignature(html);
assertEquals("<html><head></head><body>" +
"This is the body text<br />" +
"This is the body text<br>" +
"<blockquote>Some quote</blockquote>" +
"</body></html>",
withoutSignature);