From 6d06b332a70626d8d56e4316869a14631eeb4c41 Mon Sep 17 00:00:00 2001 From: cketti Date: Sun, 30 Apr 2017 01:13:19 +0200 Subject: [PATCH] Use jsoup in HtmlSignatureRemover --- .../helper/jsoup/AdvancedNodeTraversor.java | 139 +++++++++++++++++ .../com/fsck/k9/helper/jsoup/NodeFilter.java | 111 +++++++++++++ .../fsck/k9/message/html/HtmlProcessor.java | 2 +- .../signature/HtmlSignatureRemover.java | 146 ++++++++++-------- .../signature/HtmlSignatureRemoverTest.java | 8 +- 5 files changed, 332 insertions(+), 74 deletions(-) create mode 100644 k9mail/src/main/java/com/fsck/k9/helper/jsoup/AdvancedNodeTraversor.java create mode 100644 k9mail/src/main/java/com/fsck/k9/helper/jsoup/NodeFilter.java diff --git a/k9mail/src/main/java/com/fsck/k9/helper/jsoup/AdvancedNodeTraversor.java b/k9mail/src/main/java/com/fsck/k9/helper/jsoup/AdvancedNodeTraversor.java new file mode 100644 index 000000000..cfcab37a4 --- /dev/null +++ b/k9mail/src/main/java/com/fsck/k9/helper/jsoup/AdvancedNodeTraversor.java @@ -0,0 +1,139 @@ +/* + * The MIT License + * + * © 2009-2017, Jonathan Hedley + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package com.fsck.k9.helper.jsoup; + + +import com.fsck.k9.helper.jsoup.NodeFilter.HeadFilterDecision; +import com.fsck.k9.helper.jsoup.NodeFilter.TailFilterDecision; +import org.jsoup.nodes.Node; +import org.jsoup.select.NodeTraversor; + + +/** + * Depth-first node traversor. + *

+ * Based on {@link NodeTraversor}, but supports skipping sub trees, removing nodes, and stopping the traversal at any + * point. + *

+ * This is an enhancement of the jsoup pull request 'Improved node + * traversal' by Erich Schubert. + *

+ */ +public class AdvancedNodeTraversor { + /** + * Filter result. + */ + public enum FilterResult { + /** + * Processing the tree was completed. + */ + ENDED, + /** + * Processing was stopped. + */ + STOPPED, + /** + * Processing the tree was completed and the root node was removed. + */ + ROOT_REMOVED + } + + private NodeFilter filter; + + /** + * Create a new traversor. + * + * @param filter + * a class implementing the {@link NodeFilter} interface, to be called when visiting each node. + */ + public AdvancedNodeTraversor(NodeFilter filter) { + this.filter = filter; + } + + /** + * Start a depth-first filtering of the root and all of its descendants. + * + * @param root + * the root node point to traverse. + * + * @return The result of the filter operation. + */ + public FilterResult filter(Node root) { + Node node = root; + int depth = 0; + + while (node != null) { + HeadFilterDecision headResult = filter.head(node, depth); + if (headResult == HeadFilterDecision.STOP) { + return FilterResult.STOPPED; + } + + if (headResult == HeadFilterDecision.CONTINUE && node.childNodeSize() > 0) { + node = node.childNode(0); + ++depth; + continue; + } + + TailFilterDecision tailResult = TailFilterDecision.CONTINUE; + while (node.nextSibling() == null && depth > 0) { + if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) { + tailResult = filter.tail(node, depth); + if (tailResult == TailFilterDecision.STOP) { + return FilterResult.STOPPED; + } + } + + Node prev = node; + node = node.parentNode(); + depth--; + + if (headResult == HeadFilterDecision.REMOVE || tailResult == TailFilterDecision.REMOVE) { + prev.remove(); + } + + headResult = HeadFilterDecision.CONTINUE; + } + + if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) { + tailResult = filter.tail(node, depth); + if (tailResult == TailFilterDecision.STOP) { + return FilterResult.STOPPED; + } + } + + Node prev = node; + node = node.nextSibling(); + + if (headResult == HeadFilterDecision.REMOVE) { + prev.remove(); + } + + if (prev == root) { + return headResult == HeadFilterDecision.REMOVE ? FilterResult.ROOT_REMOVED : FilterResult.ENDED; + } + } + + return FilterResult.ENDED; + } +} diff --git a/k9mail/src/main/java/com/fsck/k9/helper/jsoup/NodeFilter.java b/k9mail/src/main/java/com/fsck/k9/helper/jsoup/NodeFilter.java new file mode 100644 index 000000000..8fb22e7d5 --- /dev/null +++ b/k9mail/src/main/java/com/fsck/k9/helper/jsoup/NodeFilter.java @@ -0,0 +1,111 @@ +package com.fsck.k9.helper.jsoup; + + +import android.support.annotation.NonNull; + +import org.jsoup.nodes.Node; + + +/** + * Node filter interface. Provide an implementing class to {@link AdvancedNodeTraversor} to iterate through + * nodes. + *

+ * This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first + * seen, and the tail method when all of the node's children have been visited. As an example, head can be used to + * create a start tag for a node, and tail to create the end tag. + *

+ *

+ * For every node, the filter has to decide in {@link NodeFilter#head(Node, int)}) whether to + *

+ *

+ * The difference between {@link HeadFilterDecision#SKIP_CHILDREN} and {@link HeadFilterDecision#SKIP_ENTIRELY} is that + * the first will invoke {@link NodeFilter#tail(Node, int)} on the node, while the latter will not. + *

+ *

+ * When {@link NodeFilter#tail(Node, int)} is called the filter has to decide whether to + *

+ *

+ */ +public interface NodeFilter { + /** + * Filter decision for {@link NodeFilter#head(Node, int)}. + */ + enum HeadFilterDecision { + /** + * Continue processing the tree. + */ + CONTINUE, + /** + * Skip the child nodes, but do call {@link NodeFilter#tail(Node, int)} next. + */ + SKIP_CHILDREN, + /** + * Skip the subtree, and do not call {@link NodeFilter#tail(Node, int)}. + */ + SKIP_ENTIRELY, + /** + * Remove the node and its children, and do not call {@link NodeFilter#tail(Node, int)}. + */ + REMOVE, + /** + * Stop processing. + */ + STOP + } + + /** + * Filter decision for {@link NodeFilter#tail(Node, int)}. + */ + enum TailFilterDecision { + /** + * Continue processing the tree. + */ + CONTINUE, + /** + * Remove the node and its children. + */ + REMOVE, + /** + * Stop processing. + */ + STOP + } + + /** + * Callback for when a node is first visited. + * + * @param node + * the node being visited. + * @param depth + * the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node + * of that will have depth 1. + * + * @return Filter decision + */ + @NonNull + HeadFilterDecision head(Node node, int depth); + + /** + * Callback for when a node is last visited, after all of its descendants have been visited. + * + * @param node + * the node being visited. + * @param depth + * the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node + * of that will have depth 1. + * + * @return Filter decision + */ + @NonNull + TailFilterDecision tail(Node node, int depth); +} diff --git a/k9mail/src/main/java/com/fsck/k9/message/html/HtmlProcessor.java b/k9mail/src/main/java/com/fsck/k9/message/html/HtmlProcessor.java index eb0b7886b..a300bd471 100644 --- a/k9mail/src/main/java/com/fsck/k9/message/html/HtmlProcessor.java +++ b/k9mail/src/main/java/com/fsck/k9/message/html/HtmlProcessor.java @@ -30,7 +30,7 @@ public class HtmlProcessor { HtmlConverter.cssStylePre()); } - static String toCompactString(Document document) { + public static String toCompactString(Document document) { document.outputSettings() .prettyPrint(false) .indentAmount(0); diff --git a/k9mail/src/main/java/com/fsck/k9/message/signature/HtmlSignatureRemover.java b/k9mail/src/main/java/com/fsck/k9/message/signature/HtmlSignatureRemover.java index f3af9cbcf..50a10f0d2 100644 --- a/k9mail/src/main/java/com/fsck/k9/message/signature/HtmlSignatureRemover.java +++ b/k9mail/src/main/java/com/fsck/k9/message/signature/HtmlSignatureRemover.java @@ -1,90 +1,100 @@ package com.fsck.k9.message.signature; -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; import java.util.regex.Pattern; -import timber.log.Timber; +import android.support.annotation.NonNull; -import com.fsck.k9.K9; -import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.SimpleHtmlSerializer; -import org.htmlcleaner.TagNode; +import com.fsck.k9.helper.jsoup.AdvancedNodeTraversor; +import com.fsck.k9.helper.jsoup.NodeFilter; +import com.fsck.k9.message.html.HtmlProcessor; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.parser.Tag; public class HtmlSignatureRemover { - private static final Pattern DASH_SIGNATURE_HTML = Pattern.compile("(|\r?\n)-- ", Pattern.CASE_INSENSITIVE); - private static final Pattern BLOCKQUOTE_START = Pattern.compile("", Pattern.CASE_INSENSITIVE); - - public static String stripSignature(String content) { - Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content); - if (dashSignatureHtml.find()) { - Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content); - Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content); - List start = new ArrayList<>(); - List end = new ArrayList<>(); + return new HtmlSignatureRemover().stripSignatureInternal(content); + } - while (blockquoteStart.find()) { - start.add(blockquoteStart.start()); + private String stripSignatureInternal(String content) { + Document document = Jsoup.parse(content); + + AdvancedNodeTraversor nodeTraversor = new AdvancedNodeTraversor(new StripSignatureFilter()); + nodeTraversor.filter(document.body()); + + return HtmlProcessor.toCompactString(document); + } + + + static class StripSignatureFilter implements NodeFilter { + private static final Pattern DASH_SIGNATURE_HTML = Pattern.compile("\\s*-- \\s*", Pattern.CASE_INSENSITIVE); + private static final Tag BLOCKQUOTE = Tag.valueOf("blockquote"); + private static final Tag BR = Tag.valueOf("br"); + private static final Tag P = Tag.valueOf("p"); + + + private boolean signatureFound = false; + private boolean lastElementCausedLineBreak = false; + private Element brElementPrecedingDashes; + + + @NonNull + @Override + public HeadFilterDecision head(Node node, int depth) { + if (signatureFound) { + return HeadFilterDecision.REMOVE; } - while (blockquoteEnd.find()) { - end.add(blockquoteEnd.start()); - } - if (start.size() != end.size()) { - Timber.d("There are %d
tags, but %d
tags. Refusing to strip.", - start.size(), end.size()); - } else if (start.size() > 0) { - // Ignore quoted signatures in blockquotes. - dashSignatureHtml.region(0, start.get(0)); - if (dashSignatureHtml.find()) { - // before first
. - content = content.substring(0, dashSignatureHtml.start()); - } else { - for (int i = 0; i < start.size() - 1; i++) { - // within blockquotes. - if (end.get(i) < start.get(i + 1)) { - dashSignatureHtml.region(end.get(i), start.get(i + 1)); - if (dashSignatureHtml.find()) { - content = content.substring(0, dashSignatureHtml.start()); - break; - } - } - } - if (end.get(end.size() - 1) < content.length()) { - // after last
. - dashSignatureHtml.region(end.get(end.size() - 1), content.length()); - if (dashSignatureHtml.find()) { - content = content.substring(0, dashSignatureHtml.start()); + + if (node instanceof Element) { + lastElementCausedLineBreak = false; + + Element element = (Element) node; + if (element.tag().equals(BLOCKQUOTE)) { + return HeadFilterDecision.SKIP_ENTIRELY; + } + } else if (node instanceof TextNode) { + TextNode textNode = (TextNode) node; + if (lastElementCausedLineBreak && DASH_SIGNATURE_HTML.matcher(textNode.getWholeText()).matches()) { + Node nextNode = node.nextSibling(); + if (nextNode instanceof Element && ((Element) nextNode).tag().equals(BR)) { + signatureFound = true; + if (brElementPrecedingDashes != null) { + brElementPrecedingDashes.remove(); + brElementPrecedingDashes = null; } + + return HeadFilterDecision.REMOVE; } } - } else { - // No blockquotes found. - content = content.substring(0, dashSignatureHtml.start()); } + + return HeadFilterDecision.CONTINUE; } - // Fix the stripping off of closing tags if a signature was stripped, - // as well as clean up the HTML of the quoted message. - HtmlCleaner cleaner = new HtmlCleaner(); - CleanerProperties properties = cleaner.getProperties(); + @NonNull + @Override + public TailFilterDecision tail(Node node, int depth) { + if (signatureFound) { + return TailFilterDecision.CONTINUE; + } - // see http://htmlcleaner.sourceforge.net/parameters.php for descriptions - properties.setNamespacesAware(false); - properties.setAdvancedXmlEscape(false); - properties.setOmitXmlDeclaration(true); - properties.setOmitDoctypeDeclaration(false); - properties.setTranslateSpecialEntities(false); - properties.setRecognizeUnicodeChars(false); + if (node instanceof Element) { + Element element = (Element) node; + boolean elementIsBr = element.tag().equals(BR); + if (elementIsBr || element.tag().equals(P)) { + lastElementCausedLineBreak = true; + brElementPrecedingDashes = elementIsBr ? element : null; + return TailFilterDecision.CONTINUE; + } + } - TagNode node = cleaner.clean(content); - SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties); - content = htmlSerialized.getAsString(node, "UTF8"); - return content; + lastElementCausedLineBreak = false; + return TailFilterDecision.CONTINUE; + } } } diff --git a/k9mail/src/test/java/com/fsck/k9/message/signature/HtmlSignatureRemoverTest.java b/k9mail/src/test/java/com/fsck/k9/message/signature/HtmlSignatureRemoverTest.java index 69d0c8614..89d42f099 100644 --- a/k9mail/src/test/java/com/fsck/k9/message/signature/HtmlSignatureRemoverTest.java +++ b/k9mail/src/test/java/com/fsck/k9/message/signature/HtmlSignatureRemoverTest.java @@ -3,7 +3,6 @@ package com.fsck.k9.message.signature; import com.fsck.k9.K9RobolectricTestRunner; -import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.robolectric.annotation.Config; @@ -27,7 +26,6 @@ public class HtmlSignatureRemoverTest { assertEquals("This is the body text", extractText(withoutSignature)); } - @Ignore @Test public void shouldStripSignatureFromThunderbirdStyleHtml() throws Exception { String html = "\r\n" + @@ -88,8 +86,8 @@ public class HtmlSignatureRemoverTest { assertEquals("" + "
" + "This is some quoted text" + - "
" + - "--
" + + "
" + + "--
" + "Inner signature" + "
" + "
This is the body text
" + @@ -141,7 +139,7 @@ public class HtmlSignatureRemoverTest { String withoutSignature = HtmlSignatureRemover.stripSignature(html); assertEquals("" + - "This is the body text
" + + "This is the body text
" + "
Some quote
" + "", withoutSignature);