From 6d06b332a70626d8d56e4316869a14631eeb4c41 Mon Sep 17 00:00:00 2001
From: cketti
Date: Sun, 30 Apr 2017 01:13:19 +0200
Subject: [PATCH] Use jsoup in HtmlSignatureRemover
---
.../helper/jsoup/AdvancedNodeTraversor.java | 139 +++++++++++++++++
.../com/fsck/k9/helper/jsoup/NodeFilter.java | 111 +++++++++++++
.../fsck/k9/message/html/HtmlProcessor.java | 2 +-
.../signature/HtmlSignatureRemover.java | 146 ++++++++++--------
.../signature/HtmlSignatureRemoverTest.java | 8 +-
5 files changed, 332 insertions(+), 74 deletions(-)
create mode 100644 k9mail/src/main/java/com/fsck/k9/helper/jsoup/AdvancedNodeTraversor.java
create mode 100644 k9mail/src/main/java/com/fsck/k9/helper/jsoup/NodeFilter.java
diff --git a/k9mail/src/main/java/com/fsck/k9/helper/jsoup/AdvancedNodeTraversor.java b/k9mail/src/main/java/com/fsck/k9/helper/jsoup/AdvancedNodeTraversor.java
new file mode 100644
index 000000000..cfcab37a4
--- /dev/null
+++ b/k9mail/src/main/java/com/fsck/k9/helper/jsoup/AdvancedNodeTraversor.java
@@ -0,0 +1,139 @@
+/*
+ * The MIT License
+ *
+ * © 2009-2017, Jonathan Hedley
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package com.fsck.k9.helper.jsoup;
+
+
+import com.fsck.k9.helper.jsoup.NodeFilter.HeadFilterDecision;
+import com.fsck.k9.helper.jsoup.NodeFilter.TailFilterDecision;
+import org.jsoup.nodes.Node;
+import org.jsoup.select.NodeTraversor;
+
+
+/**
+ * Depth-first node traversor.
+ *
+ * Based on {@link NodeTraversor}, but supports skipping sub trees, removing nodes, and stopping the traversal at any
+ * point.
+ *
+ * This is an enhancement of the jsoup pull request 'Improved node
+ * traversal' by Erich Schubert.
+ *
+ */
+public class AdvancedNodeTraversor {
+ /**
+ * Filter result.
+ */
+ public enum FilterResult {
+ /**
+ * Processing the tree was completed.
+ */
+ ENDED,
+ /**
+ * Processing was stopped.
+ */
+ STOPPED,
+ /**
+ * Processing the tree was completed and the root node was removed.
+ */
+ ROOT_REMOVED
+ }
+
+ private NodeFilter filter;
+
+ /**
+ * Create a new traversor.
+ *
+ * @param filter
+ * a class implementing the {@link NodeFilter} interface, to be called when visiting each node.
+ */
+ public AdvancedNodeTraversor(NodeFilter filter) {
+ this.filter = filter;
+ }
+
+ /**
+ * Start a depth-first filtering of the root and all of its descendants.
+ *
+ * @param root
+ * the root node point to traverse.
+ *
+ * @return The result of the filter operation.
+ */
+ public FilterResult filter(Node root) {
+ Node node = root;
+ int depth = 0;
+
+ while (node != null) {
+ HeadFilterDecision headResult = filter.head(node, depth);
+ if (headResult == HeadFilterDecision.STOP) {
+ return FilterResult.STOPPED;
+ }
+
+ if (headResult == HeadFilterDecision.CONTINUE && node.childNodeSize() > 0) {
+ node = node.childNode(0);
+ ++depth;
+ continue;
+ }
+
+ TailFilterDecision tailResult = TailFilterDecision.CONTINUE;
+ while (node.nextSibling() == null && depth > 0) {
+ if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) {
+ tailResult = filter.tail(node, depth);
+ if (tailResult == TailFilterDecision.STOP) {
+ return FilterResult.STOPPED;
+ }
+ }
+
+ Node prev = node;
+ node = node.parentNode();
+ depth--;
+
+ if (headResult == HeadFilterDecision.REMOVE || tailResult == TailFilterDecision.REMOVE) {
+ prev.remove();
+ }
+
+ headResult = HeadFilterDecision.CONTINUE;
+ }
+
+ if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) {
+ tailResult = filter.tail(node, depth);
+ if (tailResult == TailFilterDecision.STOP) {
+ return FilterResult.STOPPED;
+ }
+ }
+
+ Node prev = node;
+ node = node.nextSibling();
+
+ if (headResult == HeadFilterDecision.REMOVE) {
+ prev.remove();
+ }
+
+ if (prev == root) {
+ return headResult == HeadFilterDecision.REMOVE ? FilterResult.ROOT_REMOVED : FilterResult.ENDED;
+ }
+ }
+
+ return FilterResult.ENDED;
+ }
+}
diff --git a/k9mail/src/main/java/com/fsck/k9/helper/jsoup/NodeFilter.java b/k9mail/src/main/java/com/fsck/k9/helper/jsoup/NodeFilter.java
new file mode 100644
index 000000000..8fb22e7d5
--- /dev/null
+++ b/k9mail/src/main/java/com/fsck/k9/helper/jsoup/NodeFilter.java
@@ -0,0 +1,111 @@
+package com.fsck.k9.helper.jsoup;
+
+
+import android.support.annotation.NonNull;
+
+import org.jsoup.nodes.Node;
+
+
+/**
+ * Node filter interface. Provide an implementing class to {@link AdvancedNodeTraversor} to iterate through
+ * nodes.
+ *
+ * This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first
+ * seen, and the tail method when all of the node's children have been visited. As an example, head can be used to
+ * create a start tag for a node, and tail to create the end tag.
+ *
+ *
+ * For every node, the filter has to decide in {@link NodeFilter#head(Node, int)}) whether to
+ *
+ * - continue ({@link HeadFilterDecision#CONTINUE}),
+ * - skip all children ({@link HeadFilterDecision#SKIP_CHILDREN}),
+ * - skip node entirely ({@link HeadFilterDecision#SKIP_ENTIRELY}),
+ * - remove the subtree ({@link HeadFilterDecision#REMOVE}),
+ * - interrupt the iteration and return ({@link HeadFilterDecision#STOP}).
+ *
+ *
+ * The difference between {@link HeadFilterDecision#SKIP_CHILDREN} and {@link HeadFilterDecision#SKIP_ENTIRELY} is that
+ * the first will invoke {@link NodeFilter#tail(Node, int)} on the node, while the latter will not.
+ *
+ *
+ * When {@link NodeFilter#tail(Node, int)} is called the filter has to decide whether to
+ *
+ * - continue ({@link TailFilterDecision#CONTINUE}),
+ * - remove the subtree ({@link TailFilterDecision#REMOVE}),
+ * - interrupt the iteration and return ({@link TailFilterDecision#STOP}).
+ *
+ *
+ */
+public interface NodeFilter {
+ /**
+ * Filter decision for {@link NodeFilter#head(Node, int)}.
+ */
+ enum HeadFilterDecision {
+ /**
+ * Continue processing the tree.
+ */
+ CONTINUE,
+ /**
+ * Skip the child nodes, but do call {@link NodeFilter#tail(Node, int)} next.
+ */
+ SKIP_CHILDREN,
+ /**
+ * Skip the subtree, and do not call {@link NodeFilter#tail(Node, int)}.
+ */
+ SKIP_ENTIRELY,
+ /**
+ * Remove the node and its children, and do not call {@link NodeFilter#tail(Node, int)}.
+ */
+ REMOVE,
+ /**
+ * Stop processing.
+ */
+ STOP
+ }
+
+ /**
+ * Filter decision for {@link NodeFilter#tail(Node, int)}.
+ */
+ enum TailFilterDecision {
+ /**
+ * Continue processing the tree.
+ */
+ CONTINUE,
+ /**
+ * Remove the node and its children.
+ */
+ REMOVE,
+ /**
+ * Stop processing.
+ */
+ STOP
+ }
+
+ /**
+ * Callback for when a node is first visited.
+ *
+ * @param node
+ * the node being visited.
+ * @param depth
+ * the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
+ * of that will have depth 1.
+ *
+ * @return Filter decision
+ */
+ @NonNull
+ HeadFilterDecision head(Node node, int depth);
+
+ /**
+ * Callback for when a node is last visited, after all of its descendants have been visited.
+ *
+ * @param node
+ * the node being visited.
+ * @param depth
+ * the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
+ * of that will have depth 1.
+ *
+ * @return Filter decision
+ */
+ @NonNull
+ TailFilterDecision tail(Node node, int depth);
+}
diff --git a/k9mail/src/main/java/com/fsck/k9/message/html/HtmlProcessor.java b/k9mail/src/main/java/com/fsck/k9/message/html/HtmlProcessor.java
index eb0b7886b..a300bd471 100644
--- a/k9mail/src/main/java/com/fsck/k9/message/html/HtmlProcessor.java
+++ b/k9mail/src/main/java/com/fsck/k9/message/html/HtmlProcessor.java
@@ -30,7 +30,7 @@ public class HtmlProcessor {
HtmlConverter.cssStylePre());
}
- static String toCompactString(Document document) {
+ public static String toCompactString(Document document) {
document.outputSettings()
.prettyPrint(false)
.indentAmount(0);
diff --git a/k9mail/src/main/java/com/fsck/k9/message/signature/HtmlSignatureRemover.java b/k9mail/src/main/java/com/fsck/k9/message/signature/HtmlSignatureRemover.java
index f3af9cbcf..50a10f0d2 100644
--- a/k9mail/src/main/java/com/fsck/k9/message/signature/HtmlSignatureRemover.java
+++ b/k9mail/src/main/java/com/fsck/k9/message/signature/HtmlSignatureRemover.java
@@ -1,90 +1,100 @@
package com.fsck.k9.message.signature;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import timber.log.Timber;
+import android.support.annotation.NonNull;
-import com.fsck.k9.K9;
-import org.htmlcleaner.CleanerProperties;
-import org.htmlcleaner.HtmlCleaner;
-import org.htmlcleaner.SimpleHtmlSerializer;
-import org.htmlcleaner.TagNode;
+import com.fsck.k9.helper.jsoup.AdvancedNodeTraversor;
+import com.fsck.k9.helper.jsoup.NodeFilter;
+import com.fsck.k9.message.html.HtmlProcessor;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.parser.Tag;
public class HtmlSignatureRemover {
- private static final Pattern DASH_SIGNATURE_HTML = Pattern.compile("(
|\r?\n)--
", Pattern.CASE_INSENSITIVE);
- private static final Pattern BLOCKQUOTE_START = Pattern.compile("", Pattern.CASE_INSENSITIVE);
-
-
public static String stripSignature(String content) {
- Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
- if (dashSignatureHtml.find()) {
- Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
- Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
- List start = new ArrayList<>();
- List end = new ArrayList<>();
+ return new HtmlSignatureRemover().stripSignatureInternal(content);
+ }
- while (blockquoteStart.find()) {
- start.add(blockquoteStart.start());
+ private String stripSignatureInternal(String content) {
+ Document document = Jsoup.parse(content);
+
+ AdvancedNodeTraversor nodeTraversor = new AdvancedNodeTraversor(new StripSignatureFilter());
+ nodeTraversor.filter(document.body());
+
+ return HtmlProcessor.toCompactString(document);
+ }
+
+
+ static class StripSignatureFilter implements NodeFilter {
+ private static final Pattern DASH_SIGNATURE_HTML = Pattern.compile("\\s*-- \\s*", Pattern.CASE_INSENSITIVE);
+ private static final Tag BLOCKQUOTE = Tag.valueOf("blockquote");
+ private static final Tag BR = Tag.valueOf("br");
+ private static final Tag P = Tag.valueOf("p");
+
+
+ private boolean signatureFound = false;
+ private boolean lastElementCausedLineBreak = false;
+ private Element brElementPrecedingDashes;
+
+
+ @NonNull
+ @Override
+ public HeadFilterDecision head(Node node, int depth) {
+ if (signatureFound) {
+ return HeadFilterDecision.REMOVE;
}
- while (blockquoteEnd.find()) {
- end.add(blockquoteEnd.start());
- }
- if (start.size() != end.size()) {
- Timber.d("There are %d tags, but %d
tags. Refusing to strip.",
- start.size(), end.size());
- } else if (start.size() > 0) {
- // Ignore quoted signatures in blockquotes.
- dashSignatureHtml.region(0, start.get(0));
- if (dashSignatureHtml.find()) {
- // before first .
- content = content.substring(0, dashSignatureHtml.start());
- } else {
- for (int i = 0; i < start.size() - 1; i++) {
- // within blockquotes.
- if (end.get(i) < start.get(i + 1)) {
- dashSignatureHtml.region(end.get(i), start.get(i + 1));
- if (dashSignatureHtml.find()) {
- content = content.substring(0, dashSignatureHtml.start());
- break;
- }
- }
- }
- if (end.get(end.size() - 1) < content.length()) {
- // after last
.
- dashSignatureHtml.region(end.get(end.size() - 1), content.length());
- if (dashSignatureHtml.find()) {
- content = content.substring(0, dashSignatureHtml.start());
+
+ if (node instanceof Element) {
+ lastElementCausedLineBreak = false;
+
+ Element element = (Element) node;
+ if (element.tag().equals(BLOCKQUOTE)) {
+ return HeadFilterDecision.SKIP_ENTIRELY;
+ }
+ } else if (node instanceof TextNode) {
+ TextNode textNode = (TextNode) node;
+ if (lastElementCausedLineBreak && DASH_SIGNATURE_HTML.matcher(textNode.getWholeText()).matches()) {
+ Node nextNode = node.nextSibling();
+ if (nextNode instanceof Element && ((Element) nextNode).tag().equals(BR)) {
+ signatureFound = true;
+ if (brElementPrecedingDashes != null) {
+ brElementPrecedingDashes.remove();
+ brElementPrecedingDashes = null;
}
+
+ return HeadFilterDecision.REMOVE;
}
}
- } else {
- // No blockquotes found.
- content = content.substring(0, dashSignatureHtml.start());
}
+
+ return HeadFilterDecision.CONTINUE;
}
- // Fix the stripping off of closing tags if a signature was stripped,
- // as well as clean up the HTML of the quoted message.
- HtmlCleaner cleaner = new HtmlCleaner();
- CleanerProperties properties = cleaner.getProperties();
+ @NonNull
+ @Override
+ public TailFilterDecision tail(Node node, int depth) {
+ if (signatureFound) {
+ return TailFilterDecision.CONTINUE;
+ }
- // see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
- properties.setNamespacesAware(false);
- properties.setAdvancedXmlEscape(false);
- properties.setOmitXmlDeclaration(true);
- properties.setOmitDoctypeDeclaration(false);
- properties.setTranslateSpecialEntities(false);
- properties.setRecognizeUnicodeChars(false);
+ if (node instanceof Element) {
+ Element element = (Element) node;
+ boolean elementIsBr = element.tag().equals(BR);
+ if (elementIsBr || element.tag().equals(P)) {
+ lastElementCausedLineBreak = true;
+ brElementPrecedingDashes = elementIsBr ? element : null;
+ return TailFilterDecision.CONTINUE;
+ }
+ }
- TagNode node = cleaner.clean(content);
- SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
- content = htmlSerialized.getAsString(node, "UTF8");
- return content;
+ lastElementCausedLineBreak = false;
+ return TailFilterDecision.CONTINUE;
+ }
}
}
diff --git a/k9mail/src/test/java/com/fsck/k9/message/signature/HtmlSignatureRemoverTest.java b/k9mail/src/test/java/com/fsck/k9/message/signature/HtmlSignatureRemoverTest.java
index 69d0c8614..89d42f099 100644
--- a/k9mail/src/test/java/com/fsck/k9/message/signature/HtmlSignatureRemoverTest.java
+++ b/k9mail/src/test/java/com/fsck/k9/message/signature/HtmlSignatureRemoverTest.java
@@ -3,7 +3,6 @@ package com.fsck.k9.message.signature;
import com.fsck.k9.K9RobolectricTestRunner;
-import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.robolectric.annotation.Config;
@@ -27,7 +26,6 @@ public class HtmlSignatureRemoverTest {
assertEquals("This is the body text", extractText(withoutSignature));
}
- @Ignore
@Test
public void shouldStripSignatureFromThunderbirdStyleHtml() throws Exception {
String html = "\r\n" +
@@ -88,8 +86,8 @@ public class HtmlSignatureRemoverTest {
assertEquals("" +
"" +
"This is some quoted text" +
- "
" +
- "--
" +
+ "
" +
+ "--
" +
"Inner signature" +
"
" +
"This is the body text
" +
@@ -141,7 +139,7 @@ public class HtmlSignatureRemoverTest {
String withoutSignature = HtmlSignatureRemover.stripSignature(html);
assertEquals("" +
- "This is the body text
" +
+ "This is the body text
" +
"Some quote
" +
"",
withoutSignature);