Use jsoup in HtmlSignatureRemover
This commit is contained in:
parent
268189c1b0
commit
6d06b332a7
5 changed files with 332 additions and 74 deletions
|
@ -0,0 +1,139 @@
|
|||
/*
|
||||
* The MIT License
|
||||
*
|
||||
* © 2009-2017, Jonathan Hedley <jonathan@hedley.net>
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
package com.fsck.k9.helper.jsoup;
|
||||
|
||||
|
||||
import com.fsck.k9.helper.jsoup.NodeFilter.HeadFilterDecision;
|
||||
import com.fsck.k9.helper.jsoup.NodeFilter.TailFilterDecision;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.select.NodeTraversor;
|
||||
|
||||
|
||||
/**
|
||||
* Depth-first node traversor.
|
||||
* <p>
|
||||
* Based on {@link NodeTraversor}, but supports skipping sub trees, removing nodes, and stopping the traversal at any
|
||||
* point.
|
||||
* </p><p>
|
||||
* This is an enhancement of the <a href="https://github.com/jhy/jsoup/pull/849">jsoup pull request 'Improved node
|
||||
* traversal'</a> by <a href="https://github.com/kno10">Erich Schubert</a>.
|
||||
* </p>
|
||||
*/
|
||||
public class AdvancedNodeTraversor {
|
||||
/**
|
||||
* Filter result.
|
||||
*/
|
||||
public enum FilterResult {
|
||||
/**
|
||||
* Processing the tree was completed.
|
||||
*/
|
||||
ENDED,
|
||||
/**
|
||||
* Processing was stopped.
|
||||
*/
|
||||
STOPPED,
|
||||
/**
|
||||
* Processing the tree was completed and the root node was removed.
|
||||
*/
|
||||
ROOT_REMOVED
|
||||
}
|
||||
|
||||
private NodeFilter filter;
|
||||
|
||||
/**
|
||||
* Create a new traversor.
|
||||
*
|
||||
* @param filter
|
||||
* a class implementing the {@link NodeFilter} interface, to be called when visiting each node.
|
||||
*/
|
||||
public AdvancedNodeTraversor(NodeFilter filter) {
|
||||
this.filter = filter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Start a depth-first filtering of the root and all of its descendants.
|
||||
*
|
||||
* @param root
|
||||
* the root node point to traverse.
|
||||
*
|
||||
* @return The result of the filter operation.
|
||||
*/
|
||||
public FilterResult filter(Node root) {
|
||||
Node node = root;
|
||||
int depth = 0;
|
||||
|
||||
while (node != null) {
|
||||
HeadFilterDecision headResult = filter.head(node, depth);
|
||||
if (headResult == HeadFilterDecision.STOP) {
|
||||
return FilterResult.STOPPED;
|
||||
}
|
||||
|
||||
if (headResult == HeadFilterDecision.CONTINUE && node.childNodeSize() > 0) {
|
||||
node = node.childNode(0);
|
||||
++depth;
|
||||
continue;
|
||||
}
|
||||
|
||||
TailFilterDecision tailResult = TailFilterDecision.CONTINUE;
|
||||
while (node.nextSibling() == null && depth > 0) {
|
||||
if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) {
|
||||
tailResult = filter.tail(node, depth);
|
||||
if (tailResult == TailFilterDecision.STOP) {
|
||||
return FilterResult.STOPPED;
|
||||
}
|
||||
}
|
||||
|
||||
Node prev = node;
|
||||
node = node.parentNode();
|
||||
depth--;
|
||||
|
||||
if (headResult == HeadFilterDecision.REMOVE || tailResult == TailFilterDecision.REMOVE) {
|
||||
prev.remove();
|
||||
}
|
||||
|
||||
headResult = HeadFilterDecision.CONTINUE;
|
||||
}
|
||||
|
||||
if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) {
|
||||
tailResult = filter.tail(node, depth);
|
||||
if (tailResult == TailFilterDecision.STOP) {
|
||||
return FilterResult.STOPPED;
|
||||
}
|
||||
}
|
||||
|
||||
Node prev = node;
|
||||
node = node.nextSibling();
|
||||
|
||||
if (headResult == HeadFilterDecision.REMOVE) {
|
||||
prev.remove();
|
||||
}
|
||||
|
||||
if (prev == root) {
|
||||
return headResult == HeadFilterDecision.REMOVE ? FilterResult.ROOT_REMOVED : FilterResult.ENDED;
|
||||
}
|
||||
}
|
||||
|
||||
return FilterResult.ENDED;
|
||||
}
|
||||
}
|
111
k9mail/src/main/java/com/fsck/k9/helper/jsoup/NodeFilter.java
Normal file
111
k9mail/src/main/java/com/fsck/k9/helper/jsoup/NodeFilter.java
Normal file
|
@ -0,0 +1,111 @@
|
|||
package com.fsck.k9.helper.jsoup;
|
||||
|
||||
|
||||
import android.support.annotation.NonNull;
|
||||
|
||||
import org.jsoup.nodes.Node;
|
||||
|
||||
|
||||
/**
|
||||
* Node filter interface. Provide an implementing class to {@link AdvancedNodeTraversor} to iterate through
|
||||
* nodes.
|
||||
* <p>
|
||||
* This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first
|
||||
* seen, and the tail method when all of the node's children have been visited. As an example, head can be used to
|
||||
* create a start tag for a node, and tail to create the end tag.
|
||||
* </p>
|
||||
* <p>
|
||||
* For every node, the filter has to decide in {@link NodeFilter#head(Node, int)}) whether to
|
||||
* <ul>
|
||||
* <li>continue ({@link HeadFilterDecision#CONTINUE}),</li>
|
||||
* <li>skip all children ({@link HeadFilterDecision#SKIP_CHILDREN}),</li>
|
||||
* <li>skip node entirely ({@link HeadFilterDecision#SKIP_ENTIRELY}),</li>
|
||||
* <li>remove the subtree ({@link HeadFilterDecision#REMOVE}),</li>
|
||||
* <li>interrupt the iteration and return ({@link HeadFilterDecision#STOP}).</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* The difference between {@link HeadFilterDecision#SKIP_CHILDREN} and {@link HeadFilterDecision#SKIP_ENTIRELY} is that
|
||||
* the first will invoke {@link NodeFilter#tail(Node, int)} on the node, while the latter will not.
|
||||
* </p>
|
||||
* <p>
|
||||
* When {@link NodeFilter#tail(Node, int)} is called the filter has to decide whether to
|
||||
* <ul>
|
||||
* <li>continue ({@link TailFilterDecision#CONTINUE}),</li>
|
||||
* <li>remove the subtree ({@link TailFilterDecision#REMOVE}),</li>
|
||||
* <li>interrupt the iteration and return ({@link TailFilterDecision#STOP}).</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
public interface NodeFilter {
|
||||
/**
|
||||
* Filter decision for {@link NodeFilter#head(Node, int)}.
|
||||
*/
|
||||
enum HeadFilterDecision {
|
||||
/**
|
||||
* Continue processing the tree.
|
||||
*/
|
||||
CONTINUE,
|
||||
/**
|
||||
* Skip the child nodes, but do call {@link NodeFilter#tail(Node, int)} next.
|
||||
*/
|
||||
SKIP_CHILDREN,
|
||||
/**
|
||||
* Skip the subtree, and do not call {@link NodeFilter#tail(Node, int)}.
|
||||
*/
|
||||
SKIP_ENTIRELY,
|
||||
/**
|
||||
* Remove the node and its children, and do not call {@link NodeFilter#tail(Node, int)}.
|
||||
*/
|
||||
REMOVE,
|
||||
/**
|
||||
* Stop processing.
|
||||
*/
|
||||
STOP
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter decision for {@link NodeFilter#tail(Node, int)}.
|
||||
*/
|
||||
enum TailFilterDecision {
|
||||
/**
|
||||
* Continue processing the tree.
|
||||
*/
|
||||
CONTINUE,
|
||||
/**
|
||||
* Remove the node and its children.
|
||||
*/
|
||||
REMOVE,
|
||||
/**
|
||||
* Stop processing.
|
||||
*/
|
||||
STOP
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback for when a node is first visited.
|
||||
*
|
||||
* @param node
|
||||
* the node being visited.
|
||||
* @param depth
|
||||
* the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
|
||||
* of that will have depth 1.
|
||||
*
|
||||
* @return Filter decision
|
||||
*/
|
||||
@NonNull
|
||||
HeadFilterDecision head(Node node, int depth);
|
||||
|
||||
/**
|
||||
* Callback for when a node is last visited, after all of its descendants have been visited.
|
||||
*
|
||||
* @param node
|
||||
* the node being visited.
|
||||
* @param depth
|
||||
* the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
|
||||
* of that will have depth 1.
|
||||
*
|
||||
* @return Filter decision
|
||||
*/
|
||||
@NonNull
|
||||
TailFilterDecision tail(Node node, int depth);
|
||||
}
|
|
@ -30,7 +30,7 @@ public class HtmlProcessor {
|
|||
HtmlConverter.cssStylePre());
|
||||
}
|
||||
|
||||
static String toCompactString(Document document) {
|
||||
public static String toCompactString(Document document) {
|
||||
document.outputSettings()
|
||||
.prettyPrint(false)
|
||||
.indentAmount(0);
|
||||
|
|
|
@ -1,90 +1,100 @@
|
|||
package com.fsck.k9.message.signature;
|
||||
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import timber.log.Timber;
|
||||
import android.support.annotation.NonNull;
|
||||
|
||||
import com.fsck.k9.K9;
|
||||
import org.htmlcleaner.CleanerProperties;
|
||||
import org.htmlcleaner.HtmlCleaner;
|
||||
import org.htmlcleaner.SimpleHtmlSerializer;
|
||||
import org.htmlcleaner.TagNode;
|
||||
import com.fsck.k9.helper.jsoup.AdvancedNodeTraversor;
|
||||
import com.fsck.k9.helper.jsoup.NodeFilter;
|
||||
import com.fsck.k9.message.html.HtmlProcessor;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.nodes.TextNode;
|
||||
import org.jsoup.parser.Tag;
|
||||
|
||||
|
||||
public class HtmlSignatureRemover {
|
||||
private static final Pattern DASH_SIGNATURE_HTML = Pattern.compile("(<br( /)?>|\r?\n)-- <br( /)?>", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern BLOCKQUOTE_START = Pattern.compile("<blockquote", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern BLOCKQUOTE_END = Pattern.compile("</blockquote>", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
|
||||
public static String stripSignature(String content) {
|
||||
Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
|
||||
if (dashSignatureHtml.find()) {
|
||||
Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
|
||||
Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
|
||||
List<Integer> start = new ArrayList<>();
|
||||
List<Integer> end = new ArrayList<>();
|
||||
return new HtmlSignatureRemover().stripSignatureInternal(content);
|
||||
}
|
||||
|
||||
while (blockquoteStart.find()) {
|
||||
start.add(blockquoteStart.start());
|
||||
private String stripSignatureInternal(String content) {
|
||||
Document document = Jsoup.parse(content);
|
||||
|
||||
AdvancedNodeTraversor nodeTraversor = new AdvancedNodeTraversor(new StripSignatureFilter());
|
||||
nodeTraversor.filter(document.body());
|
||||
|
||||
return HtmlProcessor.toCompactString(document);
|
||||
}
|
||||
while (blockquoteEnd.find()) {
|
||||
end.add(blockquoteEnd.start());
|
||||
|
||||
|
||||
static class StripSignatureFilter implements NodeFilter {
|
||||
private static final Pattern DASH_SIGNATURE_HTML = Pattern.compile("\\s*-- \\s*", Pattern.CASE_INSENSITIVE);
|
||||
private static final Tag BLOCKQUOTE = Tag.valueOf("blockquote");
|
||||
private static final Tag BR = Tag.valueOf("br");
|
||||
private static final Tag P = Tag.valueOf("p");
|
||||
|
||||
|
||||
private boolean signatureFound = false;
|
||||
private boolean lastElementCausedLineBreak = false;
|
||||
private Element brElementPrecedingDashes;
|
||||
|
||||
|
||||
@NonNull
|
||||
@Override
|
||||
public HeadFilterDecision head(Node node, int depth) {
|
||||
if (signatureFound) {
|
||||
return HeadFilterDecision.REMOVE;
|
||||
}
|
||||
if (start.size() != end.size()) {
|
||||
Timber.d("There are %d <blockquote> tags, but %d </blockquote> tags. Refusing to strip.",
|
||||
start.size(), end.size());
|
||||
} else if (start.size() > 0) {
|
||||
// Ignore quoted signatures in blockquotes.
|
||||
dashSignatureHtml.region(0, start.get(0));
|
||||
if (dashSignatureHtml.find()) {
|
||||
// before first <blockquote>.
|
||||
content = content.substring(0, dashSignatureHtml.start());
|
||||
} else {
|
||||
for (int i = 0; i < start.size() - 1; i++) {
|
||||
// within blockquotes.
|
||||
if (end.get(i) < start.get(i + 1)) {
|
||||
dashSignatureHtml.region(end.get(i), start.get(i + 1));
|
||||
if (dashSignatureHtml.find()) {
|
||||
content = content.substring(0, dashSignatureHtml.start());
|
||||
break;
|
||||
|
||||
if (node instanceof Element) {
|
||||
lastElementCausedLineBreak = false;
|
||||
|
||||
Element element = (Element) node;
|
||||
if (element.tag().equals(BLOCKQUOTE)) {
|
||||
return HeadFilterDecision.SKIP_ENTIRELY;
|
||||
}
|
||||
} else if (node instanceof TextNode) {
|
||||
TextNode textNode = (TextNode) node;
|
||||
if (lastElementCausedLineBreak && DASH_SIGNATURE_HTML.matcher(textNode.getWholeText()).matches()) {
|
||||
Node nextNode = node.nextSibling();
|
||||
if (nextNode instanceof Element && ((Element) nextNode).tag().equals(BR)) {
|
||||
signatureFound = true;
|
||||
if (brElementPrecedingDashes != null) {
|
||||
brElementPrecedingDashes.remove();
|
||||
brElementPrecedingDashes = null;
|
||||
}
|
||||
|
||||
return HeadFilterDecision.REMOVE;
|
||||
}
|
||||
if (end.get(end.size() - 1) < content.length()) {
|
||||
// after last </blockquote>.
|
||||
dashSignatureHtml.region(end.get(end.size() - 1), content.length());
|
||||
if (dashSignatureHtml.find()) {
|
||||
content = content.substring(0, dashSignatureHtml.start());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No blockquotes found.
|
||||
content = content.substring(0, dashSignatureHtml.start());
|
||||
}
|
||||
}
|
||||
|
||||
// Fix the stripping off of closing tags if a signature was stripped,
|
||||
// as well as clean up the HTML of the quoted message.
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
return HeadFilterDecision.CONTINUE;
|
||||
}
|
||||
|
||||
// see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
|
||||
properties.setNamespacesAware(false);
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
properties.setOmitXmlDeclaration(true);
|
||||
properties.setOmitDoctypeDeclaration(false);
|
||||
properties.setTranslateSpecialEntities(false);
|
||||
properties.setRecognizeUnicodeChars(false);
|
||||
@NonNull
|
||||
@Override
|
||||
public TailFilterDecision tail(Node node, int depth) {
|
||||
if (signatureFound) {
|
||||
return TailFilterDecision.CONTINUE;
|
||||
}
|
||||
|
||||
TagNode node = cleaner.clean(content);
|
||||
SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
|
||||
content = htmlSerialized.getAsString(node, "UTF8");
|
||||
return content;
|
||||
if (node instanceof Element) {
|
||||
Element element = (Element) node;
|
||||
boolean elementIsBr = element.tag().equals(BR);
|
||||
if (elementIsBr || element.tag().equals(P)) {
|
||||
lastElementCausedLineBreak = true;
|
||||
brElementPrecedingDashes = elementIsBr ? element : null;
|
||||
return TailFilterDecision.CONTINUE;
|
||||
}
|
||||
}
|
||||
|
||||
lastElementCausedLineBreak = false;
|
||||
return TailFilterDecision.CONTINUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,7 +3,6 @@ package com.fsck.k9.message.signature;
|
|||
|
||||
import com.fsck.k9.K9RobolectricTestRunner;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.robolectric.annotation.Config;
|
||||
|
@ -27,7 +26,6 @@ public class HtmlSignatureRemoverTest {
|
|||
assertEquals("This is the body text", extractText(withoutSignature));
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void shouldStripSignatureFromThunderbirdStyleHtml() throws Exception {
|
||||
String html = "<html>\r\n" +
|
||||
|
@ -88,8 +86,8 @@ public class HtmlSignatureRemoverTest {
|
|||
assertEquals("<html><head></head><body>" +
|
||||
"<blockquote>" +
|
||||
"This is some quoted text" +
|
||||
"<br />" +
|
||||
"-- <br />" +
|
||||
"<br>" +
|
||||
"-- <br>" +
|
||||
"Inner signature" +
|
||||
"</blockquote>" +
|
||||
"<div>This is the body text</div>" +
|
||||
|
@ -141,7 +139,7 @@ public class HtmlSignatureRemoverTest {
|
|||
String withoutSignature = HtmlSignatureRemover.stripSignature(html);
|
||||
|
||||
assertEquals("<html><head></head><body>" +
|
||||
"This is the body text<br />" +
|
||||
"This is the body text<br>" +
|
||||
"<blockquote>Some quote</blockquote>" +
|
||||
"</body></html>",
|
||||
withoutSignature);
|
||||
|
|
Loading…
Reference in a new issue