Use a different method to convert HTML to plain text

Html.fromHtml() exhibited some serious performance issues with certain
inputs.

See issue #3624
This commit is contained in:
cketti 2018-10-03 12:05:44 +02:00
parent 300076c5dd
commit b1cfa302ba
2 changed files with 128 additions and 85 deletions

View file

@ -1,19 +1,14 @@
package com.fsck.k9.message.html;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import android.text.Annotation;
import android.text.Editable;
import android.text.Html;
import android.text.Html.TagHandler;
import android.text.Spannable;
import android.text.Spanned;
import com.fsck.k9.K9;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.xml.sax.XMLReader;
/**
@ -42,88 +37,12 @@ public class HtmlConverter {
* @return Plain text result.
*/
public static String htmlToText(final String html) {
// NOTE(review): this span appears to be rendered from a diff, so the old and the new
// implementation sit back to back and the method is not valid Java as shown -- confirm
// against the real file before relying on it.
// Presumably the removed implementation: Android's Html.fromHtml() with the custom tag handler.
return Html.fromHtml(html, null, new HtmlToTextTagHandler()).toString()
// Presumably the added implementation: parse the markup with Jsoup and render the document
// body through HtmlToPlainText.
Document document = Jsoup.parse(html);
return HtmlToPlainText.toPlainText(document.body())
// Substitute the preview-object and NBSP characters with their replacement constants.
.replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT)
.replace(NBSP_CHARACTER, NBSP_REPLACEMENT);
}
/**
 * Tag handler passed to {@code Html.fromHtml()} when an HTML message is converted to text.
 *
 * <p>It renders {@code <hr>} as a line of underscores and removes everything that was emitted
 * between the opening and closing tag of STYLE, SCRIPT and TITLE elements and of comments.</p>
 */
private static class HtmlToTextTagHandler implements Html.TagHandler {
    // Key/value pair identifying the marker spans created by this handler.
    private static final String IGNORED_ANNOTATION_KEY = "K9_ANNOTATION";
    private static final String IGNORED_ANNOTATION_VALUE = "hiddenSpan";

    // Lower-case names of the tags whose content must not show up in the output.
    private static final Set<String> TAGS_WITH_IGNORED_CONTENT;

    static {
        Set<String> ignoredTags = new HashSet<>();
        // "!" is the tag name reported for comments.
        Collections.addAll(ignoredTags, "style", "script", "title", "!");
        TAGS_WITH_IGNORED_CONTENT = Collections.unmodifiableSet(ignoredTags);
    }

    @Override
    public void handleTag(boolean opening, String tag, Editable output, XMLReader xmlReader) {
        String normalizedTag = tag.toLowerCase(Locale.US);

        boolean isOpeningRule = opening && normalizedTag.equals("hr");
        if (isOpeningRule) {
            // An <hr> becomes a run of underscores, roughly what Outlook does in Rich Text mode.
            output.append("_____________________________________________\r\n");
            return;
        }

        if (TAGS_WITH_IGNORED_CONTENT.contains(normalizedTag)) {
            handleIgnoredTag(opening, output);
        }
    }

    /**
     * Hide the content of an ignored tag.
     *
     * <p>The opening tag drops a zero-length {@link Annotation} marker at the current end of the
     * output; the closing tag looks that marker up again and deletes everything from the marker
     * to the end of the output. The marker's key and value are checked on lookup as a precaution
     * in case {@code Html.fromHtml()} ever starts producing Annotation spans of its own.</p>
     *
     * @param opening Whether the tag being handled is an opening tag.
     * @param output The text generated so far.
     */
    private void handleIgnoredTag(boolean opening, Editable output) {
        int end = output.length();

        if (opening) {
            Annotation marker = new Annotation(IGNORED_ANNOTATION_KEY, IGNORED_ANNOTATION_VALUE);
            output.setSpan(marker, end, end, Spannable.SPAN_MARK_MARK);
            return;
        }

        Object openingMarker = getOpeningAnnotation(output);
        if (openingMarker == null) {
            // No matching opening marker, nothing to hide.
            return;
        }

        int start = output.getSpanStart(openingMarker);
        // The marker is only temporary; drop it before cutting the hidden text.
        output.removeSpan(openingMarker);
        output.delete(start, end);
    }

    /**
     * Find the last Annotation span in the output that carries this handler's flags, key and
     * value.
     *
     * @param output The text generated so far.
     * @return The matching marker span, or {@code null} if there is none.
     */
    private Object getOpeningAnnotation(Editable output) {
        Annotation[] annotations = output.getSpans(0, output.length(), Annotation.class);

        int index = annotations.length;
        while (--index >= 0) {
            Annotation candidate = annotations[index];
            boolean isMarker = output.getSpanFlags(candidate) == Spannable.SPAN_MARK_MARK
                    && candidate.getKey().equals(IGNORED_ANNOTATION_KEY)
                    && candidate.getValue().equals(IGNORED_ANNOTATION_VALUE);
            if (isMarker) {
                return candidate;
            }
        }

        return null;
    }
}
/**
* Convert a text string into an HTML document.
*

View file

@ -0,0 +1,124 @@
package com.fsck.k9.message.html
import org.jsoup.nodes.Element
import org.jsoup.nodes.Node
import org.jsoup.nodes.TextNode
import org.jsoup.select.NodeTraversor
import org.jsoup.select.NodeVisitor
/**
 * Renders an HTML element tree as plain text.
 *
 * Derived from the HtmlToPlainText example that ships with Jsoup.
 */
object HtmlToPlainText {
    /**
     * Walks [element] and all of its descendants with a fresh formatting visitor and returns the
     * text that visitor collected.
     */
    @JvmStatic
    fun toPlainText(element: Element): String =
        FormattingVisitor()
            .also { visitor -> NodeTraversor.traverse(visitor, element) }
            .toString()
}
/**
 * [NodeVisitor] that collects a plain-text rendering of the nodes it is shown.
 *
 * The text of text nodes is appended, list items are put on their own line with a `"* "` prefix,
 * block elements are separated from each other by line breaks, and links are followed by their
 * `href` in angle brackets. Text that would push a line past [MAX_WIDTH] characters is wrapped at
 * whitespace. Call [toString] after the traversal to obtain the result.
 */
private class FormattingVisitor : NodeVisitor {
    /** Number of characters written to [output] since its last line break. */
    private var width = 0
    private val output = StringBuilder()

    /** Called when a node is entered: emits text, list bullets and block-level line breaks. */
    override fun head(node: Node, depth: Int) {
        val name = node.nodeName()
        when {
            node is TextNode -> append(node.text())
            name == "li" -> {
                startNewLine()
                append("* ")
            }
            node is Element && node.isBlock -> startNewLine()
        }
    }

    /** Called when a node is left: terminates list items and blocks, and appends link targets. */
    override fun tail(node: Node, depth: Int) {
        val name = node.nodeName()
        when {
            name == "li" -> append("\n")
            node is Element && node.isBlock -> {
                // Blocks without any text do not get a separating empty line.
                if (node.hasText()) {
                    addEmptyLine()
                }
            }
            name == "a" -> {
                // The link is only shown when Jsoup can resolve it to an absolute URL, but the
                // attribute value is printed exactly as it appears in the markup.
                if (node.absUrl("href").isNotEmpty()) {
                    append(" <${node.attr("href")}>")
                }
            }
        }
    }

    /**
     * Appends [text] to the output, wrapping at whitespace when it does not fit on the current
     * line and keeping [width] in sync with the current column.
     */
    private fun append(text: String) {
        if (text.startsWith("\n")) {
            width = 0
        }

        // Do not pile up spaces at the start of the output, after a space or after a line break.
        if (text == " " && (output.isEmpty() || output.last() == ' ' || output.last() == '\n')) {
            return
        }

        if (text.length + width > MAX_WIDTH) {
            val words = text.split(WHITESPACE)
            for ((index, rawWord) in words.withIndex()) {
                // Every word except the last one is followed by a single space.
                val word = if (index == words.lastIndex) rawWord else "$rawWord "

                // Breaking at column 0 cannot make an over-long word fit; it would only add a
                // blank line (or a leading line break when the output is still empty).
                if (width > 0 && word.length + width > MAX_WIDTH) {
                    output.append("\n").append(word)
                    width = word.length
                } else {
                    output.append(word)
                    width += word.length
                }
            }
        } else {
            output.append(text)
            // Only the characters after the last line break count towards the current column;
            // counting the line break itself made every following line wrap one character early.
            val lastLineBreak = text.lastIndexOf('\n')
            width = if (lastLineBreak >= 0) text.length - lastLineBreak - 1 else width + text.length
        }
    }

    /** Ends the current line unless the output is empty or already at the start of a line. */
    private fun startNewLine() {
        if (output.isEmpty() || output.last() == '\n') {
            return
        }

        append("\n")
    }

    /** Makes sure the output ends with an empty line, unless the output is still empty. */
    private fun addEmptyLine() {
        if (output.isEmpty() || output.endsWith("\n\n")) {
            return
        }

        startNewLine()
        append("\n")
    }

    /** Returns the collected text without trailing line breaks. */
    override fun toString(): String = output.trimEnd('\n').toString()

    companion object {
        /** Maximum number of characters per line before text is wrapped. */
        private const val MAX_WIDTH = 76

        /** Separator used to split text into words; compiled once instead of on every wrap. */
        private val WHITESPACE = Regex("\\s+")
    }
}