Use a different method to convert HTML to plain text
Html.fromHtml() exhibited some serious performance issues with certain inputs. See issue #3624.
This commit is contained in:
parent
300076c5dd
commit
b1cfa302ba
2 changed files with 128 additions and 85 deletions
|
@ -1,19 +1,14 @@
|
|||
package com.fsck.k9.message.html;
|
||||
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
|
||||
import android.text.Annotation;
|
||||
import android.text.Editable;
|
||||
import android.text.Html;
|
||||
import android.text.Html.TagHandler;
|
||||
import android.text.Spannable;
|
||||
import android.text.Spanned;
|
||||
|
||||
import com.fsck.k9.K9;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.xml.sax.XMLReader;
|
||||
|
||||
/**
|
||||
|
@ -42,88 +37,12 @@ public class HtmlConverter {
|
|||
* @return Plain text result.
|
||||
*/
|
||||
public static String htmlToText(final String html) {
|
||||
return Html.fromHtml(html, null, new HtmlToTextTagHandler()).toString()
|
||||
Document document = Jsoup.parse(html);
|
||||
return HtmlToPlainText.toPlainText(document.body())
|
||||
.replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT)
|
||||
.replace(NBSP_CHARACTER, NBSP_REPLACEMENT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Custom tag handler to use when converting HTML messages to text. It currently handles text
|
||||
* representations of HTML tags that Android's built-in parser doesn't understand and hides code
|
||||
* contained in STYLE and SCRIPT blocks.
|
||||
*/
|
||||
private static class HtmlToTextTagHandler implements Html.TagHandler {
|
||||
// List of tags whose content should be ignored.
|
||||
private static final Set<String> TAGS_WITH_IGNORED_CONTENT;
|
||||
static {
|
||||
Set<String> set = new HashSet<>();
|
||||
set.add("style");
|
||||
set.add("script");
|
||||
set.add("title");
|
||||
set.add("!"); // comments
|
||||
TAGS_WITH_IGNORED_CONTENT = Collections.unmodifiableSet(set);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handleTag(boolean opening, String tag, Editable output, XMLReader xmlReader) {
|
||||
tag = tag.toLowerCase(Locale.US);
|
||||
if (tag.equals("hr") && opening) {
|
||||
// In the case of an <hr>, replace it with a bunch of underscores. This is roughly
|
||||
// the behaviour of Outlook in Rich Text mode.
|
||||
output.append("_____________________________________________\r\n");
|
||||
} else if (TAGS_WITH_IGNORED_CONTENT.contains(tag)) {
|
||||
handleIgnoredTag(opening, output);
|
||||
}
|
||||
}
|
||||
|
||||
private static final String IGNORED_ANNOTATION_KEY = "K9_ANNOTATION";
|
||||
private static final String IGNORED_ANNOTATION_VALUE = "hiddenSpan";
|
||||
|
||||
/**
|
||||
* When we come upon an ignored tag, we mark it with an Annotation object with a specific key
|
||||
* and value as above. We don't really need to be checking these values since Html.fromHtml()
|
||||
* doesn't use Annotation spans, but we should do it now to be safe in case they do start using
|
||||
* it in the future.
|
||||
* @param opening If this is an opening tag or not.
|
||||
* @param output Spannable string that we're working with.
|
||||
*/
|
||||
private void handleIgnoredTag(boolean opening, Editable output) {
|
||||
int len = output.length();
|
||||
if (opening) {
|
||||
output.setSpan(new Annotation(IGNORED_ANNOTATION_KEY, IGNORED_ANNOTATION_VALUE), len,
|
||||
len, Spannable.SPAN_MARK_MARK);
|
||||
} else {
|
||||
Object start = getOpeningAnnotation(output);
|
||||
if (start != null) {
|
||||
int where = output.getSpanStart(start);
|
||||
// Remove the temporary Annotation span.
|
||||
output.removeSpan(start);
|
||||
// Delete everything between the start of the Annotation and the end of the string
|
||||
// (what we've generated so far).
|
||||
output.delete(where, len);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch the matching opening Annotation object and verify that it's the one added by K9.
|
||||
* @param output Spannable string we're working with.
|
||||
* @return Starting Annotation object.
|
||||
*/
|
||||
private Object getOpeningAnnotation(Editable output) {
|
||||
Object[] objs = output.getSpans(0, output.length(), Annotation.class);
|
||||
for (int i = objs.length - 1; i >= 0; i--) {
|
||||
Annotation span = (Annotation) objs[i];
|
||||
if (output.getSpanFlags(objs[i]) == Spannable.SPAN_MARK_MARK
|
||||
&& span.getKey().equals(IGNORED_ANNOTATION_KEY)
|
||||
&& span.getValue().equals(IGNORED_ANNOTATION_VALUE)) {
|
||||
return objs[i];
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a text string into an HTML document.
|
||||
*
|
||||
|
|
|
@ -0,0 +1,124 @@
|
|||
package com.fsck.k9.message.html
|
||||
|
||||
import org.jsoup.nodes.Element
|
||||
import org.jsoup.nodes.Node
|
||||
import org.jsoup.nodes.TextNode
|
||||
import org.jsoup.select.NodeTraversor
|
||||
import org.jsoup.select.NodeVisitor
|
||||
|
||||
/**
 * Converts an HTML element to plain text.
 *
 * Based on Jsoup's HtmlToPlainText example.
 */
object HtmlToPlainText {
    /**
     * Traverses [element] with a fresh [FormattingVisitor] and returns the visitor's string output.
     */
    @JvmStatic
    fun toPlainText(element: Element): String =
        FormattingVisitor()
            .also { visitor -> NodeTraversor.traverse(visitor, element) }
            .toString()
}
|
||||
|
||||
/**
 * [NodeVisitor] that accumulates a plain-text rendering of the nodes it visits.
 *
 * The text of text nodes is appended, list items are prefixed with "* ", block elements start on a
 * new line and are followed by an empty line when they contain text, and links whose `href`
 * resolves to a non-empty absolute URL get that `href` appended in angle brackets. Text that would
 * make a line longer than [MAX_WIDTH] characters is wrapped at whitespace.
 *
 * Call [toString] after the traversal to obtain the result.
 */
private class FormattingVisitor : NodeVisitor {
    /** Number of characters written to the current (last) line of [output]. */
    private var width = 0
    private val output = StringBuilder()

    override fun head(node: Node, depth: Int) {
        val name = node.nodeName()
        when {
            node is TextNode -> append(node.text())
            name == "li" -> {
                startNewLine()
                append("* ")
            }
            node is Element && node.isBlock -> startNewLine()
        }
    }

    override fun tail(node: Node, depth: Int) {
        val name = node.nodeName()
        when {
            name == "li" -> append("\n")
            node is Element && node.isBlock -> {
                if (node.hasText()) {
                    addEmptyLine()
                }
            }
            name == "a" -> {
                // Only links that resolve to an absolute URL are rendered; the attribute value
                // itself is what gets written out.
                if (node.absUrl("href").isNotEmpty()) {
                    append(" <${node.attr("href")}>")
                }
            }
        }
    }

    /**
     * Appends [text] to the output, wrapping at whitespace when the current line would exceed
     * [MAX_WIDTH], and keeps [width] in sync with the length of the last output line.
     */
    private fun append(text: String) {
        if (text.startsWith("\n")) {
            // A leading line break starts a fresh line, so the fit check below starts from zero.
            width = 0
        }

        // Don't accumulate runs of spaces and don't begin a line with a lone space.
        if (text == " " && (output.isEmpty() || output.last() == ' ' || output.last() == '\n')) {
            return
        }

        if (text.length + width > MAX_WIDTH) {
            val words = text.split(WHITESPACE)
            for ((index, rawWord) in words.withIndex()) {
                // Re-insert the separating space after every word but the last one.
                val word = if (index == words.lastIndex) rawWord else "$rawWord "

                // Only break the line if there is something on it; a word that is too long on its
                // own would otherwise be preceded by a spurious empty line.
                if (width > 0 && word.length + width > MAX_WIDTH) {
                    output.append("\n").append(word)
                    width = word.length
                } else {
                    output.append(word)
                    width += word.length
                }
            }
        } else {
            output.append(text)

            // Characters up to and including the last line break do not belong to the current
            // line and must not be counted towards its width.
            val lastLineBreak = text.lastIndexOf('\n')
            width = if (lastLineBreak >= 0) text.length - lastLineBreak - 1 else width + text.length
        }
    }

    /** Ends the current line unless the output is empty or already positioned at a line start. */
    private fun startNewLine() {
        if (output.isEmpty() || output.last() == '\n') {
            return
        }

        append("\n")
    }

    /** Makes sure non-empty output ends with an empty line (two consecutive line breaks). */
    private fun addEmptyLine() {
        if (output.isEmpty() || output.endsWith("\n\n")) {
            return
        }

        startNewLine()
        append("\n")
    }

    /** Returns the accumulated text without trailing line breaks. */
    override fun toString(): String = output.toString().trimEnd('\n')

    companion object {
        private const val MAX_WIDTH = 76

        /** Word separator used when wrapping; compiled once instead of on every call. */
        private val WHITESPACE = Regex("\\s+")
    }
}
|
Loading…
Reference in a new issue