Merge pull request #3639 from k9mail/html_to_text

Use Jsoup to convert HTML to plain text
2018-10-18 20:51:23 +02:00 · 2018-10-18 20:51:23 +02:00 · 7b105d7c78
commit 7b105d7c78
parent 300076c5dd b1cfa302ba
2 changed files with 128 additions and 85 deletions
--- a/app/core/src/main/java/com/fsck/k9/message/html/HtmlConverter.java
+++ b/app/core/src/main/java/com/fsck/k9/message/html/HtmlConverter.java
@ -1,19 +1,14 @@
 package com.fsck.k9.message.html;


-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-
-import android.text.Annotation;
 import android.text.Editable;
 import android.text.Html;
 import android.text.Html.TagHandler;
-import android.text.Spannable;
 import android.text.Spanned;

 import com.fsck.k9.K9;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
 import org.xml.sax.XMLReader;

 /**
@ -42,88 +37,12 @@ public class HtmlConverter {
     * @return Plain text result.
     */
    public static String htmlToText(final String html) {
-        return Html.fromHtml(html, null, new HtmlToTextTagHandler()).toString()
+        Document document = Jsoup.parse(html);
+        return HtmlToPlainText.toPlainText(document.body())
               .replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT)
               .replace(NBSP_CHARACTER, NBSP_REPLACEMENT);
    }

-    /**
-     * Custom tag handler to use when converting HTML messages to text. It currently handles text
-     * representations of HTML tags that Android's built-in parser doesn't understand and hides code
-     * contained in STYLE and SCRIPT blocks.
-     */
-    private static class HtmlToTextTagHandler implements Html.TagHandler {
-        // List of tags whose content should be ignored.
-        private static final Set<String> TAGS_WITH_IGNORED_CONTENT;
-        static {
-            Set<String> set = new HashSet<>();
-            set.add("style");
-            set.add("script");
-            set.add("title");
-            set.add("!");   // comments
-            TAGS_WITH_IGNORED_CONTENT = Collections.unmodifiableSet(set);
-        }
-
-        @Override
-        public void handleTag(boolean opening, String tag, Editable output, XMLReader xmlReader) {
-            tag = tag.toLowerCase(Locale.US);
-            if (tag.equals("hr") && opening) {
-                // In the case of an <hr>, replace it with a bunch of underscores. This is roughly
-                // the behaviour of Outlook in Rich Text mode.
-                output.append("_____________________________________________\r\n");
-            } else if (TAGS_WITH_IGNORED_CONTENT.contains(tag)) {
-                handleIgnoredTag(opening, output);
-            }
-        }
-
-        private static final String IGNORED_ANNOTATION_KEY = "K9_ANNOTATION";
-        private static final String IGNORED_ANNOTATION_VALUE = "hiddenSpan";
-
-        /**
-         * When we come upon an ignored tag, we mark it with an Annotation object with a specific key
-         * and value as above. We don't really need to be checking these values since Html.fromHtml()
-         * doesn't use Annotation spans, but we should do it now to be safe in case they do start using
-         * it in the future.
-         * @param opening If this is an opening tag or not.
-         * @param output Spannable string that we're working with.
-         */
-        private void handleIgnoredTag(boolean opening, Editable output) {
-            int len = output.length();
-            if (opening) {
-                output.setSpan(new Annotation(IGNORED_ANNOTATION_KEY, IGNORED_ANNOTATION_VALUE), len,
-                               len, Spannable.SPAN_MARK_MARK);
-            } else {
-                Object start = getOpeningAnnotation(output);
-                if (start != null) {
-                    int where = output.getSpanStart(start);
-                    // Remove the temporary Annotation span.
-                    output.removeSpan(start);
-                    // Delete everything between the start of the Annotation and the end of the string
-                    // (what we've generated so far).
-                    output.delete(where, len);
-                }
-            }
-        }
-
-        /**
-         * Fetch the matching opening Annotation object and verify that it's the one added by K9.
-         * @param output Spannable string we're working with.
-         * @return Starting Annotation object.
-         */
-        private Object getOpeningAnnotation(Editable output) {
-            Object[] objs = output.getSpans(0, output.length(), Annotation.class);
-            for (int i = objs.length - 1; i >= 0; i--) {
-                Annotation span = (Annotation) objs[i];
-                if (output.getSpanFlags(objs[i]) == Spannable.SPAN_MARK_MARK
-                        && span.getKey().equals(IGNORED_ANNOTATION_KEY)
-                        && span.getValue().equals(IGNORED_ANNOTATION_VALUE)) {
-                    return objs[i];
-                }
-            }
-            return null;
-        }
-    }
-
    /**
     * Convert a text string into an HTML document.
     *
--- a/app/core/src/main/java/com/fsck/k9/message/html/HtmlToPlainText.kt
+++ b/app/core/src/main/java/com/fsck/k9/message/html/HtmlToPlainText.kt
@ -0,0 +1,124 @@
+package com.fsck.k9.message.html
+
+import org.jsoup.nodes.Element
+import org.jsoup.nodes.Node
+import org.jsoup.nodes.TextNode
+import org.jsoup.select.NodeTraversor
+import org.jsoup.select.NodeVisitor
+
+/**
+ * Renders a Jsoup [Element] tree as plain text.
+ *
+ * Adapted from the HtmlToPlainText example that ships with Jsoup.
+ */
+object HtmlToPlainText {
+    /**
+     * Traverses [element] and all of its descendants with a fresh [FormattingVisitor] and returns the text the
+     * visitor collected.
+     */
+    @JvmStatic
+    fun toPlainText(element: Element): String =
+        FormattingVisitor()
+            .also { visitor -> NodeTraversor.traverse(visitor, element) }
+            .toString()
+}
+
+/**
+ * [NodeVisitor] that accumulates a plain text rendering of the nodes it is shown.
+ *
+ * Text nodes are copied to the output, list items are prefixed with "* ", block elements are placed on their own
+ * lines, and a link is followed by its `href` in angle brackets. Text is wrapped at word boundaries once a line
+ * would exceed [MAX_WIDTH] characters.
+ */
+private class FormattingVisitor : NodeVisitor {
+    /** Number of characters already written to the current (last) line of [output]. */
+    private var width = 0
+    private val output = StringBuilder()
+
+    /** Invoked when a node is entered: emits text and starts a new line for list items and block elements. */
+    override fun head(node: Node, depth: Int) {
+        val name = node.nodeName()
+        when {
+            node is TextNode -> append(node.text())
+            name == "li" -> {
+                startNewLine()
+                append("* ")
+            }
+            node is Element && node.isBlock -> startNewLine()
+        }
+    }
+
+    /** Invoked when a node is left: ends list items, separates non-empty blocks and appends link targets. */
+    override fun tail(node: Node, depth: Int) {
+        val name = node.nodeName()
+        when {
+            name == "li" -> append("\n")
+            node is Element && node.isBlock -> {
+                // Blocks without any text must not add vertical space.
+                if (node.hasText()) {
+                    addEmptyLine()
+                }
+            }
+            name == "a" -> {
+                // absUrl() only decides whether the link is listed; the attribute value is written unchanged.
+                if (node.absUrl("href").isNotEmpty()) {
+                    append(" <${node.attr("href")}>")
+                }
+            }
+        }
+    }
+
+    /**
+     * Appends [text] to the output, wrapping at whitespace when it does not fit on the current line.
+     *
+     * A lone space is dropped when the output is empty or already ends with a space or a line break.
+     */
+    private fun append(text: String) {
+        if (text.startsWith("\n")) {
+            width = 0
+        }
+
+        if (text == " " && (output.isEmpty() || output.last() == ' ' || output.last() == '\n')) {
+            return
+        }
+
+        if (text.length + width <= MAX_WIDTH) {
+            output.append(text)
+            // Only the characters after the last line break belong to the current line. Counting the line break
+            // itself would make every line following it wrap one character too early.
+            val lastLineBreak = text.lastIndexOf('\n')
+            width = if (lastLineBreak == -1) width + text.length else text.length - lastLineBreak - 1
+            return
+        }
+
+        val words = text.split(WHITESPACE)
+        words.forEachIndexed { index, rawWord ->
+            // Every word except the last one gets its separating space back.
+            val word = if (index == words.lastIndex) rawWord else "$rawWord "
+
+            // Never wrap at the start of a line: a word longer than MAX_WIDTH would only leave an empty line
+            // (or a leading line break) behind.
+            if (width > 0 && word.length + width > MAX_WIDTH) {
+                output.append("\n").append(word)
+                width = word.length
+            } else {
+                output.append(word)
+                width += word.length
+            }
+        }
+    }
+
+    /** Ends the current line unless the output is empty or already positioned at the start of a line. */
+    private fun startNewLine() {
+        if (output.isEmpty() || output.last() == '\n') {
+            return
+        }
+
+        append("\n")
+    }
+
+    /** Makes sure non-empty output ends with an empty line, i.e. two consecutive line breaks. */
+    private fun addEmptyLine() {
+        if (output.isEmpty() || output.endsWith("\n\n")) {
+            return
+        }
+
+        startNewLine()
+        append("\n")
+    }
+
+    /** Returns the collected text without trailing line breaks. */
+    override fun toString(): String = output.trimEnd('\n').toString()
+
+    companion object {
+        /** Maximum number of characters per output line before text is wrapped. */
+        private const val MAX_WIDTH = 76
+
+        /** Compiled once; splits text into words when a line has to be wrapped. */
+        private val WHITESPACE = Regex("\\s+")
+    }
+}