Merge pull request #4967 from k9mail/improve_preview_extraction
Improve preview text extraction
This commit is contained in:
commit
8dd8881ab5
4 changed files with 326 additions and 224 deletions
|
@ -1,69 +0,0 @@
|
|||
package com.fsck.k9.message.extractors;
|
||||
|
||||
|
||||
import androidx.annotation.NonNull;
|
||||
|
||||
import com.fsck.k9.message.html.HtmlConverter;
|
||||
import com.fsck.k9.mail.Part;
|
||||
import com.fsck.k9.mail.internet.MessageExtractor;
|
||||
|
||||
import static com.fsck.k9.mail.internet.MimeUtility.isSameMimeType;
|
||||
|
||||
|
||||
class PreviewTextExtractor {
|
||||
private static final int MAX_PREVIEW_LENGTH = 512;
|
||||
private static final int MAX_CHARACTERS_CHECKED_FOR_PREVIEW = 8192;
|
||||
|
||||
|
||||
@NonNull
|
||||
public String extractPreview(@NonNull Part textPart) throws PreviewExtractionException {
|
||||
String text = MessageExtractor.getTextFromPart(textPart, MAX_CHARACTERS_CHECKED_FOR_PREVIEW);
|
||||
if (text == null) {
|
||||
throw new PreviewExtractionException("Couldn't get text from part");
|
||||
}
|
||||
|
||||
String plainText = convertFromHtmlIfNecessary(textPart, text);
|
||||
|
||||
return stripTextForPreview(plainText);
|
||||
}
|
||||
|
||||
private String convertFromHtmlIfNecessary(Part textPart, String text) {
|
||||
String mimeType = textPart.getMimeType();
|
||||
if (!isSameMimeType(mimeType, "text/html")) {
|
||||
return text;
|
||||
}
|
||||
|
||||
return HtmlConverter.htmlToText(text);
|
||||
}
|
||||
|
||||
private String stripTextForPreview(String text) {
|
||||
if (text == null) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Remove (correctly delimited by '-- \n') signatures
|
||||
text = text.replaceAll("(?ms)^-- [\\r\\n]+.*", "");
|
||||
// try to remove lines of dashes in the preview
|
||||
text = text.replaceAll("(?m)^----.*?$", "");
|
||||
// remove quoted text from the preview
|
||||
text = text.replaceAll("(?m)^[#>].*$", "");
|
||||
// Remove a common quote header from the preview
|
||||
text = text.replaceAll("(?m)^On .*wrote.?$", "");
|
||||
// Remove a more generic quote header from the preview
|
||||
text = text.replaceAll("(?m)^.*\\w+:$", "");
|
||||
// Remove horizontal rules.
|
||||
text = text.replaceAll("\\s*([-=_]{30,}+)\\s*", " ");
|
||||
|
||||
// URLs in the preview should just be shown as "..." - They're not
|
||||
// clickable and they usually overwhelm the preview
|
||||
text = text.replaceAll("https?://\\S+", "...");
|
||||
// Don't show newlines in the preview
|
||||
text = text.replaceAll("(\\r|\\n)+", " ");
|
||||
// Collapse whitespace in the preview
|
||||
text = text.replaceAll("\\s+", " ");
|
||||
// Remove any whitespace at the beginning and end of the string.
|
||||
text = text.trim();
|
||||
|
||||
return (text.length() > MAX_PREVIEW_LENGTH) ? text.substring(0, MAX_PREVIEW_LENGTH - 1) + "…" : text;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,126 @@
|
|||
package com.fsck.k9.message.extractors
|
||||
|
||||
import com.fsck.k9.mail.Part
|
||||
import com.fsck.k9.mail.internet.MessageExtractor
|
||||
import com.fsck.k9.mail.internet.MimeUtility.isSameMimeType
|
||||
import com.fsck.k9.message.html.EmailSection
|
||||
import com.fsck.k9.message.html.EmailSectionExtractor
|
||||
import com.fsck.k9.message.html.HtmlConverter
|
||||
|
||||
/**
 * Produces a short, single-line preview from the text of a message part.
 *
 * HTML parts are converted to plain text first. The signature, quoted sections and quote headers are
 * then dropped, dash lines, horizontal rules and URLs are removed or replaced, whitespace is collapsed,
 * and the result is limited to [MAX_PREVIEW_LENGTH] characters.
 */
internal class PreviewTextExtractor {
    /**
     * Extracts the preview text for [textPart].
     *
     * @return the cleaned-up preview text
     * @throws PreviewExtractionException if no text could be read from the part
     */
    @Throws(PreviewExtractionException::class)
    fun extractPreview(textPart: Part): String {
        val text = MessageExtractor.getTextFromPart(textPart, MAX_CHARACTERS_CHECKED_FOR_PREVIEW)
            ?: throw PreviewExtractionException("Couldn't get text from part")

        val plainText = convertFromHtmlIfNecessary(textPart, text)
        return stripTextForPreview(plainText)
    }

    /** Returns [text] converted to plain text when the part is `text/html`, otherwise unchanged. */
    private fun convertFromHtmlIfNecessary(textPart: Part, text: String): String {
        return if (isSameMimeType(textPart.mimeType, "text/html")) {
            HtmlConverter.htmlToText(text)
        } else {
            text
        }
    }

    /** Runs the clean-up steps in order and truncates the result to the maximum preview length. */
    private fun stripTextForPreview(text: String): String {
        val unquotedText = extractUnquotedText(stripSignature(normalizeLineBreaks(text)))

        val previewText = unquotedText
            // Try to remove lines of dashes in the preview
            .replace(REGEX_DASH_LINE, "")
            // Remove horizontal rules
            .replace(REGEX_HORIZONTAL_RULE, " ")
            // URLs are not clickable in the preview and usually overwhelm it, so show "..." instead
            .replace(REGEX_URL, "...")
            // Don't show newlines in the preview
            .replace('\n', ' ')
            // Collapse whitespace in the preview
            .replace(REGEX_WHITESPACE, " ")
            // Remove any whitespace at the beginning and end of the string
            .trim()

        return truncate(previewText)
    }

    /**
     * Limits [text] to [MAX_PREVIEW_LENGTH] characters, ending a shortened text with an ellipsis.
     *
     * The cut is moved back by one character when it would otherwise separate a surrogate pair, so the
     * result never ends in a lone high surrogate followed by the ellipsis.
     */
    private fun truncate(text: String): String {
        if (text.length <= MAX_PREVIEW_LENGTH) return text

        // Reserve the last position for the ellipsis
        var endIndex = MAX_PREVIEW_LENGTH - 1
        if (text[endIndex - 1].isHighSurrogate()) {
            endIndex--
        }

        return text.substring(0, endIndex) + "…"
    }

    /** Converts CRLF and lone CR line breaks to LF. */
    private fun normalizeLineBreaks(text: String) = text.replace(REGEX_CRLF, "\n")

    /** Removes everything from the first `-- ` signature delimiter line onwards. Expects LF line breaks. */
    private fun stripSignature(text: String): String {
        return if (text.startsWith("-- \n")) {
            ""
        } else {
            text.substringBefore("\n-- \n")
        }
    }

    /**
     * Returns the sections of [text] with quote depth 0, joined by `" […] "`.
     *
     * When the first section is unquoted, a quote header at its end is removed. If that section consists
     * of nothing but a quote header, the whole section is dropped.
     */
    private fun extractUnquotedText(text: String): String {
        val emailSections = EmailSectionExtractor.extract(text)
        if (emailSections.isEmpty()) {
            return ""
        }

        val firstEmailSection = emailSections.first()
        val replySections = if (firstEmailSection.quoteDepth == 0) {
            val replyEmailSections = emailSections.drop(1).filter { it.quoteDepth == 0 && it.isNotBlank() }
            if (firstEmailSection.isQuoteHeaderOnly()) {
                replyEmailSections
            } else {
                val firstSectionTextWithoutQuoteHeader = stripQuoteHeader(firstEmailSection)
                listOf(firstSectionTextWithoutQuoteHeader) + replyEmailSections
            }
        } else {
            emailSections.filter { it.quoteDepth == 0 && it.isNotBlank() }
        }

        return replySections.joinToString(separator = " […] ")
    }

    /** Returns the text of [emailSection] without its trailing quote header, if it has one. */
    private fun stripQuoteHeader(emailSection: EmailSection): String {
        val quoteHeaderIndex = emailSection.quoteHeaderIndex
        if (quoteHeaderIndex == -1) return emailSection.toString()

        return emailSection.substring(startIndex = 0, endIndex = quoteHeaderIndex)
    }

    /** Whether the section consists of a quote header and nothing else. */
    private fun EmailSection.isQuoteHeaderOnly(): Boolean {
        return quoteHeaderIndex == 0
    }

    /**
     * Index at which a trailing quote header starts.
     *
     * A section is treated as ending in a quote header when its last character, ignoring trailing line
     * breaks, is a colon. The header reaches back to the closest preceding run of two or more line breaks
     * (the returned index points at the start of that run), or to the start of the section when there is
     * no such run.
     *
     * @return `-1` if the section does not end in a quote header, `0` if the header covers the whole
     * section, otherwise the index where the header begins
     */
    private val EmailSection.quoteHeaderIndex: Int
        get() {
            // An empty section has lastIndex == -1; indexing it below would throw
            if (isEmpty()) return -1

            // Ignore line breaks at the end of the section
            var quoteHeaderIndex = lastIndex
            while (quoteHeaderIndex > 0 && this[quoteHeaderIndex] == '\n') {
                quoteHeaderIndex--
            }

            if (this[quoteHeaderIndex] != ':') return -1

            // Walk backwards until text is found in front of two or more consecutive line breaks
            var newlineCount = 0
            while (quoteHeaderIndex > 0) {
                when {
                    this[quoteHeaderIndex] == '\n' -> newlineCount++
                    newlineCount > 1 -> return quoteHeaderIndex + 1
                    else -> newlineCount = 0
                }
                quoteHeaderIndex--
            }

            return 0
        }

    companion object {
        private const val MAX_PREVIEW_LENGTH = 512
        private const val MAX_CHARACTERS_CHECKED_FOR_PREVIEW = 8192L

        // Compiled once instead of on every extractPreview() call
        private val REGEX_CRLF = "(\\r\\n|\\r)".toRegex()
        private val REGEX_DASH_LINE = "(?m)^----.*?$".toRegex()
        private val REGEX_HORIZONTAL_RULE = "\\s*([-=_]{30,}+)\\s*".toRegex()
        private val REGEX_URL = "https?://\\S+".toRegex()
        private val REGEX_WHITESPACE = "\\s+".toRegex()
    }
}
|
|
@ -1,155 +0,0 @@
|
|||
package com.fsck.k9.message.extractors;
|
||||
|
||||
|
||||
import com.fsck.k9.RobolectricTest;
|
||||
import com.fsck.k9.mail.Part;
|
||||
import com.fsck.k9.mail.internet.MimeBodyPart;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import static com.fsck.k9.message.MessageCreationHelper.createTextPart;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
|
||||
public class PreviewTextExtractorTest extends RobolectricTest {
|
||||
private PreviewTextExtractor previewTextExtractor;
|
||||
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
previewTextExtractor = new PreviewTextExtractor();
|
||||
}
|
||||
|
||||
@Test(expected = PreviewExtractionException.class)
|
||||
public void extractPreview_withEmptyBody_shouldThrow() throws Exception {
|
||||
Part part = new MimeBodyPart(null, "text/plain");
|
||||
|
||||
previewTextExtractor.extractPreview(part);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractPreview_withSimpleTextPlain() throws Exception {
|
||||
String text = "The quick brown fox jumps over the lazy dog";
|
||||
Part part = createTextPart("text/plain", text);
|
||||
|
||||
String preview = previewTextExtractor.extractPreview(part);
|
||||
|
||||
assertEquals(text, preview);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractPreview_withSimpleTextHtml() throws Exception {
|
||||
String text = "<b>The quick brown fox jumps over the lazy dog</b>";
|
||||
Part part = createTextPart("text/html", text);
|
||||
|
||||
String preview = previewTextExtractor.extractPreview(part);
|
||||
|
||||
assertEquals("The quick brown fox jumps over the lazy dog", preview);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractPreview_withLongTextPlain() throws Exception {
|
||||
String text = "" +
|
||||
"10--------20--------30--------40--------50--------" +
|
||||
"60--------70--------80--------90--------100-------" +
|
||||
"110-------120-------130-------140-------150-------" +
|
||||
"160-------170-------180-------190-------200-------" +
|
||||
"210-------220-------230-------240-------250-------" +
|
||||
"260-------270-------280-------290-------300-------" +
|
||||
"310-------320-------330-------340-------350-------" +
|
||||
"360-------370-------380-------390-------400-------" +
|
||||
"410-------420-------430-------440-------450-------" +
|
||||
"460-------470-------480-------490-------500-------" +
|
||||
"510-------520-------";
|
||||
Part part = createTextPart("text/plain", text);
|
||||
|
||||
String preview = previewTextExtractor.extractPreview(part);
|
||||
|
||||
assertEquals(text.substring(0, 511) + "…", preview);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractPreview_shouldStripSignature() throws Exception {
|
||||
String text = "" +
|
||||
"Some text\r\n" +
|
||||
"-- \r\n" +
|
||||
"Signature";
|
||||
Part part = createTextPart("text/plain", text);
|
||||
|
||||
String preview = previewTextExtractor.extractPreview(part);
|
||||
|
||||
assertEquals("Some text", preview);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractPreview_shouldStripHorizontalLine() throws Exception {
|
||||
String text = "" +
|
||||
"line 1\r\n" +
|
||||
"----\r\n" +
|
||||
"line 2";
|
||||
Part part = createTextPart("text/plain", text);
|
||||
|
||||
String preview = previewTextExtractor.extractPreview(part);
|
||||
|
||||
assertEquals("line 1 line 2", preview);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractPreview_shouldStripQuoteHeaderAndQuotedText() throws Exception {
|
||||
String text = "" +
|
||||
"some text\r\n" +
|
||||
"On 01/02/03 someone wrote\r\n" +
|
||||
"> some quoted text\r\n" +
|
||||
"# some other quoted text\r\n";
|
||||
Part part = createTextPart("text/plain", text);
|
||||
|
||||
String preview = previewTextExtractor.extractPreview(part);
|
||||
|
||||
assertEquals("some text", preview);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractPreview_shouldStripGenericQuoteHeader() throws Exception {
|
||||
String text = "" +
|
||||
"Am 13.12.2015 um 23:42 schrieb Hans:\r\n" +
|
||||
"> hallo\r\n" +
|
||||
"hi there\r\n";
|
||||
Part part = createTextPart("text/plain", text);
|
||||
|
||||
String preview = previewTextExtractor.extractPreview(part);
|
||||
|
||||
assertEquals("hi there", preview);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractPreview_shouldStripHorizontalRules() throws Exception {
|
||||
String text = "line 1" +
|
||||
"------------------------------\r\n" +
|
||||
"line 2";
|
||||
Part part = createTextPart("text/plain", text);
|
||||
|
||||
String preview = previewTextExtractor.extractPreview(part);
|
||||
|
||||
assertEquals("line 1 line 2", preview);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractPreview_shouldReplaceUrl() throws Exception {
|
||||
String text = "some url: https://k9mail.org/";
|
||||
Part part = createTextPart("text/plain", text);
|
||||
|
||||
String preview = previewTextExtractor.extractPreview(part);
|
||||
|
||||
assertEquals("some url: ...", preview);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractPreview_shouldCollapseAndTrimWhitespace() throws Exception {
|
||||
String text = " whitespace is\t\tfun ";
|
||||
Part part = createTextPart("text/plain", text);
|
||||
|
||||
String preview = previewTextExtractor.extractPreview(part);
|
||||
|
||||
assertEquals("whitespace is fun", preview);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,200 @@
|
|||
package com.fsck.k9.message.extractors
|
||||
|
||||
import com.fsck.k9.mail.internet.MimeBodyPart
|
||||
import com.fsck.k9.message.MessageCreationHelper
|
||||
import com.google.common.truth.Truth.assertThat
|
||||
import org.junit.Test
|
||||
|
||||
/**
 * Tests for [PreviewTextExtractor]: each case feeds a text part to the extractor and checks the
 * resulting preview string.
 */
class PreviewTextExtractorTest {
    private val previewTextExtractor = PreviewTextExtractor()

    @Test(expected = PreviewExtractionException::class)
    fun extractPreview_withEmptyBody_shouldThrow() {
        val part = MimeBodyPart(null, "text/plain")

        previewTextExtractor.extractPreview(part)
    }

    @Test
    fun extractPreview_withSimpleTextPlain() {
        val text = "The quick brown fox jumps over the lazy dog"

        val preview = extractPreview("text/plain", text)

        assertThat(preview).isEqualTo(text)
    }

    @Test
    fun extractPreview_withSimpleTextHtml() {
        val text = "<b>The quick brown fox jumps over the lazy dog</b>"

        val preview = extractPreview("text/html", text)

        assertThat(preview).isEqualTo("The quick brown fox jumps over the lazy dog")
    }

    @Test
    fun extractPreview_withLongTextPlain() {
        val text = "" +
            "10--------20--------30--------40--------50--------" +
            "60--------70--------80--------90--------100-------" +
            "110-------120-------130-------140-------150-------" +
            "160-------170-------180-------190-------200-------" +
            "210-------220-------230-------240-------250-------" +
            "260-------270-------280-------290-------300-------" +
            "310-------320-------330-------340-------350-------" +
            "360-------370-------380-------390-------400-------" +
            "410-------420-------430-------440-------450-------" +
            "460-------470-------480-------490-------500-------" +
            "510-------520-------"

        val preview = extractPreview("text/plain", text)

        // 511 characters of text plus the ellipsis
        assertThat(preview).isEqualTo(text.substring(0, 511) + "…")
    }

    @Test
    fun extractPreview_shouldStripSignature() {
        // The delimiter is "-- " including the trailing space. It is written with escapes because
        // trailing whitespace inside a raw string is invisible and easily stripped by tooling.
        val text = "Some text\n-- \nSignature"

        val preview = extractPreview("text/plain", text)

        assertThat(preview).isEqualTo("Some text")
    }

    @Test
    fun extractPreview_shouldStripHorizontalLine() {
        val text = """
            line 1
            ----
            line 2
        """.trimIndent()

        val preview = extractPreview("text/plain", text)

        assertThat(preview).isEqualTo("line 1 line 2")
    }

    @Test
    fun extractPreview_shouldStripQuoteHeaderAndQuotedText() {
        val text = """
            some text

            On 01/02/03 someone wrote:
            > some quoted text
            > some other quoted text
        """.trimIndent()

        val preview = extractPreview("text/plain", text)

        assertThat(preview).isEqualTo("some text")
    }

    @Test
    fun extractPreview_shouldStripGenericQuoteHeader() {
        val text = """
            Am 13.12.2015 um 23:42 schrieb Hans:
            > hallo
            hi there

        """.trimIndent()

        val preview = extractPreview("text/plain", text)

        assertThat(preview).isEqualTo("hi there")
    }

    @Test
    fun extractPreview_shouldStripHorizontalRules() {
        val text = """
            line 1------------------------------
            line 2
        """.trimIndent()

        val preview = extractPreview("text/plain", text)

        assertThat(preview).isEqualTo("line 1 line 2")
    }

    @Test
    fun extractPreview_shouldReplaceUrl() {
        val text = "some url: https://k9mail.org/"

        val preview = extractPreview("text/plain", text)

        assertThat(preview).isEqualTo("some url: ...")
    }

    @Test
    fun extractPreview_shouldCollapseAndTrimWhitespace() {
        val text = " whitespace is\t\tfun "

        val preview = extractPreview("text/plain", text)

        assertThat(preview).isEqualTo("whitespace is fun")
    }

    @Test
    fun extractPreview_lineEndingWithColon() {
        val text = """
            Here's a list:
            - item 1
            - item 2
        """.trimIndent()

        val preview = extractPreview("text/plain", text)

        assertThat(preview).isEqualTo("Here's a list: - item 1 - item 2")
    }

    @Test
    fun extractPreview_inlineReplies() {
        val text = """
            On 2020-09-30 at 03:12 Bob wrote:
            > Hi Alice
            Hi Bob

            > How are you?
            I'm fine. Thanks for asking.

            > Bye
            See you tomorrow
        """.trimIndent()

        val preview = extractPreview("text/plain", text)

        assertThat(preview).isEqualTo("Hi Bob […] I'm fine. Thanks for asking. […] See you tomorrow")
    }

    @Test
    fun extractPreview_quoteHeaderContainingLineBreak() {
        val text = """
            Reply text

            On 2020-09-30 at 03:12
            Bob wrote:
            > Quoted text
        """.trimIndent()

        val preview = extractPreview("text/plain", text)

        assertThat(preview).isEqualTo("Reply text")
    }

    /** Wraps [text] in a text part of the given MIME type and runs the extractor on it. */
    private fun extractPreview(mimeType: String, text: String): String {
        val part = MessageCreationHelper.createTextPart(mimeType, text)
        return previewTextExtractor.extractPreview(part)
    }
}
|
Loading…
Reference in a new issue