Merge pull request #4967 from k9mail/improve_preview_extraction

Improve preview text extraction
This commit is contained in:
cketti 2020-10-03 03:41:52 +02:00 committed by GitHub
commit 8dd8881ab5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 326 additions and 224 deletions

View file

@ -1,69 +0,0 @@
package com.fsck.k9.message.extractors;
import androidx.annotation.NonNull;
import com.fsck.k9.message.html.HtmlConverter;
import com.fsck.k9.mail.Part;
import com.fsck.k9.mail.internet.MessageExtractor;
import static com.fsck.k9.mail.internet.MimeUtility.isSameMimeType;
/**
 * Derives a short, single-line preview string from the text of a message part.
 *
 * HTML parts are converted to plain text first; the result is then cleaned up with a series of
 * regular-expression replacements and cut to a maximum length.
 */
class PreviewTextExtractor {
    /** Previews longer than this many characters are shortened. */
    private static final int MAX_PREVIEW_LENGTH = 512;
    /** Limit handed to {@link MessageExtractor#getTextFromPart} when reading the part's text. */
    private static final int MAX_CHARACTERS_CHECKED_FOR_PREVIEW = 8192;

    /**
     * Extracts the preview text for the given text part.
     *
     * @param textPart the part to read the text from
     * @return the cleaned-up preview text
     * @throws PreviewExtractionException if no text could be read from the part
     */
    @NonNull
    public String extractPreview(@NonNull Part textPart) throws PreviewExtractionException {
        String rawText = MessageExtractor.getTextFromPart(textPart, MAX_CHARACTERS_CHECKED_FOR_PREVIEW);
        if (rawText == null) {
            throw new PreviewExtractionException("Couldn't get text from part");
        }

        return stripTextForPreview(convertFromHtmlIfNecessary(textPart, rawText));
    }

    /** Returns {@code text} unchanged unless the part is "text/html", in which case it is converted to text. */
    private String convertFromHtmlIfNecessary(Part textPart, String text) {
        boolean isHtml = isSameMimeType(textPart.getMimeType(), "text/html");
        return isHtml ? HtmlConverter.htmlToText(text) : text;
    }

    /**
     * Removes signatures, quotes, quote headers, separator lines and URLs, flattens all whitespace
     * and shortens the result. A {@code null} input yields the empty string.
     */
    private String stripTextForPreview(String text) {
        if (text == null) {
            return "";
        }

        String cleaned = text
                // signature: everything from a "-- " delimiter line onwards
                .replaceAll("(?ms)^-- [\\r\\n]+.*", "")
                // lines that start with a run of dashes
                .replaceAll("(?m)^----.*?$", "")
                // quoted lines
                .replaceAll("(?m)^[#>].*$", "")
                // the common "On ... wrote" quote header
                .replaceAll("(?m)^On .*wrote.?$", "")
                // generic quote header: any line ending in a word followed by a colon
                .replaceAll("(?m)^.*\\w+:$", "")
                // horizontal rules, together with the whitespace around them
                .replaceAll("\\s*([-=_]{30,}+)\\s*", " ")
                // URLs are not clickable in the preview and tend to overwhelm it
                .replaceAll("https?://\\S+", "...")
                // line breaks become spaces
                .replaceAll("(\\r|\\n)+", " ")
                // runs of whitespace collapse to a single space
                .replaceAll("\\s+", " ")
                .trim();

        if (cleaned.length() <= MAX_PREVIEW_LENGTH) {
            return cleaned;
        }

        // NOTE(review): the appended suffix is the empty string as seen here, although one character
        // is reserved for it. Confirm whether a truncation marker was intended.
        return cleaned.substring(0, MAX_PREVIEW_LENGTH - 1) + "";
    }
}

View file

@ -0,0 +1,126 @@
package com.fsck.k9.message.extractors
import com.fsck.k9.mail.Part
import com.fsck.k9.mail.internet.MessageExtractor
import com.fsck.k9.mail.internet.MimeUtility.isSameMimeType
import com.fsck.k9.message.html.EmailSection
import com.fsck.k9.message.html.EmailSectionExtractor
import com.fsck.k9.message.html.HtmlConverter
/**
 * Derives a short, single-line preview string from the text of a message part.
 *
 * HTML parts are converted to plain text. The signature, quoted sections and quote headers are then
 * removed, separator lines and URLs are replaced, whitespace is collapsed and the result is cut to
 * [MAX_PREVIEW_LENGTH].
 */
internal class PreviewTextExtractor {
    /**
     * Extracts the preview text for [textPart].
     *
     * @throws PreviewExtractionException if [MessageExtractor.getTextFromPart] returns `null`.
     */
    @Throws(PreviewExtractionException::class)
    fun extractPreview(textPart: Part): String {
        val text = MessageExtractor.getTextFromPart(textPart, MAX_CHARACTERS_CHECKED_FOR_PREVIEW)
            ?: throw PreviewExtractionException("Couldn't get text from part")

        val plainText = convertFromHtmlIfNecessary(textPart, text)
        return stripTextForPreview(plainText)
    }

    /** Converts [text] to plain text when [textPart] is "text/html"; otherwise returns it unchanged. */
    private fun convertFromHtmlIfNecessary(textPart: Part, text: String): String {
        return if (isSameMimeType(textPart.mimeType, "text/html")) {
            HtmlConverter.htmlToText(text)
        } else {
            text
        }
    }

    /** Runs the whole clean-up pipeline on plain text and returns the (possibly shortened) preview. */
    private fun stripTextForPreview(text: String): String {
        val replyText = extractUnquotedText(stripSignature(normalizeLineBreaks(text)))

        val preview = replyText
            // Try to remove lines of dashes in the preview
            .replace(REGEX_DASH_LINE, "")
            // Remove horizontal rules together with the whitespace around them
            .replace(REGEX_HORIZONTAL_RULE, " ")
            // URLs in the preview should just be shown as "..." - they're not clickable and they
            // usually overwhelm the preview
            .replace(REGEX_URL, "...")
            // Collapse whitespace. Line breaks are whitespace too, so this also removes them.
            .replace(REGEX_WHITESPACE, " ")
            .trim()

        return truncate(preview)
    }

    /** Shortens [preview] when it is longer than [MAX_PREVIEW_LENGTH] characters. */
    private fun truncate(preview: String): String {
        if (preview.length <= MAX_PREVIEW_LENGTH) return preview

        var endIndex = MAX_PREVIEW_LENGTH - 1
        // Don't cut a surrogate pair in half; that would leave an unpaired surrogate at the end.
        if (preview[endIndex - 1].isHighSurrogate()) {
            endIndex--
        }

        // NOTE(review): the appended suffix is the empty string as seen here, although one character
        // is reserved for it. Confirm whether a truncation marker was intended.
        return preview.substring(0, endIndex) + ""
    }

    /** Rewrites "\r\n" and lone "\r" to "\n" so the following steps only deal with one kind of line break. */
    private fun normalizeLineBreaks(text: String) = text.replace(REGEX_CRLF, "\n")

    /** Drops everything from the first "-- " delimiter line onwards. Expects normalized line breaks. */
    private fun stripSignature(text: String): String {
        return if (text.startsWith("-- \n")) {
            ""
        } else {
            text.substringBefore("\n-- \n")
        }
    }

    /**
     * Keeps only the sections with quote depth 0 and joins them with " […] ".
     *
     * A quote header at the end of the first section (see [quoteHeaderIndex]) is removed as well.
     */
    private fun extractUnquotedText(text: String): String {
        val emailSections = EmailSectionExtractor.extract(text)
        if (emailSections.isEmpty()) {
            return ""
        }

        val firstEmailSection = emailSections.first()
        val replySections = if (firstEmailSection.quoteDepth == 0) {
            val laterReplySections = emailSections.drop(1).filter { it.quoteDepth == 0 && it.isNotBlank() }
            if (firstEmailSection.isQuoteHeaderOnly()) {
                laterReplySections
            } else {
                // Skip a blank leading section like any other blank section; otherwise the joined
                // text would start with a dangling " […] " separator.
                val leadingText = stripQuoteHeader(firstEmailSection).takeIf { it.isNotBlank() }
                listOfNotNull(leadingText) + laterReplySections
            }
        } else {
            emailSections.filter { it.quoteDepth == 0 && it.isNotBlank() }
        }

        return replySections.joinToString(separator = " […] ")
    }

    /** Returns the text of [emailSection] up to its trailing quote header, or all of it if there is none. */
    private fun stripQuoteHeader(emailSection: EmailSection): String {
        val quoteHeaderIndex = emailSection.quoteHeaderIndex
        if (quoteHeaderIndex == -1) return emailSection.toString()

        return emailSection.substring(startIndex = 0, endIndex = quoteHeaderIndex)
    }

    /** `true` if the whole section is considered a quote header, i.e. the header starts at index 0. */
    private fun EmailSection.isQuoteHeaderOnly(): Boolean {
        return quoteHeaderIndex == 0
    }

    /**
     * Index at which a trailing quote header starts, or -1 if the section doesn't end with one.
     *
     * The section is treated as ending with a quote header when its last character, ignoring trailing
     * line breaks, is ':'. The header then reaches back to the closest preceding empty line, or to
     * the start of the section when there is no empty line.
     */
    private val EmailSection.quoteHeaderIndex: Int
        get() {
            // An empty section has no last character to inspect.
            if (lastIndex < 0) return -1

            // Skip trailing line breaks
            var index = lastIndex
            while (index > 0 && this[index] == '\n') {
                index--
            }

            if (this[index] != ':') return -1

            // Walk backwards until the first character that precedes two or more consecutive line
            // breaks; the header starts right after that character.
            var newlineCount = 0
            while (index > 0) {
                when {
                    this[index] == '\n' -> newlineCount++
                    newlineCount > 1 -> return index + 1
                    else -> newlineCount = 0
                }
                index--
            }

            return 0
        }

    companion object {
        private const val MAX_PREVIEW_LENGTH = 512
        private const val MAX_CHARACTERS_CHECKED_FOR_PREVIEW = 8192L

        // Compiled once instead of on every extractPreview() call.
        private val REGEX_CRLF = "(\\r\\n|\\r)".toRegex()
        private val REGEX_DASH_LINE = "(?m)^----.*?$".toRegex()
        private val REGEX_HORIZONTAL_RULE = "\\s*([-=_]{30,}+)\\s*".toRegex()
        private val REGEX_URL = "https?://\\S+".toRegex()
        private val REGEX_WHITESPACE = "\\s+".toRegex()
    }
}

View file

@ -1,155 +0,0 @@
package com.fsck.k9.message.extractors;
import com.fsck.k9.RobolectricTest;
import com.fsck.k9.mail.Part;
import com.fsck.k9.mail.internet.MimeBodyPart;
import org.junit.Before;
import org.junit.Test;
import static com.fsck.k9.message.MessageCreationHelper.createTextPart;
import static org.junit.Assert.assertEquals;
/**
 * Tests for {@link PreviewTextExtractor}: each case feeds one text part through the extractor and
 * checks the resulting preview string.
 */
public class PreviewTextExtractorTest extends RobolectricTest {
    private PreviewTextExtractor previewTextExtractor;

    @Before
    public void setUp() throws Exception {
        previewTextExtractor = new PreviewTextExtractor();
    }

    @Test(expected = PreviewExtractionException.class)
    public void extractPreview_withEmptyBody_shouldThrow() throws Exception {
        Part partWithoutBody = new MimeBodyPart(null, "text/plain");

        previewTextExtractor.extractPreview(partWithoutBody);
    }

    @Test
    public void extractPreview_withSimpleTextPlain() throws Exception {
        String text = "The quick brown fox jumps over the lazy dog";

        assertEquals(text, previewOf("text/plain", text));
    }

    @Test
    public void extractPreview_withSimpleTextHtml() throws Exception {
        String html = "<b>The quick brown fox jumps over the lazy dog</b>";

        assertEquals("The quick brown fox jumps over the lazy dog", previewOf("text/html", html));
    }

    @Test
    public void extractPreview_withLongTextPlain() throws Exception {
        // 520 characters, i.e. longer than the maximum preview length
        String text = "" +
                "10--------20--------30--------40--------50--------" +
                "60--------70--------80--------90--------100-------" +
                "110-------120-------130-------140-------150-------" +
                "160-------170-------180-------190-------200-------" +
                "210-------220-------230-------240-------250-------" +
                "260-------270-------280-------290-------300-------" +
                "310-------320-------330-------340-------350-------" +
                "360-------370-------380-------390-------400-------" +
                "410-------420-------430-------440-------450-------" +
                "460-------470-------480-------490-------500-------" +
                "510-------520-------";

        String expected = text.substring(0, 511) + "";
        assertEquals(expected, previewOf("text/plain", text));
    }

    @Test
    public void extractPreview_shouldStripSignature() throws Exception {
        String text = "" +
                "Some text\r\n" +
                "-- \r\n" +
                "Signature";

        assertEquals("Some text", previewOf("text/plain", text));
    }

    @Test
    public void extractPreview_shouldStripHorizontalLine() throws Exception {
        String text = "" +
                "line 1\r\n" +
                "----\r\n" +
                "line 2";

        assertEquals("line 1 line 2", previewOf("text/plain", text));
    }

    @Test
    public void extractPreview_shouldStripQuoteHeaderAndQuotedText() throws Exception {
        String text = "" +
                "some text\r\n" +
                "On 01/02/03 someone wrote\r\n" +
                "> some quoted text\r\n" +
                "# some other quoted text\r\n";

        assertEquals("some text", previewOf("text/plain", text));
    }

    @Test
    public void extractPreview_shouldStripGenericQuoteHeader() throws Exception {
        String text = "" +
                "Am 13.12.2015 um 23:42 schrieb Hans:\r\n" +
                "> hallo\r\n" +
                "hi there\r\n";

        assertEquals("hi there", previewOf("text/plain", text));
    }

    @Test
    public void extractPreview_shouldStripHorizontalRules() throws Exception {
        String text = "line 1" +
                "------------------------------\r\n" +
                "line 2";

        assertEquals("line 1 line 2", previewOf("text/plain", text));
    }

    @Test
    public void extractPreview_shouldReplaceUrl() throws Exception {
        assertEquals("some url: ...", previewOf("text/plain", "some url: https://k9mail.org/"));
    }

    @Test
    public void extractPreview_shouldCollapseAndTrimWhitespace() throws Exception {
        assertEquals("whitespace is fun", previewOf("text/plain", " whitespace is\t\tfun "));
    }

    /** Wraps {@code text} in a part of the given MIME type and returns the preview extracted from it. */
    private String previewOf(String mimeType, String text) throws Exception {
        Part part = createTextPart(mimeType, text);
        return previewTextExtractor.extractPreview(part);
    }
}

View file

@ -0,0 +1,200 @@
package com.fsck.k9.message.extractors
import com.fsck.k9.mail.internet.MimeBodyPart
import com.fsck.k9.message.MessageCreationHelper
import com.google.common.truth.Truth.assertThat
import org.junit.Test
/**
 * Tests for [PreviewTextExtractor]: each case feeds one text part through the extractor and checks
 * the resulting preview string.
 *
 * Inputs whose meaning depends on whitespace that is easy to lose (a trailing space, an empty line)
 * are written with explicit "\n" escapes instead of raw strings, so that stripping trailing
 * whitespace or blank lines from this file cannot silently change what is being tested.
 */
class PreviewTextExtractorTest {
    private val previewTextExtractor = PreviewTextExtractor()

    @Test(expected = PreviewExtractionException::class)
    fun extractPreview_withEmptyBody_shouldThrow() {
        val part = MimeBodyPart(null, "text/plain")

        previewTextExtractor.extractPreview(part)
    }

    @Test
    fun extractPreview_withSimpleTextPlain() {
        val text = "The quick brown fox jumps over the lazy dog"

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo(text)
    }

    @Test
    fun extractPreview_withSimpleTextHtml() {
        val text = "<b>The quick brown fox jumps over the lazy dog</b>"

        val preview = previewOf("text/html", text)

        assertThat(preview).isEqualTo("The quick brown fox jumps over the lazy dog")
    }

    @Test
    fun extractPreview_withLongTextPlain() {
        // 520 characters, i.e. longer than the maximum preview length
        val text = "" +
            "10--------20--------30--------40--------50--------" +
            "60--------70--------80--------90--------100-------" +
            "110-------120-------130-------140-------150-------" +
            "160-------170-------180-------190-------200-------" +
            "210-------220-------230-------240-------250-------" +
            "260-------270-------280-------290-------300-------" +
            "310-------320-------330-------340-------350-------" +
            "360-------370-------380-------390-------400-------" +
            "410-------420-------430-------440-------450-------" +
            "460-------470-------480-------490-------500-------" +
            "510-------520-------"

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo(text.substring(0, 511) + "")
    }

    @Test
    fun extractPreview_shouldStripSignature() {
        // The signature delimiter is "-- " including the trailing space.
        val text = "Some text\n" +
            "-- \n" +
            "Signature"

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("Some text")
    }

    @Test
    fun extractPreview_shouldStripHorizontalLine() {
        val text = """
            line 1
            ----
            line 2
            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("line 1 line 2")
    }

    @Test
    fun extractPreview_shouldStripQuoteHeaderAndQuotedText() {
        // The empty line separates the reply text from the quote header; without it the whole
        // first section would be treated as the quote header.
        val text = "some text\n" +
            "\n" +
            "On 01/02/03 someone wrote:\n" +
            "> some quoted text\n" +
            "> some other quoted text"

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("some text")
    }

    @Test
    fun extractPreview_shouldStripGenericQuoteHeader() {
        val text = """
            Am 13.12.2015 um 23:42 schrieb Hans:
            > hallo
            hi there
            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("hi there")
    }

    @Test
    fun extractPreview_shouldStripHorizontalRules() {
        val text = """
            line 1------------------------------
            line 2
            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("line 1 line 2")
    }

    @Test
    fun extractPreview_shouldReplaceUrl() {
        val text = "some url: https://k9mail.org/"

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("some url: ...")
    }

    @Test
    fun extractPreview_shouldCollapseAndTrimWhitespace() {
        val text = " whitespace is\t\tfun "

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("whitespace is fun")
    }

    @Test
    fun extractPreview_lineEndingWithColon() {
        val text = """
            Here's a list:
            - item 1
            - item 2
            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("Here's a list: - item 1 - item 2")
    }

    @Test
    fun extractPreview_inlineReplies() {
        val text = """
            On 2020-09-30 at 03:12 Bob wrote:
            > Hi Alice
            Hi Bob
            > How are you?
            I'm fine. Thanks for asking.
            > Bye
            See you tomorrow
            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("Hi Bob […] I'm fine. Thanks for asking. […] See you tomorrow")
    }

    @Test
    fun extractPreview_quoteHeaderContainingLineBreak() {
        // The quote header spans two lines and is separated from the reply text by an empty line.
        val text = "Reply text\n" +
            "\n" +
            "On 2020-09-30 at 03:12\n" +
            "Bob wrote:\n" +
            "> Quoted text"

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("Reply text")
    }

    /** Wraps [text] in a part of the given MIME type and returns the preview extracted from it. */
    private fun previewOf(mimeType: String, text: String): String {
        val part = MessageCreationHelper.createTextPart(mimeType, text)
        return previewTextExtractor.extractPreview(part)
    }
}