Merge pull request #3132 from k9mail/email_section_extractor

TextToHtml: Extract sections from a plain text email
This commit is contained in:
cketti 2018-01-26 15:10:02 +01:00 committed by GitHub
commit 5931d46a42
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 450 additions and 1 deletions

View file

@ -25,6 +25,7 @@ dependencies {
compile project(':k9mail-library')
compile project(':plugins:HoloColorPicker')
compile project(':plugins:openpgp-api-lib:openpgp-api')
compile "org.jetbrains.kotlin:kotlin-stdlib-jre7:${kotlinVersion}"
compile "com.squareup.okio:okio:${okioVersion}"
compile 'commons-io:commons-io:2.4'
compile "com.android.support:support-v4:${androidSupportLibraryVersion}"
@ -40,7 +41,6 @@ dependencies {
androidTestCompile 'com.android.support.test.espresso:espresso-core:2.2.2'
testCompile "org.jetbrains.kotlin:kotlin-stdlib-jre7:${kotlinVersion}"
testCompile "org.robolectric:robolectric:${robolectricVersion}"
testCompile "junit:junit:${junitVersion}"
testCompile "com.google.truth:truth:${truthVersion}"

View file

@ -0,0 +1,117 @@
package com.fsck.k9.message.html
/**
 * Represents a section of an email's plain text body.
 *
 * A section is a [CharSequence] view over one or more segments (index ranges) of the original text. No characters
 * are copied when the section is created; [get], [subSequence] and [toString] read from the original text using the
 * stored ranges.
 *
 * When the section is built, the smallest `leadingSpaces` value passed to [Builder.addSegment] is skipped at the
 * start of every segment, so indentation shared by all segments is not part of the section's content.
 *
 * See [EmailSectionExtractor].
 */
class EmailSection private constructor(builder: Builder) : CharSequence {
    /** The quote depth that was passed to the [Builder] that created this section. */
    val quoteDepth = builder.quoteDepth

    private val text = builder.text

    // Segments with the common indentation already removed. An empty builder still carries the initial
    // Int.MAX_VALUE indent, but then there is nothing to map, so no index arithmetic happens.
    private val segments: List<Segment> = if (builder.indent == 0) {
        builder.segments.toList()
    } else {
        builder.segments.map { Segment(it.startIndex + builder.indent, it.endIndex) }
    }

    /** Total number of characters in all segments. */
    override val length = segments.map { it.endIndex - it.startIndex }.sum()

    /**
     * Returns the character at [index], counted across all segments in order.
     *
     * @throws IllegalArgumentException if [index] is not in `0 until length`.
     */
    override fun get(index: Int): Char {
        require(index in 0..(length - 1)) { "index: $index; length: $length" }

        var offset = index
        for (segment in segments) {
            val segmentLength = segment.endIndex - segment.startIndex
            if (offset < segmentLength) {
                return text[segment.startIndex + offset]
            }
            offset -= segmentLength
        }

        // Unreachable: the range check above guarantees that some segment contains the index.
        throw AssertionError()
    }

    /**
     * Returns the characters from [startIndex] (inclusive) to [endIndex] (exclusive).
     *
     * An empty range yields an empty string, the full range yields this instance, and every other range yields a new
     * [EmailSection] backed by the same text.
     *
     * @throws IllegalArgumentException if the indices are outside `0..length` or `startIndex > endIndex`.
     */
    override fun subSequence(startIndex: Int, endIndex: Int): CharSequence {
        // startIndex == length is valid; it can only denote the empty range at the end of the section.
        require(startIndex in 0..length) { "startIndex: $startIndex; length: $length" }
        require(endIndex in 0..length) { "endIndex: $endIndex; length: $length" }
        require(startIndex <= endIndex) { "startIndex > endIndex" }

        if (startIndex == endIndex) return ""
        if (startIndex == 0 && endIndex == length) return this

        // From here on startIndex < endIndex <= length, so both lookups below are guaranteed to succeed.
        val builder = Builder(text, quoteDepth)
        val (startSegmentIndex, startOffset) = findSegmentIndexAndOffset(startIndex)
        val (endSegmentIndex, endOffset) = findSegmentIndexAndOffset(endIndex, isEndIndex = true)

        val startSegment = segments[startSegmentIndex]
        if (startSegmentIndex == endSegmentIndex) {
            // The requested range lies within a single segment.
            builder.addSegment(0, startSegment.startIndex + startOffset, startSegment.startIndex + endOffset)
            return builder.build()
        }

        // First segment: reuse it when it is needed in full, otherwise cut off its beginning.
        if (startOffset == 0) {
            builder.addSegment(startSegment)
        } else {
            builder.addSegment(0, startSegment.startIndex + startOffset, startSegment.endIndex)
        }

        // Segments strictly between the first and the last one are copied unchanged.
        for (segmentIndex in startSegmentIndex + 1 until endSegmentIndex) {
            builder.addSegment(segments[segmentIndex])
        }

        // Last segment: reuse it when it is needed in full, otherwise cut off its end.
        val endSegment = segments[endSegmentIndex]
        if (endSegment.startIndex + endOffset == endSegment.endIndex) {
            builder.addSegment(endSegment)
        } else {
            builder.addSegment(0, endSegment.startIndex, endSegment.startIndex + endOffset)
        }

        return builder.build()
    }

    /**
     * Translates a section-relative [index] into the index of the segment containing it and the offset inside that
     * segment.
     *
     * With [isEndIndex] set, an index that points exactly at the end of a segment is reported as belonging to that
     * segment (offset == segment length) instead of the start of a later one.
     */
    private fun findSegmentIndexAndOffset(index: Int, isEndIndex: Boolean = false): Pair<Int, Int> {
        var offset = index
        segments.forEachIndexed { segmentIndex, segment ->
            val segmentLength = segment.endIndex - segment.startIndex
            if (offset < segmentLength || (isEndIndex && offset == segmentLength)) {
                return Pair(segmentIndex, offset)
            }
            offset -= segmentLength
        }

        throw AssertionError()
    }

    /** Returns the content of all segments concatenated into a new string. */
    override fun toString() = StringBuilder().apply {
        segments.forEach {
            append(text, it.startIndex, it.endIndex)
        }
    }.toString()

    /** A range of the original text; [startIndex] is inclusive, [endIndex] is exclusive. */
    internal data class Segment(val startIndex: Int, val endIndex: Int)

    /**
     * Collects the segments that make up an [EmailSection].
     *
     * @param text the text the segment indices refer to.
     * @param quoteDepth the quote depth to report for the built section.
     */
    class Builder(val text: String, val quoteDepth: Int) {
        internal val segments: MutableList<Segment> = mutableListOf()

        // Smallest number of leading spaces seen so far; Int.MAX_VALUE until the first segment is added.
        internal var indent = Int.MAX_VALUE

        /** `true` once at least one segment has been added. */
        val hasSegments
            get() = segments.isNotEmpty()

        /**
         * Adds the range `startIndex until endIndex` of [text] as a segment.
         *
         * @param leadingSpaces number of spaces at the start of the range; the minimum over all added segments is
         * stripped from every segment when the section is built.
         * @return this builder, to allow chaining.
         */
        fun addSegment(leadingSpaces: Int, startIndex: Int, endIndex: Int): Builder {
            indent = minOf(indent, leadingSpaces)
            segments.add(Segment(startIndex, endIndex))
            return this
        }

        /** Adds an existing segment as is and disables indentation stripping for the built section. */
        internal fun addSegment(segment: Segment) {
            indent = 0
            segments.add(segment)
        }

        /** Creates the [EmailSection] from the segments added so far. */
        fun build() = EmailSection(this)
    }
}

View file

@ -0,0 +1,125 @@
package com.fsck.k9.message.html
/**
 * Extract sections from a plain text email.
 *
 * A section consists of all consecutive lines of the same quote depth. Quote characters and spaces at the beginning of
 * a line are stripped and not part of the section's content.
 *
 * ### Example:
 *
 * ```
 * On 2018-01-25 Alice <alice@example.com> wrote:
 * > Hi Bob
 *
 * Hi Alice
 * ```
 *
 * This message consists of three sections with the following contents:
 * * `On 2018-01-25 Alice <alice@example.com> wrote:`
 * * `Hi Bob`
 * * `Hi Alice`
 */
class EmailSectionExtractor private constructor(val text: String) {
    private val sections = mutableListOf<EmailSection>()
    private var sectionBuilder = EmailSection.Builder(text, 0)

    // Start of the current unquoted section; only meaningful while quoteDepth == 0.
    private var sectionStartIndex = 0
    // Index of the most recently seen '\n', or -1 before the first one.
    private var newlineIndex = -1
    // Start of the current line's content, including the spaces that follow the last '>'.
    private var startOfContentIndex = 0
    // True while only spaces and '>' have been seen on the current line.
    private var isStartOfLine = true
    // Spaces seen since the start of the line or since the last '>'.
    private var spaces = 0
    // Quote depth of the section currently being built.
    private var quoteDepth = 0
    // Number of '>' characters seen so far on the current line.
    private var currentQuoteDepth = 0

    /**
     * Scans [text] once and returns the sections found, in order of appearance.
     */
    fun extract(): List<EmailSection> {
        text.forEachIndexed { index, character ->
            if (isStartOfLine) {
                detectQuoteCharacters(index, character)
            } else if (character == '\n') {
                // End of a line with content. Unquoted lines are added in bulk when their section ends.
                addQuotedLineToSection(endIndex = index + 1)
            }

            if (character == '\n') {
                newlineIndex = index
                resetForStartOfLine()
            }
        }

        completeLastSection()

        return sections
    }

    /**
     * Handles one character of a line's prefix: counts spaces and '>' characters, and starts a new section when the
     * line's quote depth differs from the depth of the section being built.
     */
    private fun detectQuoteCharacters(index: Int, character: Char) {
        when (character) {
            ' ' -> spaces++
            '>' -> {
                currentQuoteDepth++
                spaces = 0
            }
            '\n' -> {
                // Blank line (nothing but spaces and '>' before the line break).
                if (quoteDepth != currentQuoteDepth) {
                    // The previous section ends at the end of the previous line. Using `index + 1` here would pull
                    // this line, including its '>' characters, into a preceding unquoted section.
                    finishSection(newlineIndex + 1)
                    sectionStartIndex = index - spaces
                }
                // A blank quoted line belongs to the (possibly just started) quoted section. This is a no-op for
                // unquoted lines, which are covered by sectionStartIndex instead.
                addQuotedLineToSection(startIndex = index - spaces, endIndex = index + 1)
            }
            else -> {
                // First content character of the line.
                isStartOfLine = false
                startOfContentIndex = index - spaces
                if (quoteDepth != currentQuoteDepth) {
                    finishSection(newlineIndex + 1)
                    sectionStartIndex = startOfContentIndex
                }
            }
        }
    }

    /** Adds everything from [sectionStartIndex] up to [endIndex] to the current section if it is unquoted. */
    private fun addUnquotedLineToSection(endIndex: Int) {
        if (quoteDepth == 0 && sectionStartIndex != endIndex) {
            sectionBuilder.addSegment(0, sectionStartIndex, endIndex)
        }
    }

    /** Adds the current line to the section being built if the line is quoted. */
    private fun addQuotedLineToSection(startIndex: Int = startOfContentIndex, endIndex: Int) {
        if (currentQuoteDepth > 0) {
            sectionBuilder.addSegment(spaces, startIndex, endIndex)
        }
    }

    /** Closes the section being built (ending at [endIndex]) and starts a new one at the current line's depth. */
    private fun finishSection(endIndex: Int) {
        addUnquotedLineToSection(endIndex)
        appendSection()

        sectionBuilder = EmailSection.Builder(text, currentQuoteDepth)
        quoteDepth = currentQuoteDepth
    }

    /**
     * Adds whatever is left after the last line break and stores the final section.
     */
    private fun completeLastSection() {
        if (quoteDepth == 0) {
            // An unquoted section is only materialized when it ends, so it has to be added here even when the text
            // ends with a line break. If the text ends in an unfinished quote prefix (e.g. a trailing ">"), that
            // prefix is not part of the unquoted section.
            val endIndex = if (isStartOfLine && currentQuoteDepth > 0) newlineIndex + 1 else text.length
            addUnquotedLineToSection(endIndex)
        } else if (!isStartOfLine) {
            // Quoted line without a terminating line break; complete quoted lines have already been added.
            sectionBuilder.addSegment(spaces, startOfContentIndex, text.length)
        }

        appendSection()
    }

    /** Stores the section being built unless it is empty. */
    private fun appendSection() {
        if (sectionBuilder.hasSegments) {
            sections.add(sectionBuilder.build())
        }
    }

    private fun resetForStartOfLine() {
        isStartOfLine = true
        currentQuoteDepth = 0
        spaces = 0
    }

    companion object {
        /** Extracts the sections of [text]; see [EmailSectionExtractor] for the definition of a section. */
        fun extract(text: String) = EmailSectionExtractor(text).extract()
    }
}

View file

@ -0,0 +1,113 @@
package com.fsck.k9.message.html
import com.google.common.truth.Truth.assertThat
import org.junit.Test
/**
 * Tests for [EmailSectionExtractor].
 *
 * Each test feeds a small plain text message to the extractor and checks the quote depth and content of every
 * returned section.
 */
class EmailSectionExtractorTest {
    @Test
    fun simpleMessageWithoutQuotes() {
        val message = """
            Hi Alice,
            are we still on for new Thursday?
            Best
            Bob
        """.trimIndent()

        val sections = EmailSectionExtractor.extract(message)

        assertThat(sections.size).isEqualTo(1)
        with(sections[0]) {
            assertThat(quoteDepth).isEqualTo(0)
            assertThat(toString()).isEqualTo(message)
        }
    }

    @Test
    fun quoteFollowedByReply() {
        // The blank line between the quote and the reply is required: the last assertion expects the reply section
        // to start with "\n".
        val message = """
            Alice <alice@example.org> wrote:
            > Hi there

            Hi, what's up?
        """.trimIndent()

        val sections = EmailSectionExtractor.extract(message)

        assertThat(sections.size).isEqualTo(3)
        with(sections[0]) {
            assertThat(quoteDepth).isEqualTo(0)
            assertThat(toString()).isEqualTo("Alice <alice@example.org> wrote:\n")
        }
        with(sections[1]) {
            assertThat(quoteDepth).isEqualTo(1)
            assertThat(toString()).isEqualTo("Hi there\n")
        }
        with(sections[2]) {
            assertThat(quoteDepth).isEqualTo(0)
            assertThat(toString()).isEqualTo("\nHi, what's up?")
        }
    }

    @Test
    fun replyFollowedByTwoQuoteLevels() {
        // The blank line after "Three" is required: the first assertion expects "Three\n\nBob ...".
        // `${" "}` keeps the trailing space after '>' from being stripped by editors.
        val message = """
            Three

            Bob <bob@example.org> wrote:
            > Two
            >${" "}
            > Alice <alice@example.org> wrote:
            >> One
        """.trimIndent()

        val sections = EmailSectionExtractor.extract(message)

        assertThat(sections.size).isEqualTo(3)
        with(sections[0]) {
            assertThat(quoteDepth).isEqualTo(0)
            assertThat(toString()).isEqualTo("Three\n\nBob <bob@example.org> wrote:\n")
        }
        with(sections[1]) {
            assertThat(quoteDepth).isEqualTo(1)
            assertThat(toString()).isEqualTo("Two\n\nAlice <alice@example.org> wrote:\n")
        }
        with(sections[2]) {
            assertThat(quoteDepth).isEqualTo(2)
            assertThat(toString()).isEqualTo("One")
        }
    }

    @Test
    fun chaosQuoting() {
        // `${"\n"}` makes the message end with a line break, which trimIndent() would otherwise remove.
        val message = """
            >>> One
            > Three
            Four
            >> Two${"\n"}
        """.trimIndent()

        val sections = EmailSectionExtractor.extract(message)

        assertThat(sections.size).isEqualTo(4)
        with(sections[0]) {
            assertThat(quoteDepth).isEqualTo(3)
            assertThat(toString()).isEqualTo("One\n")
        }
        with(sections[1]) {
            assertThat(quoteDepth).isEqualTo(1)
            assertThat(toString()).isEqualTo("Three\n")
        }
        with(sections[2]) {
            assertThat(quoteDepth).isEqualTo(0)
            assertThat(toString()).isEqualTo("Four\n")
        }
        with(sections[3]) {
            assertThat(quoteDepth).isEqualTo(2)
            assertThat(toString()).isEqualTo("Two\n")
        }
    }
}

View file

@ -0,0 +1,94 @@
package com.fsck.k9.message.html
import com.google.common.truth.Truth.assertThat
import org.junit.Test
/**
 * Tests for [EmailSection].
 *
 * Sections are described with a small bracket notation (see [asEmailSection]): every `[...]` pair is one segment,
 * spaces directly after `[` are that segment's leading spaces, and characters outside of brackets are text that is
 * not part of any segment.
 */
class EmailSectionTest {
    @Test
    fun charAt() {
        assertThat("[a]".asEmailSection()[0]).isEqualTo('a')
        assertThat(".[a]".asEmailSection()[0]).isEqualTo('a')
        assertThat("[a].".asEmailSection()[0]).isEqualTo('a')
        assertThat("[ a]".asEmailSection()[0]).isEqualTo('a')
        assertThat("[abc]".asEmailSection()[0]).isEqualTo('a')
        assertThat("[a][b]".asEmailSection()[1]).isEqualTo('b')
        assertThat("[a][bc]".asEmailSection()[1]).isEqualTo('b')
        assertThat("[ab]".asEmailSection()[1]).isEqualTo('b')
        assertThat("[ab][c]".asEmailSection()[1]).isEqualTo('b')
        assertThat("[a][b][c]".asEmailSection()[1]).isEqualTo('b')
        assertThat(".[a][b][c]".asEmailSection()[1]).isEqualTo('b')
        assertThat(".[a].[b][c]".asEmailSection()[1]).isEqualTo('b')
        assertThat(".[a].[b].[c]".asEmailSection()[1]).isEqualTo('b')
        assertThat("[ a][ b][ c]".asEmailSection()[1]).isEqualTo('b')
        assertThat("[a]..[bc]".asEmailSection()[1]).isEqualTo('b')
        assertThat("[abc]".asEmailSection()[2]).isEqualTo('c')
        assertThat("[ab][c]".asEmailSection()[2]).isEqualTo('c')
        assertThat("[a][bc]".asEmailSection()[2]).isEqualTo('c')
        assertThat("[a][b][c]".asEmailSection()[2]).isEqualTo('c')
        assertThat(".[a].[b].[c].".asEmailSection()[2]).isEqualTo('c')
        assertThat("[ a][ b][ c]".asEmailSection()[2]).isEqualTo('c')
    }

    @Test
    fun length() {
        assertThat("[]".asEmailSection().length).isEqualTo(0)
        assertThat("...[]...".asEmailSection().length).isEqualTo(0)
        assertThat("[ ]".asEmailSection().length).isEqualTo(0)
        // Common indentation is one space, so only the second space of the second segment remains.
        assertThat("[ ][  ]".asEmailSection().length).isEqualTo(1)
        assertThat("[One]".asEmailSection().length).isEqualTo(3)
        assertThat("[One][Two]".asEmailSection().length).isEqualTo(6)
    }

    @Test
    fun subSequence() {
        // Content after stripping the common one-space indentation: "OneTwoThree" (11 characters).
        val section = "[ One][ Two][ Three]".asEmailSection()

        assertThat(section.subSequence(0, 11)).isSameAs(section)
        assertThat(section.subSequence(0, 3).asString()).isEqualTo("One")
        assertThat(section.subSequence(0, 2).asString()).isEqualTo("On")
        assertThat(section.subSequence(1, 3).asString()).isEqualTo("ne")
        assertThat(section.subSequence(1, 2).asString()).isEqualTo("n")
        assertThat(section.subSequence(0, 4).asString()).isEqualTo("OneT")
        assertThat(section.subSequence(1, 4).asString()).isEqualTo("neT")
        assertThat(section.subSequence(1, 6).asString()).isEqualTo("neTwo")
        assertThat(section.subSequence(1, 7).asString()).isEqualTo("neTwoT")
        assertThat(section.subSequence(1, 11).asString()).isEqualTo("neTwoThree")
        assertThat(section.subSequence(3, 11).asString()).isEqualTo("TwoThree")
        assertThat(section.subSequence(4, 11).asString()).isEqualTo("woThree")
        assertThat(section.subSequence(4, 9).asString()).isEqualTo("woThr")
        assertThat(section.subSequence(6, 9).asString()).isEqualTo("Thr")
        assertThat(section.subSequence(7, 10).asString()).isEqualTo("hre")
        assertThat(section.subSequence(6, 11).asString()).isEqualTo("Three")
    }

    /** Builds a string character by character so that `length` and `get` are exercised instead of `toString`. */
    private fun CharSequence.asString() = StringBuilder(length).apply {
        this@asString.forEach { append(it) }
    }.toString()

    /**
     * Builds an [EmailSection] (quote depth 0) from the bracket notation described in the class documentation. The
     * receiver string itself is the section's backing text, brackets included.
     */
    private fun String.asEmailSection(): EmailSection {
        val builder = EmailSection.Builder(this, 0)

        var startIndex = -1
        var isStartOfLine = true
        var spaces = 0
        this.forEachIndexed { index, c ->
            when (c) {
                '[' -> {
                    // A segment starts right after the opening bracket.
                    startIndex = index + 1
                    isStartOfLine = true
                    spaces = 0
                }
                ' ' -> if (isStartOfLine) spaces++
                ']' -> builder.addSegment(spaces, startIndex, index)
                else -> isStartOfLine = false
            }
        }

        return builder.build()
    }
}