Take special care when decoding encoded words with charset ISO-2022-JP

This commit is contained in:
cketti 2021-11-03 13:56:35 +01:00
parent 94548c11a8
commit 9861fc4d5a
5 changed files with 49 additions and 3 deletions

View file

@ -19,6 +19,7 @@ dependencies {
testImplementation "com.google.truth:truth:${versions.truth}"
testImplementation "org.mockito:mockito-core:${versions.mockito}"
testImplementation "org.mockito.kotlin:mockito-kotlin:${versions.mockitoKotlin}"
testImplementation "com.ibm.icu:icu4j-charset:70.1"
}
android {

View file

@ -81,7 +81,7 @@ internal object DecoderUtil {
} else if (!CharsetUtil.isWhitespace(sep)) {
output.append(charsetDecode(previousWord))
output.append(sep)
} else if (previousWord.isTypeEqualTo(word)) {
} else if (previousWord.canBeCombinedWith(word)) {
word.data = previousWord.data + word.data
} else {
output.append(charsetDecode(previousWord))
@ -179,13 +179,19 @@ internal object DecoderUtil {
return Buffer().write(this).write(second).readByteString()
}
// The bytes ESC ( B (0x1B 0x28 0x42): the escape sequence that switches an ISO-2022-JP stream back to ASCII.
// EncodedWord checks whether its data ends with these bytes before allowing it to be merged with the next word.
private val ASCII_ESCAPE_SEQUENCE = byteArrayOf(0x1B, 0x28, 0x42)
private class EncodedWord(
val charset: String,
val encoding: Encoding,
var data: ByteString
) {
fun isTypeEqualTo(other: EncodedWord): Boolean {
return encoding == other.encoding && charset == other.charset
/**
 * Decides whether the raw bytes of this encoded word may be joined with the bytes of [other] before the charset
 * decoding step.
 *
 * Both words must use the same encoding and the same charset name (compared case-sensitively). In addition, this
 * word must not be an ISO-2022-JP word that already ends with the "switch to ASCII" escape sequence; such a word
 * is complete on its own and is decoded separately.
 */
fun canBeCombinedWith(other: EncodedWord): Boolean {
    if (encoding != other.encoding) return false
    if (charset != other.charset) return false

    return !isAsciiEscapeSequence()
}
/**
 * Returns `true` when this word's charset name starts with "ISO-2022-JP" (ignoring case, so variants sharing that
 * prefix match as well) and its data ends with [ASCII_ESCAPE_SEQUENCE].
 */
private fun isAsciiEscapeSequence(): Boolean {
    val isIso2022JpFamily = charset.startsWith("ISO-2022-JP", ignoreCase = true)
    return isIso2022JpFamily && data.endsWith(ASCII_ESCAPE_SEQUENCE)
}
}

View file

@ -220,6 +220,16 @@ public class DecoderUtilTest {
assertInputDecodesToExpected("=?utf-8*de?b?R3LDvMOfZQ==?=", "Grüße");
}
@Test
public void decodeEncodedWords_withMultipleIso2022JpEncodedWordsProperlyEndingWithSwitchingToAscii() {
    // The first encoded word ends with an escape sequence switching to ASCII; the second one starts with an
    // escape sequence switching to JIS X 0208:1983. Concatenating the base64-decoded bytes of both words and only
    // then running the charset decoder puts those two escape sequences back to back. Android's decoder reports
    // an error for that input, which results in a replacement character in the output.
    // The ISO-2022-JP-TEST charset gives us Android's behavior on the JVM. See TestCharsetProvider.
    String firstHeaderLine = "=?ISO-2022-JP-TEST?B?GyRCRnxLXDhsJEhGfEtcOGwkSEZ8S1w4bCROJUElJyVDGyhC?=\r\n";
    String foldedSecondLine = " =?ISO-2022-JP-TEST?B?GyRCJS8bKEI=?=";
    String expectedText = "日本語と日本語と日本語のチェック";

    assertInputDecodesToExpected(firstHeaderLine + foldedSecondLine, expectedText);
}
private void assertInputDecodesToExpected(String input, String expected) {
String decodedText = DecoderUtil.decodeEncodedWords(input, null);

View file

@ -0,0 +1,28 @@
package com.fsck.k9.mail.internet
import com.ibm.icu.charset.CharsetProviderICU
import java.nio.charset.Charset
import java.nio.charset.spi.CharsetProvider
/**
 * Test-only [CharsetProvider] that registers ICU4J's "ISO-2022-JP" implementation under the name
 * "ISO-2022-JP-TEST".
 *
 * The JVM's built-in "ISO-2022-JP" decoder is more lenient than the ICU4J decoder that is used on Android. Tests
 * that need Android's stricter behavior request the "ISO-2022-JP-TEST" charset and get the ICU4J implementation.
 */
class TestCharsetProvider : CharsetProvider() {
    // ICU4J's ISO-2022-JP charset; the only charset this provider serves.
    private val icuIso2022Jp = CharsetProviderICU().charsetForName("ISO-2022-JP")

    /** Iterates over the single charset offered by this provider. */
    override fun charsets(): Iterator<Charset> = listOf(icuIso2022Jp).iterator()

    /**
     * Returns the ICU4J charset when [charsetName] is "ISO-2022-JP-TEST" (case-insensitive); returns `null` for
     * any other name, including `null`.
     */
    override fun charsetForName(charsetName: String?): Charset? {
        val isTestCharsetName = charsetName.equals("ISO-2022-JP-TEST", ignoreCase = true)
        return if (isTestCharsetName) icuIso2022Jp else null
    }
}

View file

@ -0,0 +1 @@
com.fsck.k9.mail.internet.TestCharsetProvider