Merge pull request #5765 from k9mail/decode_ISO-2022-JP
Properly decode multiple encoded-words using ISO-2022-JP
This commit is contained in:
commit
e18e617957
6 changed files with 241 additions and 217 deletions
|
@ -19,6 +19,7 @@ dependencies {
|
|||
testImplementation "com.google.truth:truth:${versions.truth}"
|
||||
testImplementation "org.mockito:mockito-core:${versions.mockito}"
|
||||
testImplementation "org.mockito.kotlin:mockito-kotlin:${versions.mockitoKotlin}"
|
||||
testImplementation "com.ibm.icu:icu4j-charset:70.1"
|
||||
}
|
||||
|
||||
android {
|
||||
|
|
|
@ -1,217 +0,0 @@
|
|||
|
||||
package com.fsck.k9.mail.internet;
|
||||
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import com.fsck.k9.mail.Message;
|
||||
import com.fsck.k9.mail.MessagingException;
|
||||
import okio.Buffer;
|
||||
import okio.ByteString;
|
||||
import okio.Okio;
|
||||
import org.apache.james.mime4j.codec.QuotedPrintableInputStream;
|
||||
import org.apache.james.mime4j.util.CharsetUtil;
|
||||
import timber.log.Timber;
|
||||
|
||||
|
||||
/**
|
||||
* Static methods for decoding strings, byte arrays and encoded words.
|
||||
*
|
||||
* This class is copied from the org.apache.james.mime4j.decoder.DecoderUtil class. It's modified here in order to
|
||||
* decode emoji characters in the Subject headers. The method to decode emoji depends on the MimeMessage class because
|
||||
* it has to be determined with the sender address, the mailer and so on.
|
||||
*/
|
||||
class DecoderUtil {
|
||||
/**
|
||||
* Decodes a string containing encoded words as defined by RFC 2047.
|
||||
* Encoded words in have the form
|
||||
* =?charset?enc?Encoded word?= where enc is either 'Q' or 'q' for
|
||||
* quoted-printable and 'B' or 'b' for Base64.
|
||||
*
|
||||
* ANDROID: COPIED FROM A NEWER VERSION OF MIME4J
|
||||
*
|
||||
* @param body the string to decode.
|
||||
* @param message the message which has the string.
|
||||
* @return the decoded string.
|
||||
*/
|
||||
public static String decodeEncodedWords(String body, Message message) {
|
||||
|
||||
// ANDROID: Most strings will not include "=?" so a quick test can prevent unneeded
|
||||
// object creation. This could also be handled via lazy creation of the StringBuilder.
|
||||
if (!body.contains("=?")) {
|
||||
return body;
|
||||
}
|
||||
|
||||
EncodedWord previousWord = null;
|
||||
int previousEnd = 0;
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
while (true) {
|
||||
int begin = body.indexOf("=?", previousEnd);
|
||||
if (begin == -1) {
|
||||
decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
// ANDROID: The mime4j original version has an error here. It gets confused if
|
||||
// the encoded string begins with an '=' (just after "?Q?"). This patch seeks forward
|
||||
// to find the two '?' in the "header", before looking for the final "?=".
|
||||
int qm1 = body.indexOf('?', begin + 2);
|
||||
if (qm1 == -1) {
|
||||
decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
int qm2 = body.indexOf('?', qm1 + 1);
|
||||
if (qm2 == -1) {
|
||||
decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
int end = body.indexOf("?=", qm2 + 1);
|
||||
if (end == -1) {
|
||||
decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd);
|
||||
return sb.toString();
|
||||
}
|
||||
end += 2;
|
||||
|
||||
String sep = body.substring(previousEnd, begin);
|
||||
|
||||
EncodedWord word = extractEncodedWord(body, begin, end, message);
|
||||
|
||||
if (previousWord == null) {
|
||||
sb.append(sep);
|
||||
if (word == null) {
|
||||
sb.append(body, begin, end);
|
||||
}
|
||||
} else {
|
||||
if (word == null) {
|
||||
sb.append(charsetDecode(previousWord));
|
||||
sb.append(sep);
|
||||
sb.append(body, begin, end);
|
||||
} else {
|
||||
if (!CharsetUtil.isWhitespace(sep)) {
|
||||
sb.append(charsetDecode(previousWord));
|
||||
sb.append(sep);
|
||||
} else if (previousWord.encoding.equals(word.encoding) &&
|
||||
previousWord.charset.equals(word.charset)) {
|
||||
word.data = concat(previousWord.data, word.data);
|
||||
} else {
|
||||
sb.append(charsetDecode(previousWord));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
previousWord = word;
|
||||
previousEnd = end;
|
||||
}
|
||||
}
|
||||
|
||||
private static void decodePreviousAndAppendSuffix(StringBuilder sb, EncodedWord previousWord, String body,
|
||||
int previousEnd) {
|
||||
|
||||
if (previousWord != null) {
|
||||
sb.append(charsetDecode(previousWord));
|
||||
}
|
||||
|
||||
sb.append(body.substring(previousEnd));
|
||||
}
|
||||
|
||||
private static String charsetDecode(EncodedWord word) {
|
||||
try {
|
||||
InputStream inputStream = new Buffer().write(word.data).inputStream();
|
||||
return CharsetSupport.readToString(inputStream, word.charset);
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static EncodedWord extractEncodedWord(String body, int begin, int end, Message message) {
|
||||
int qm1 = body.indexOf('?', begin + 2);
|
||||
if (qm1 == end - 2)
|
||||
return null;
|
||||
|
||||
int qm2 = body.indexOf('?', qm1 + 1);
|
||||
if (qm2 == end - 2)
|
||||
return null;
|
||||
|
||||
// Extract charset, skipping language information if present (example: =?utf-8*en?Q?Text?=)
|
||||
String charsetPart = body.substring(begin + 2, qm1);
|
||||
int languageSuffixStart = charsetPart.indexOf('*');
|
||||
boolean languageSuffixFound = languageSuffixStart != -1;
|
||||
String mimeCharset = languageSuffixFound ? charsetPart.substring(0, languageSuffixStart) : charsetPart;
|
||||
|
||||
String encoding = body.substring(qm1 + 1, qm2);
|
||||
String encodedText = body.substring(qm2 + 1, end - 2);
|
||||
|
||||
String charset;
|
||||
try {
|
||||
charset = CharsetSupport.fixupCharset(mimeCharset, message);
|
||||
} catch (MessagingException e) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (encodedText.isEmpty()) {
|
||||
Timber.w("Missing encoded text in encoded word: '%s'", body.substring(begin, end));
|
||||
return null;
|
||||
}
|
||||
|
||||
EncodedWord encodedWord = new EncodedWord();
|
||||
encodedWord.charset = charset;
|
||||
if (encoding.equalsIgnoreCase("Q")) {
|
||||
encodedWord.encoding = "Q";
|
||||
encodedWord.data = decodeQ(encodedText);
|
||||
} else if (encoding.equalsIgnoreCase("B")) {
|
||||
encodedWord.encoding = "B";
|
||||
encodedWord.data = decodeB(encodedText);
|
||||
} else {
|
||||
Timber.w("Warning: Unknown encoding in encoded word '%s'", body.substring(begin, end));
|
||||
return null;
|
||||
}
|
||||
return encodedWord;
|
||||
}
|
||||
|
||||
private static ByteString decodeQ(String encodedWord) {
|
||||
/*
|
||||
* Replace _ with =20
|
||||
*/
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < encodedWord.length(); i++) {
|
||||
char c = encodedWord.charAt(i);
|
||||
if (c == '_') {
|
||||
sb.append("=20");
|
||||
} else {
|
||||
sb.append(c);
|
||||
}
|
||||
}
|
||||
|
||||
byte[] bytes = sb.toString().getBytes(Charset.forName("US-ASCII"));
|
||||
|
||||
QuotedPrintableInputStream is = new QuotedPrintableInputStream(new ByteArrayInputStream(bytes));
|
||||
try {
|
||||
return Okio.buffer(Okio.source(is)).readByteString();
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static ByteString decodeB(String encodedText) {
|
||||
ByteString decoded = ByteString.decodeBase64(encodedText);
|
||||
return decoded == null ? ByteString.EMPTY : decoded;
|
||||
}
|
||||
|
||||
private static ByteString concat(ByteString first, ByteString second) {
|
||||
return new Buffer().write(first).write(second).readByteString();
|
||||
}
|
||||
|
||||
|
||||
private static class EncodedWord {
|
||||
private String charset;
|
||||
private String encoding;
|
||||
private ByteString data;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,201 @@
|
|||
package com.fsck.k9.mail.internet
|
||||
|
||||
import com.fsck.k9.mail.Message
|
||||
import com.fsck.k9.mail.MessagingException
|
||||
import java.io.ByteArrayInputStream
|
||||
import java.io.IOException
|
||||
import okio.Buffer
|
||||
import okio.ByteString
|
||||
import okio.ByteString.Companion.decodeBase64
|
||||
import okio.buffer
|
||||
import okio.source
|
||||
import org.apache.james.mime4j.codec.QuotedPrintableInputStream
|
||||
import org.apache.james.mime4j.util.CharsetUtil
|
||||
import timber.log.Timber
|
||||
|
||||
/**
|
||||
* Decoder for encoded words (RFC 2047).
|
||||
*
|
||||
* This class is based on `org.apache.james.mime4j.decoder.DecoderUtil`. It was modified in order to support early
|
||||
* non-Unicode emoji variants.
|
||||
*/
|
||||
internal object DecoderUtil {
|
||||
/**
 * Decodes a string containing encoded words as defined by RFC 2047.
 *
 * Encoded words have the form `=?charset?enc?Encoded word?=` where `enc` is either 'Q' or 'q' for
 * quoted-printable and 'B' or 'b' for Base64.
 *
 * Two encoded words separated only by whitespace are joined at the byte level before charset decoding when
 * [EncodedWord.canBeCombinedWith] allows it; the whitespace between adjacent encoded words is not copied to the
 * output. Text that cannot be parsed as an encoded word is copied unchanged.
 *
 * @param body The string to decode.
 * @param message The message containing the string. It will be used to figure out which JIS variant to use for
 *   charset decoding. May be `null`.
 * @return The decoded string.
 */
@JvmStatic
fun decodeEncodedWords(body: String, message: Message?): String {
    // Most strings will not include "=?". So a quick test can prevent unneeded work.
    if ("=?" !in body) return body

    val result = StringBuilder()
    var pending: EncodedWord? = null
    var consumed = 0

    while (true) {
        val start = body.indexOf("=?", consumed)
        if (start == -1) break

        // Locate both '?' of the "=?charset?enc?" header before searching for the closing "?=".
        val charsetEnd = body.indexOf('?', start + 2)
        if (charsetEnd == -1) break

        val encodingEnd = body.indexOf('?', charsetEnd + 1)
        if (encodingEnd == -1) break

        val terminator = body.indexOf("?=", encodingEnd + 1)
        if (terminator == -1) break
        val stop = terminator + 2

        val gap = body.substring(consumed, start)
        val current = extractEncodedWord(body, start, stop, message)

        when {
            pending == null -> {
                result.append(gap)
                if (current == null) {
                    // Not a valid encoded word; keep the raw text.
                    result.append(body, start, stop)
                }
            }
            current == null -> {
                result.append(charsetDecode(pending))
                result.append(gap)
                result.append(body, start, stop)
            }
            !CharsetUtil.isWhitespace(gap) -> {
                result.append(charsetDecode(pending))
                result.append(gap)
            }
            pending.canBeCombinedWith(current) -> {
                // Defer charset decoding so a multi-byte character split across two words survives.
                current.data = pending.data + current.data
            }
            else -> {
                result.append(charsetDecode(pending))
            }
        }

        pending = current
        consumed = stop
    }

    // No further complete encoded word: flush the pending word and the rest of the input.
    decodePreviousAndAppendSuffix(result, pending, body, consumed)
    return result.toString()
}
|
||||
|
||||
/**
 * Finishes decoding: appends the charset-decoded form of [previousWord] (if there is one), followed by
 * everything in [body] from index [previousEnd] to the end.
 */
private fun decodePreviousAndAppendSuffix(
    output: StringBuilder,
    previousWord: EncodedWord?,
    body: String,
    previousEnd: Int
) {
    previousWord?.let { pendingWord ->
        output.append(charsetDecode(pendingWord))
    }
    output.append(body.substring(previousEnd))
}
|
||||
|
||||
/**
 * Converts the raw bytes of [word] to text using the word's charset.
 *
 * Returns an empty string when reading the data fails. The `catch` branch must not produce `null`: every caller
 * passes the result to [StringBuilder.append], which would insert the literal text "null" into the decoded
 * header.
 */
private fun charsetDecode(word: EncodedWord): String? {
    return try {
        val inputStream = Buffer().write(word.data).inputStream()
        CharsetSupport.readToString(inputStream, word.charset)
    } catch (e: IOException) {
        Timber.w(e, "Failed to decode encoded word using charset '%s'", word.charset)
        ""
    }
}
|
||||
|
||||
/**
 * Parses the candidate encoded word located at `body[begin, end)`.
 *
 * @param begin Index of the leading "=?".
 * @param end Index just past the trailing "?=".
 * @param message Passed through to `CharsetSupport.fixupCharset`.
 * @return The parsed word with its transfer encoding already removed, or `null` if the charset cannot be
 *   resolved, the encoded text is empty or the encoding is neither Q nor B.
 */
private fun extractEncodedWord(body: String, begin: Int, end: Int, message: Message?): EncodedWord? {
    val terminatorIndex = end - 2

    val charsetEnd = body.indexOf('?', begin + 2)
    if (charsetEnd == terminatorIndex) return null

    val encodingEnd = body.indexOf('?', charsetEnd + 1)
    if (encodingEnd == terminatorIndex) return null

    // Extract charset, skipping language information if present (example: =?utf-8*en?Q?Text?=)
    val mimeCharset = body.substring(begin + 2, charsetEnd).substringBefore('*')

    val encoding = body.substring(charsetEnd + 1, encodingEnd)
    val encodedText = body.substring(encodingEnd + 1, terminatorIndex)

    val charset = try {
        CharsetSupport.fixupCharset(mimeCharset, message)
    } catch (e: MessagingException) {
        return null
    }

    if (encodedText.isEmpty()) {
        Timber.w("Missing encoded text in encoded word: '%s'", body.substring(begin, end))
        return null
    }

    return when {
        encoding.equals("Q", ignoreCase = true) -> EncodedWord(charset, Encoding.Q, decodeQ(encodedText))
        encoding.equals("B", ignoreCase = true) -> EncodedWord(charset, Encoding.B, decodeB(encodedText))
        else -> {
            Timber.w("Warning: Unknown encoding in encoded word '%s'", body.substring(begin, end))
            null
        }
    }
}
|
||||
|
||||
/**
 * Removes the 'Q' transfer encoding from the text of an encoded word.
 *
 * Returns [ByteString.EMPTY] if reading from the quoted-printable decoder fails.
 */
private fun decodeQ(encodedWord: String): ByteString {
    // Replace _ with =20 so the regular quoted-printable decoder can handle the text
    val asciiBytes = encodedWord.replace("_", "=20").toByteArray(Charsets.US_ASCII)
    val quotedPrintableStream = QuotedPrintableInputStream(ByteArrayInputStream(asciiBytes))

    return quotedPrintableStream.use { stream ->
        try {
            stream.source().buffer().readByteString()
        } catch (e: IOException) {
            ByteString.EMPTY
        }
    }
}
|
||||
|
||||
/**
 * Removes the 'B' (Base64) transfer encoding from the text of an encoded word.
 *
 * Returns [ByteString.EMPTY] when `decodeBase64()` yields `null` for the input.
 */
private fun decodeB(encodedText: String): ByteString {
    val decoded = encodedText.decodeBase64()
    return if (decoded != null) decoded else ByteString.EMPTY
}
|
||||
|
||||
/** Returns a new byte string consisting of the bytes of the receiver followed by those of [second]. */
private operator fun ByteString.plus(second: ByteString): ByteString {
    val joined = Buffer()
    joined.write(this)
    joined.write(second)
    return joined.readByteString()
}
|
||||
|
||||
// The bytes ESC '(' 'B'. Used to detect ISO-2022-JP data that ends by switching back to ASCII.
private val ASCII_ESCAPE_SEQUENCE = byteArrayOf(0x1B, 0x28, 0x42)
|
||||
|
||||
/**
 * One parsed encoded word.
 *
 * @property charset Charset name as returned by `CharsetSupport.fixupCharset`.
 * @property encoding Transfer encoding the word used.
 * @property data Bytes with the transfer encoding removed; not yet charset-decoded. Mutable so the bytes of a
 *   preceding word can be prepended.
 */
private class EncodedWord(
    val charset: String,
    val encoding: Encoding,
    var data: ByteString
) {
    /**
     * Whether the bytes of this word may be joined with those of [other] before charset decoding.
     *
     * Both words must use the same encoding and charset. An ISO-2022-JP word whose data already ends with the
     * escape sequence switching to ASCII is never joined with the next word; it is decoded on its own instead.
     */
    fun canBeCombinedWith(other: EncodedWord): Boolean {
        if (encoding != other.encoding || charset != other.charset) return false
        return !endsWithAsciiEscapeSequence()
    }

    private fun endsWithAsciiEscapeSequence(): Boolean {
        val isIso2022Jp = charset.startsWith("ISO-2022-JP", ignoreCase = true)
        return isIso2022Jp && data.endsWith(ASCII_ESCAPE_SEQUENCE)
    }
}
|
||||
|
||||
/** Transfer encodings an encoded word can use. */
private enum class Encoding {
    /** Quoted-printable ('Q' or 'q'). */
    Q,

    /** Base64 ('B' or 'b'). */
    B
}
|
||||
}
|
|
@ -220,6 +220,16 @@ public class DecoderUtilTest {
|
|||
assertInputDecodesToExpected("=?utf-8*de?b?R3LDvMOfZQ==?=", "Grüße");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodeEncodedWords_withMultipleIso2022JpEncodedWordsProperlyEndingWithSwitchingToAscii() {
|
||||
// If we try to combine the base64-decoded data of both encoded words and only then perform the charset
|
||||
// decoding, we end up with an escape sequence switching to ASCII (end of first encoded word) followed by an
|
||||
// escape sequence switching to JIS X 0208:1983 (start of second encoded word). The decoder on Android reports
|
||||
// an error for this case, leading to a replacement character being inserted.
|
||||
// We use the ISO-2022-JP-TEST charset to get Android's behavior on the JVM. See TestCharsetProvider.
|
||||
assertInputDecodesToExpected("=?ISO-2022-JP-TEST?B?GyRCRnxLXDhsJEhGfEtcOGwkSEZ8S1w4bCROJUElJyVDGyhC?=\r\n" +
|
||||
" =?ISO-2022-JP-TEST?B?GyRCJS8bKEI=?=", "日本語と日本語と日本語のチェック");
|
||||
}
|
||||
|
||||
private void assertInputDecodesToExpected(String input, String expected) {
|
||||
String decodedText = DecoderUtil.decodeEncodedWords(input, null);
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
package com.fsck.k9.mail.internet
|
||||
|
||||
import com.ibm.icu.charset.CharsetProviderICU
|
||||
import java.nio.charset.Charset
|
||||
import java.nio.charset.spi.CharsetProvider
|
||||
|
||||
/**
 * [CharsetProvider] that makes ICU4J's "ISO-2022-JP" charset available under the name "ISO-2022-JP-TEST".
 *
 * The "ISO-2022-JP" decoder on the JVM is more lenient than the ICU4J decoder that is used on Android. Tests that
 * need Android's behavior use the "ISO-2022-JP-TEST" name to get the ICU4J implementation.
 */
class TestCharsetProvider : CharsetProvider() {
    private val icuCharsetProvider = CharsetProviderICU()
    private val charset = icuCharsetProvider.charsetForName("ISO-2022-JP")

    /** Iterates over the single charset this provider supplies. */
    override fun charsets(): Iterator<Charset> = listOf(charset).iterator()

    /** Returns the ICU4J charset for "ISO-2022-JP-TEST" (compared ignoring case), `null` for any other name. */
    override fun charsetForName(charsetName: String?): Charset? {
        val isTestCharset = charsetName.equals("ISO-2022-JP-TEST", ignoreCase = true)
        return if (isTestCharset) charset else null
    }
}
|
|
@ -0,0 +1 @@
|
|||
com.fsck.k9.mail.internet.TestCharsetProvider
|
Loading…
Reference in a new issue