diff --git a/k9mail-library/src/main/java/com/fsck/k9/mail/internet/DecoderUtil.java b/k9mail-library/src/main/java/com/fsck/k9/mail/internet/DecoderUtil.java index 548f2dc08..6fc2e8a8a 100644 --- a/k9mail-library/src/main/java/com/fsck/k9/mail/internet/DecoderUtil.java +++ b/k9mail-library/src/main/java/com/fsck/k9/mail/internet/DecoderUtil.java @@ -21,6 +21,13 @@ import timber.log.Timber; * it has to be determined with the sender address, the mailer and so on. */ class DecoderUtil { + + private static class EncodedWord { + private String charset; + private String encoding; + private String encodedText; + } + /** * Decodes an encoded word encoded with the 'B' encoding (described in * RFC 2047) found in a header field body. @@ -48,7 +55,7 @@ class DecoderUtil { * @param charset the Java charset to use. * @return the decoded string. */ - private static String decodeQ(String encodedWord, String charset) { + static String decodeQ(String encodedWord, String charset) { /* * Replace _ with =20 @@ -93,15 +100,15 @@ class DecoderUtil { return body; } + EncodedWord previousWord = null; int previousEnd = 0; - boolean previousWasEncoded = false; StringBuilder sb = new StringBuilder(); while (true) { int begin = body.indexOf("=?", previousEnd); if (begin == -1) { - sb.append(body.substring(previousEnd)); + decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd); return sb.toString(); } @@ -110,43 +117,77 @@ class DecoderUtil { // to find the two '?' in the "header", before looking for the final "?=". 
int qm1 = body.indexOf('?', begin + 2); if (qm1 == -1) { - sb.append(body.substring(previousEnd)); + decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd); return sb.toString(); } int qm2 = body.indexOf('?', qm1 + 1); if (qm2 == -1) { - sb.append(body.substring(previousEnd)); + decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd); return sb.toString(); } int end = body.indexOf("?=", qm2 + 1); if (end == -1) { - sb.append(body.substring(previousEnd)); + decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd); return sb.toString(); } end += 2; String sep = body.substring(previousEnd, begin); - String decoded = decodeEncodedWord(body, begin, end, message); - if (decoded == null) { + EncodedWord word = extractEncodedWord(body, begin, end, message); + + if (previousWord == null) { sb.append(sep); - sb.append(body.substring(begin, end)); - } else { - if (!previousWasEncoded || !CharsetUtil.isWhitespace(sep)) { - sb.append(sep); + if (word == null) { + sb.append(body.substring(begin, end)); + } + } else { + if (word == null) { + sb.append(decodeEncodedWord(previousWord)); + sb.append(sep); + sb.append(body.substring(begin, end)); + } else { + if (!CharsetUtil.isWhitespace(sep)) { + sb.append(decodeEncodedWord(previousWord)); + sb.append(sep); + } else if (previousWord.encoding.equals(word.encoding) && + previousWord.charset.equals(word.charset)) { + word.encodedText = previousWord.encodedText + word.encodedText; + } else { + sb.append(decodeEncodedWord(previousWord)); + } } - sb.append(decoded); } + previousWord = word; previousEnd = end; - previousWasEncoded = decoded != null; } } - // return null on error - private static String decodeEncodedWord(String body, int begin, int end, Message message) { + private static void decodePreviousAndAppendSuffix(StringBuilder sb, EncodedWord previousWord, String body, + int previousEnd) { + + if (previousWord != null) { + sb.append(decodeEncodedWord(previousWord)); + } + + 
sb.append(body.substring(previousEnd)); + } + + private static String decodeEncodedWord(EncodedWord word) { + if (word.encoding.equals("Q")) { + return decodeQ(word.encodedText, word.charset); + } else if (word.encoding.equals("B")) { + return DecoderUtil.decodeB(word.encodedText, word.charset); + } else { + Timber.w("Warning: Unknown encoding '%s'", word.encoding); + return null; + } + } + + private static EncodedWord extractEncodedWord(String body, int begin, int end, Message message) { int qm1 = body.indexOf('?', begin + 2); if (qm1 == end - 2) return null; @@ -171,13 +212,17 @@ class DecoderUtil { return null; } + EncodedWord encodedWord = new EncodedWord(); + encodedWord.charset = charset; if (encoding.equalsIgnoreCase("Q")) { - return decodeQ(encodedText, charset); + encodedWord.encoding = "Q"; } else if (encoding.equalsIgnoreCase("B")) { - return DecoderUtil.decodeB(encodedText, charset); + encodedWord.encoding = "B"; } else { Timber.w("Warning: Unknown encoding in encoded word '%s'", body.substring(begin, end)); return null; } + encodedWord.encodedText = encodedText; + return encodedWord; } } diff --git a/k9mail-library/src/test/java/com/fsck/k9/mail/internet/DecoderUtilTest.java b/k9mail-library/src/test/java/com/fsck/k9/mail/internet/DecoderUtilTest.java index cd693c3be..a0c205e7e 100644 --- a/k9mail-library/src/test/java/com/fsck/k9/mail/internet/DecoderUtilTest.java +++ b/k9mail-library/src/test/java/com/fsck/k9/mail/internet/DecoderUtilTest.java @@ -11,117 +11,207 @@ import static org.junit.Assert.assertEquals; @RunWith(K9LibRobolectricTestRunner.class) public class DecoderUtilTest { + private static final String INVALID = "=?utf-8?Q??="; + @Test - public void testDecodeEncodedWords() { - String body, expect; - MimeMessage message; + public void decodeEncodedWords_withInvalidEncodedWord_shouldReturnInputText() { + // We use INVALID as instance of an invalid encoded word in tests. 
If at some point we decide to change the code + // to recognize empty encoded text as valid and decode it to an empty string, a lot of tests will break. + // Hopefully this test will help the developer figure out why the other tests broke. + assertInputDecodesToExpected(INVALID, INVALID); + } - body = "abc"; - expect = "abc"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_with_unencoded_data_returns_original_text() { + assertInputDecodesToExpected("abc", "abc"); + } - body = "=?us-ascii?q?abc?="; - expect = "abc"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withAsciiCharset_encoded_data_returns_text() { + assertInputDecodesToExpected("=?us-ascii?q?abc?=", "abc"); + } - body = "=?"; - expect = "=?"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withStartOnly_encoding_format_returnAsText() { + assertInputDecodesToExpected("=?", "=?"); + } - body = "=??"; - expect = "=??"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withEncodedWordAndOnlyStartOfEncodedWord_shouldDecodeAndAddSuffix() { + assertInputDecodesToExpected("=?utf-8?Q?abc?= =?", "abc =?"); + } - body = "=???"; - expect = "=???"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withStartAndSeparatorOnly_returnAsText() { + assertInputDecodesToExpected("=??", "=??"); + } - body = "=????"; - expect = "=????"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withEncodedWordAndOnlyStartAndSeparatorOfEncodedWord_shouldDecodeAndAddSuffix() { + assertInputDecodesToExpected("=?utf-8?Q?abc?= =??", "abc =??"); + } - body = 
"=????="; - expect = "=????="; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withStartAnd2SeparatorOnly_returnAsText() { + assertInputDecodesToExpected("=???", "=???"); + } - body = "=??q??="; - expect = "=??q??="; - ; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withEncodedWordAndOnlyStartAndTwoSeparatorsOfEncodedWord_shouldDecodeAndAddSuffix() { + assertInputDecodesToExpected("=?utf-8?Q?abc?= =???", "abc =???"); + } - body = "=??q?a?="; - expect = "a"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withStartAnd3SeparatorOnly_returnAsText() { + assertInputDecodesToExpected("=????", "=????"); + } - body = "=??="; - expect = "=??="; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withEncodedWordAndOnlyStartAndThreeSeparatorsOfEncodedWord_shouldDecodeAndAddSuffix() { + assertInputDecodesToExpected("=?utf-8?Q?abc?= =????", "abc =????"); + } - body = "=?x?="; - expect = "=?x?="; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withSeparatorsOnly_returnAsText() { + assertInputDecodesToExpected("=????=", "=????="); + } - body = "=?x??="; - expect = "=?x??="; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withMissingCharset_returnAsText() { + assertInputDecodesToExpected("=??q??=", "=??q??="); + } - body = "=?x?q?="; - expect = "=?x?q?="; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withTextAndMissingCharset_returnAsText() { + assertInputDecodesToExpected("=??q?a?=", "a"); + } - body = "=?x?q??="; - 
expect = "=?x?q??="; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withNoTextCharsetOrEncoding_returnAsText() { + assertInputDecodesToExpected("=??=", "=??="); + } - body = "=?x?q?X?="; - expect = "X"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_with_MissingEncodingAndData_returnAsText() { + assertInputDecodesToExpected("=?x?=", "=?x?="); + } - // invalid base64 string - body = "=?us-ascii?b?abc?="; - expect = ""; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withMissingEncoding_returnAsText() { + assertInputDecodesToExpected("=?x??=", "=?x??="); + } - // broken encoded header - body = "=?us-ascii?q?abc?= =?"; - expect = "abc =?"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_with_incompleteEncodingFormat_returnAsText() { + assertInputDecodesToExpected("=?x?q?=", "=?x?q?="); + } - body = "=?x?= =?"; - expect = "=?x?= =?"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_with_unrecognisedEncoding_withEmptyEncodedData_returnAsText() { + assertInputDecodesToExpected("=?x?q??=", "=?x?q??="); + } - //Multi encoded header - body = "=?utf-8?B?5Liq5Lq66YKu566xOkJVRyAjMzAyNDY6OumCruS7tuato+aWh+mZhOS7tuWQ?=\n" + - "=?utf-8?B?jeensOecgeeVpeaYvuekuuS8mOWMlg==?="; - expect = "个人邮箱:BUG #30246::邮件正文附件��称省略显示优化"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withUnrecognisedEncoding_withEncodedData_return_encoded_data() { + assertInputDecodesToExpected("=?x?q?X?=", "X"); + } - //Non utf-8 encoding - body = "=?gb2312?B?Obv9t9az6cnu29rHsLqju6rHyLPHSlfN8rrAvsa16qOsuPzT0DIwvNIzOTnU?= " + - 
"=?gb2312?B?qr6r0aG439DHytTLr77Gteq1yMTjwLSjoaOoQUSjqQ?="; - expect = "9积分抽深圳前海华侨城JW万豪酒店,更有20家399��精选高星试睡酒店等你来!(AD�"; - message = null; - assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); + @Test + public void decodeEncodedWords_withInvalidBase64String_returnsEmptyString() { + assertInputDecodesToExpected("=?us-ascii?b?abc?=", ""); + } + + @Test + public void decodeEncodedWords_withPartiallyEncoded_returnsBothSections() { + assertInputDecodesToExpected("=?us-ascii?q?abc?= =?", "abc =?"); + } + + @Test + public void decodeEncodedWords_withPartiallyEncodedAfter_returnsBothSections() { + assertInputDecodesToExpected("def=?us-ascii?q?abc?=", "defabc"); + } + + @Test + public void decodeEncodedWords_withUnrecognisedCharset_returnsEncodedData() { + assertInputDecodesToExpected("=?x?= =?", "=?x?= =?"); + } + + @Test + public void decodeEncodedWords_withMultipleEncodedSections_decodesBoth() { + assertInputDecodesToExpected("=?us-ascii?q?abc?= =?us-ascii?q?def?=", "abcdef"); + } + + @Test + public void decodeEncodedWords_withMultipleEncodedSections_decodesSequentialSectionTogether() { + // Splitting mid-character is RFC2047 non-compliant but seen in practice. 
+ // "=?utf-8?B?b2hhaSDw?=" individually decodes to "ohai �" + // "=?utf-8?B?n5Kp?=" individually decodes to "���" + // (invalid bytes in a UTF-8 sequence are replaced with the replacement character) + assertInputDecodesToExpected("=?utf-8?B?b2hhaSDw?= =?utf-8?B?n5Kp?=", "ohai 💩"); + } + + @Test + public void decodeEncodedWords_withMultipleEncodedSectionsButCharsetAndEncodingDifferingInCase_decodesSequentialSectionTogether() { + assertInputDecodesToExpected("=?utf-8?B?b2hhaSDw?= =?UTF-8?b?n5Kp?=", "ohai 💩"); + } + + @Test + public void decodeEncodedWords_withEncodedWordWhitespaceInvalidEncodedWord_shouldOnlyDecodeEncodedWord() { + assertInputDecodesToExpected("=?utf-8?Q?abc?= " + INVALID, "abc " + INVALID); + } + + @Test + public void decodeEncodedWords_withInvalidEncodedWordWhitespaceInvalidEncodedWord_shouldReturnInputText() { + String input = INVALID + " " + INVALID; + assertInputDecodesToExpected(input, input); + } + + @Test + public void decodeEncodedWords_withEncodedWordNonWhitespaceSeparatorEncodedWord_shouldDecodeBothAndKeepSeparator() { + assertInputDecodesToExpected("=?utf-8?Q?ab?= -- =?utf-8?Q?cd?=", "ab -- cd"); + } + + @Test + public void decodeEncodedWords_withInvalidEncodedWordWhitespaceEncodedWord_shouldOnlyDecodeEncodedWord() { + assertInputDecodesToExpected(INVALID + " =?utf-8?Q?abc?=", INVALID + " abc"); + } + + @Test + public void decodeEncodedWords_withEncodedWordFollowedByEncodedWordWithDifferentEncoding_shouldDecodeIndividually() { + assertInputDecodesToExpected("=?utf-8?Q?ab?= =?utf-8?B?Y2Q=?=", "abcd"); + } + + @Test + public void decodeEncodedWords_withEncodedWordSeparatorEncodedWordWithDifferentEncoding_shouldDecodeIndividuallyAndKeepSeparator() { + assertInputDecodesToExpected("=?utf-8?Q?ab?= / =?utf-8?B?Y2Q=?=", "ab / cd"); + } + + @Test + public void decodeEncodedWords_withEncodedWordFollowedByEncodedWordWithDifferentCharset_shouldDecodeIndividually() { + assertInputDecodesToExpected("=?us-ascii?Q?oh_no_?= =?utf-8?Q?=F0=9F=92=A9?=", 
"oh no 💩"); + } + + @Test + public void decodeEncodedWords_withRFC2047examples_decodesCorrectly() { + assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?=)", "(a)"); + + assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?= b)", "(a b)"); + + assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)", "(ab)"); + + assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)", "(ab)"); + + assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?= \n =?ISO-8859-1?Q?b?=)", "(ab)"); + + assertInputDecodesToExpected("(=?ISO-8859-1?Q?a_b?=)", "(a b)"); + + assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?= =?ISO-8859-2?Q?_b?=)", "(a b)"); + } + + + private void assertInputDecodesToExpected(String input, String expected) { + String decodedText = DecoderUtil.decodeEncodedWords(input, null); + assertEquals(expected, decodedText); } }