Merge pull request #2725 from k9mail/improveDecoderUtil

Handle RFC2047 non-compliant splitting of characters
This commit is contained in:
cketti 2017-11-12 18:36:02 +01:00 committed by GitHub
commit d6090c626b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 243 additions and 108 deletions

View file

@ -21,6 +21,13 @@ import timber.log.Timber;
* it has to be determined with the sender address, the mailer and so on. * it has to be determined with the sender address, the mailer and so on.
*/ */
class DecoderUtil { class DecoderUtil {
/**
 * Holds the parts of one parsed encoded word so that decoding can happen later, after the caller has decided
 * whether the text of the following encoded word should be joined to this one.
 */
private static class EncodedWord {
// Charset handed to decodeQ/decodeB when the word is decoded.
private String charset;
// Encoding marker; the decoding code compares it against "Q" and "B".
private String encoding;
// Still-encoded text; may grow when the text of an adjacent encoded word is prepended to it.
private String encodedText;
}
/** /**
* Decodes an encoded word encoded with the 'B' encoding (described in * Decodes an encoded word encoded with the 'B' encoding (described in
* RFC 2047) found in a header field body. * RFC 2047) found in a header field body.
@ -48,7 +55,7 @@ class DecoderUtil {
* @param charset the Java charset to use. * @param charset the Java charset to use.
* @return the decoded string. * @return the decoded string.
*/ */
private static String decodeQ(String encodedWord, String charset) { static String decodeQ(String encodedWord, String charset) {
/* /*
* Replace _ with =20 * Replace _ with =20
@ -93,15 +100,15 @@ class DecoderUtil {
return body; return body;
} }
EncodedWord previousWord = null;
int previousEnd = 0; int previousEnd = 0;
boolean previousWasEncoded = false;
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
while (true) { while (true) {
int begin = body.indexOf("=?", previousEnd); int begin = body.indexOf("=?", previousEnd);
if (begin == -1) { if (begin == -1) {
sb.append(body.substring(previousEnd)); decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd);
return sb.toString(); return sb.toString();
} }
@ -110,43 +117,77 @@ class DecoderUtil {
// to find the two '?' in the "header", before looking for the final "?=". // to find the two '?' in the "header", before looking for the final "?=".
int qm1 = body.indexOf('?', begin + 2); int qm1 = body.indexOf('?', begin + 2);
if (qm1 == -1) { if (qm1 == -1) {
sb.append(body.substring(previousEnd)); decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd);
return sb.toString(); return sb.toString();
} }
int qm2 = body.indexOf('?', qm1 + 1); int qm2 = body.indexOf('?', qm1 + 1);
if (qm2 == -1) { if (qm2 == -1) {
sb.append(body.substring(previousEnd)); decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd);
return sb.toString(); return sb.toString();
} }
int end = body.indexOf("?=", qm2 + 1); int end = body.indexOf("?=", qm2 + 1);
if (end == -1) { if (end == -1) {
sb.append(body.substring(previousEnd)); decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd);
return sb.toString(); return sb.toString();
} }
end += 2; end += 2;
String sep = body.substring(previousEnd, begin); String sep = body.substring(previousEnd, begin);
String decoded = decodeEncodedWord(body, begin, end, message); EncodedWord word = extractEncodedWord(body, begin, end, message);
if (decoded == null) {
if (previousWord == null) {
sb.append(sep);
if (word == null) {
sb.append(body.substring(begin, end));
}
} else {
if (word == null) {
sb.append(decodeEncodedWord(previousWord));
sb.append(sep); sb.append(sep);
sb.append(body.substring(begin, end)); sb.append(body.substring(begin, end));
} else { } else {
if (!previousWasEncoded || !CharsetUtil.isWhitespace(sep)) { if (!CharsetUtil.isWhitespace(sep)) {
sb.append(decodeEncodedWord(previousWord));
sb.append(sep); sb.append(sep);
} else if (previousWord.encoding.equals(word.encoding) &&
previousWord.charset.equals(word.charset)) {
word.encodedText = previousWord.encodedText + word.encodedText;
} else {
sb.append(decodeEncodedWord(previousWord));
}
} }
sb.append(decoded);
} }
previousWord = word;
previousEnd = end; previousEnd = end;
previousWasEncoded = decoded != null;
} }
} }
// return null on error private static void decodePreviousAndAppendSuffix(StringBuilder sb, EncodedWord previousWord, String body,
private static String decodeEncodedWord(String body, int begin, int end, Message message) { int previousEnd) {
if (previousWord != null) {
sb.append(decodeEncodedWord(previousWord));
}
sb.append(body.substring(previousEnd));
}
/**
 * Decodes the text of an already extracted encoded word according to its encoding marker.
 *
 * @param word the encoded word to decode; its encoding is compared against "Q" and "B"
 * @return the decoded text, or {@code null} (after logging a warning) when the encoding is neither "Q" nor "B"
 */
private static String decodeEncodedWord(EncodedWord word) {
    String encoding = word.encoding;

    if (encoding.equals("Q")) {
        return decodeQ(word.encodedText, word.charset);
    }
    if (encoding.equals("B")) {
        return DecoderUtil.decodeB(word.encodedText, word.charset);
    }

    // NOTE(review): callers append this result to a StringBuilder, where null would show up as the text "null".
    Timber.w("Warning: Unknown encoding '%s'", encoding);
    return null;
}
private static EncodedWord extractEncodedWord(String body, int begin, int end, Message message) {
int qm1 = body.indexOf('?', begin + 2); int qm1 = body.indexOf('?', begin + 2);
if (qm1 == end - 2) if (qm1 == end - 2)
return null; return null;
@ -171,13 +212,17 @@ class DecoderUtil {
return null; return null;
} }
EncodedWord encodedWord = new EncodedWord();
encodedWord.charset = charset;
if (encoding.equalsIgnoreCase("Q")) { if (encoding.equalsIgnoreCase("Q")) {
return decodeQ(encodedText, charset); encodedWord.encoding = "Q";
} else if (encoding.equalsIgnoreCase("B")) { } else if (encoding.equalsIgnoreCase("B")) {
return DecoderUtil.decodeB(encodedText, charset); encodedWord.encoding = "B";
} else { } else {
Timber.w("Warning: Unknown encoding in encoded word '%s'", body.substring(begin, end)); Timber.w("Warning: Unknown encoding in encoded word '%s'", body.substring(begin, end));
return null; return null;
} }
encodedWord.encodedText = encodedText;
return encodedWord;
} }
} }

View file

@ -11,117 +11,207 @@ import static org.junit.Assert.assertEquals;
@RunWith(K9LibRobolectricTestRunner.class) @RunWith(K9LibRobolectricTestRunner.class)
public class DecoderUtilTest { public class DecoderUtilTest {
private static final String INVALID = "=?utf-8?Q??=";
@Test @Test
public void testDecodeEncodedWords() { public void decodeEncodedWords_withInvalidEncodedWord_shouldReturnInputText() {
String body, expect; // We use INVALID as instance of an invalid encoded word in tests. If at some point we decide to change the code
MimeMessage message; // to recognize empty encoded text as valid and decode it to an empty string, a lot of tests will break.
// Hopefully this test will help the developer figure out why the other tests broke.
assertInputDecodesToExpected(INVALID, INVALID);
}
body = "abc"; @Test
expect = "abc"; public void decodeEncodedWords_with_unencoded_data_returns_original_text() {
message = null; assertInputDecodesToExpected("abc", "abc");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=?us-ascii?q?abc?="; @Test
expect = "abc"; public void decodeEncodedWords_withAsciiCharset_encoded_data_returns_text() {
message = null; assertInputDecodesToExpected("=?us-ascii?q?abc?=", "abc");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=?"; @Test
expect = "=?"; public void decodeEncodedWords_withStartOnly_encoding_format_returnAsText() {
message = null; assertInputDecodesToExpected("=?", "=?");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=??"; @Test
expect = "=??"; public void decodeEncodedWords_withEncodedWordAndOnlyStartOfEncodedWord_shouldDecodeAndAddSuffix() {
message = null; assertInputDecodesToExpected("=?utf-8?Q?abc?= =?", "abc =?");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=???"; @Test
expect = "=???"; public void decodeEncodedWords_withStartAndSeparatorOnly_returnAsText() {
message = null; assertInputDecodesToExpected("=??", "=??");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=????"; @Test
expect = "=????"; public void decodeEncodedWords_withEncodedWordAndOnlyStartAndSeparatorOfEncodedWord_shouldDecodeAndAddSuffix() {
message = null; assertInputDecodesToExpected("=?utf-8?Q?abc?= =??", "abc =??");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=????="; @Test
expect = "=????="; public void decodeEncodedWords_withStartAnd2SeparatorOnly_returnAsText() {
message = null; assertInputDecodesToExpected("=???", "=???");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=??q??="; @Test
expect = "=??q??="; public void decodeEncodedWords_withEncodedWordAndOnlyStartAndTwoSeparatorsOfEncodedWord_shouldDecodeAndAddSuffix() {
; assertInputDecodesToExpected("=?utf-8?Q?abc?= =???", "abc =???");
message = null; }
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message));
body = "=??q?a?="; @Test
expect = "a"; public void decodeEncodedWords_withStartAnd3SeparatorOnly_returnAsText() {
message = null; assertInputDecodesToExpected("=????", "=????");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=??="; @Test
expect = "=??="; public void decodeEncodedWords_withEncodedWordAndOnlyStartAndThreeSeparatorsOfEncodedWord_shouldDecodeAndAddSuffix() {
message = null; assertInputDecodesToExpected("=?utf-8?Q?abc?= =????", "abc =????");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=?x?="; @Test
expect = "=?x?="; public void decodeEncodedWords_withSeparatorsOnly_returnAsText() {
message = null; assertInputDecodesToExpected("=????=", "=????=");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=?x??="; @Test
expect = "=?x??="; public void decodeEncodedWords_withMissingCharset_returnAsText() {
message = null; assertInputDecodesToExpected("=??q??=", "=??q??=");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=?x?q?="; @Test
expect = "=?x?q?="; public void decodeEncodedWords_withTextAndMissingCharset_returnAsText() {
message = null; assertInputDecodesToExpected("=??q?a?=", "a");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=?x?q??="; @Test
expect = "=?x?q??="; public void decodeEncodedWords_withNoTextCharsetOrEncoding_returnAsText() {
message = null; assertInputDecodesToExpected("=??=", "=??=");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
body = "=?x?q?X?="; @Test
expect = "X"; public void decodeEncodedWords_with_MissingEncodingAndData_returnAsText() {
message = null; assertInputDecodesToExpected("=?x?=", "=?x?=");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
// invalid base64 string @Test
body = "=?us-ascii?b?abc?="; public void decodeEncodedWords_withMissingEncoding_returnAsText() {
expect = ""; assertInputDecodesToExpected("=?x??=", "=?x??=");
message = null; }
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message));
// broken encoded header @Test
body = "=?us-ascii?q?abc?= =?"; public void decodeEncodedWords_with_incompleteEncodingFormat_returnAsText() {
expect = "abc =?"; assertInputDecodesToExpected("=?x?q?=", "=?x?q?=");
message = null; }
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message));
body = "=?x?= =?"; @Test
expect = "=?x?= =?"; public void decodeEncodedWords_with_unrecognisedEncoding_withEmptyEncodedData_returnAsText() {
message = null; assertInputDecodesToExpected("=?x?q??=", "=?x?q??=");
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); }
//Multi encoded header @Test
body = "=?utf-8?B?5Liq5Lq66YKu566xOkJVRyAjMzAyNDY6OumCruS7tuato+aWh+mZhOS7tuWQ?=\n" + public void decodeEncodedWords_withUnrecognisedEncoding_withEncodedData_return_encoded_data() {
"=?utf-8?B?jeensOecgeeVpeaYvuekuuS8mOWMlg==?="; assertInputDecodesToExpected("=?x?q?X?=", "X");
expect = "个人邮箱:BUG #30246::邮件正文附件<E99984><E4BBB6>称省略显示优化"; }
message = null;
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message));
//Non utf-8 encoding @Test
body = "=?gb2312?B?Obv9t9az6cnu29rHsLqju6rHyLPHSlfN8rrAvsa16qOsuPzT0DIwvNIzOTnU?= " + public void decodeEncodedWords_withInvalidBase64String_returnsEmptyString() {
"=?gb2312?B?qr6r0aG439DHytTLr77Gteq1yMTjwLSjoaOoQUSjqQ?="; assertInputDecodesToExpected("=?us-ascii?b?abc?=", "");
expect = "9积分抽深圳前海华侨城JW万豪酒店更有20家399<EFBFBD><EFBFBD>精选高星试睡酒店等你来AD<EFBFBD>"; }
message = null;
assertEquals(expect, DecoderUtil.decodeEncodedWords(body, message)); @Test
public void decodeEncodedWords_withPartiallyEncoded_returnsBothSections() {
    // A complete encoded word followed by a dangling "=?": the word is decoded, the rest is kept verbatim.
    String input = "=?us-ascii?q?abc?= =?";
    String expected = "abc =?";

    assertInputDecodesToExpected(input, expected);
}
@Test
public void decodeEncodedWords_withPartiallyEncodedAfter_returnsBothSections() {
    // Plain text directly in front of an encoded word is kept and the word is decoded after it.
    String input = "def=?us-ascii?q?abc?=";
    String expected = "defabc";

    assertInputDecodesToExpected(input, expected);
}
@Test
public void decodeEncodedWords_withUnrecognisedCharset_returnsEncodedData() {
    // The input is expected to come back exactly as it went in.
    String input = "=?x?= =?";

    assertInputDecodesToExpected(input, "=?x?= =?");
}
@Test
public void decodeEncodedWords_withMultipleEncodedSections_decodesBoth() {
    // The single space between the two encoded words must not appear in the decoded result.
    String input = "=?us-ascii?q?abc?= =?us-ascii?q?def?=";
    String expected = "abcdef";

    assertInputDecodesToExpected(input, expected);
}
@Test
public void decodeEncodedWords_withMultipleEncodedSections_decodesSequentialSectionTogether() {
    // Splitting a multi-byte character across encoded words is RFC 2047 non-compliant but seen in practice.
    // Decoded on its own, "=?utf-8?B?b2hhaSDw?=" gives "ohai " followed by one U+FFFD replacement character,
    // and "=?utf-8?B?n5Kp?=" gives three U+FFFD replacement characters, because each word then holds only
    // part of the UTF-8 byte sequence (invalid bytes are replaced with the replacement character).
    String input = "=?utf-8?B?b2hhaSDw?= =?utf-8?B?n5Kp?=";
    String expected = "ohai 💩";

    assertInputDecodesToExpected(input, expected);
}
@Test
public void decodeEncodedWords_withMultipleEncodedSectionsButCharsetAndEncodingDifferingInCase_decodesSequentialSectionTogether() {
    // Same split character as in the all-lowercase case, but the second word spells charset and encoding
    // with different letter case ("UTF-8" / "b"); the expected result is the same.
    String input = "=?utf-8?B?b2hhaSDw?= =?UTF-8?b?n5Kp?=";
    String expected = "ohai 💩";

    assertInputDecodesToExpected(input, expected);
}
@Test
public void decodeEncodedWords_withEncodedWordWhitespaceInvalidEncodedWord_shouldOnlyDecodeEncodedWord() {
    // Valid word, a space, then an invalid word: only the first is decoded, space and invalid word are kept.
    String input = "=?utf-8?Q?abc?= " + INVALID;
    String expected = "abc " + INVALID;

    assertInputDecodesToExpected(input, expected);
}
@Test
public void decodeEncodedWords_withInvalidEncodedWordWhitespaceInvalidEncodedWord_shouldReturnInputText() {
    // Two invalid words separated by a space: the input is expected back unchanged.
    String text = INVALID + " " + INVALID;

    assertInputDecodesToExpected(text, text);
}
@Test
public void decodeEncodedWords_withEncodedWordNonWhitespaceSeparatorEncodedWord_shouldDecodeBothAndKeepSeparator() {
    // The separator " -- " is not pure whitespace, so it is expected to survive between the decoded words.
    String input = "=?utf-8?Q?ab?= -- =?utf-8?Q?cd?=";
    String expected = "ab -- cd";

    assertInputDecodesToExpected(input, expected);
}
@Test
public void decodeEncodedWords_withInvalidEncodedWordWhitespaceEncodedWord_shouldOnlyDecodeEncodedWord() {
    // Invalid word, a space, then a valid word: invalid word and space are kept, the valid word is decoded.
    String input = INVALID + " =?utf-8?Q?abc?=";
    String expected = INVALID + " abc";

    assertInputDecodesToExpected(input, expected);
}
@Test
public void decodeEncodedWords_withEncodedWordFollowedByEncodedWordWithDifferentEncoding_shouldDecodeIndividually() {
    // A Q-encoded word followed by a B-encoded word: the separating space is still expected to be dropped.
    String input = "=?utf-8?Q?ab?= =?utf-8?B?Y2Q=?=";
    String expected = "abcd";

    assertInputDecodesToExpected(input, expected);
}
@Test
public void decodeEncodedWords_withEncodedWordSeparatorEncodedWordWithDifferentEncoding_shouldDecodeIndividuallyAndKeepSeparator() {
    // Q-encoded and B-encoded words with " / " between them: the separator is expected to be kept.
    String input = "=?utf-8?Q?ab?= / =?utf-8?B?Y2Q=?=";
    String expected = "ab / cd";

    assertInputDecodesToExpected(input, expected);
}
@Test
public void decodeEncodedWords_withEncodedWordFollowedByEncodedWordWithDifferentCharset_shouldDecodeIndividually() {
    // Adjacent words in us-ascii and utf-8: the separating space is dropped, while the "_" characters inside
    // the first word are expected to decode to spaces.
    String input = "=?us-ascii?Q?oh_no_?= =?utf-8?Q?=F0=9F=92=A9?=";
    String expected = "oh no 💩";

    assertInputDecodesToExpected(input, expected);
}
@Test
public void decodeEncodedWords_withRFC2047examples_decodesCorrectly() {
    // Examples modelled on the table in RFC 2047, section 8.
    assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?=)", "(a)");
    assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?= b)", "(a b)");
    // Whitespace between two adjacent encoded words is dropped: one space ...
    assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)", "(ab)");
    // ... two spaces (previously a verbatim duplicate of the one-space assertion above) ...
    assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?=  =?ISO-8859-1?Q?b?=)", "(ab)");
    // ... and a line break followed by a space.
    assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?= \n =?ISO-8859-1?Q?b?=)", "(ab)");
    // "_" inside Q-encoded text stands for a space.
    assertInputDecodesToExpected("(=?ISO-8859-1?Q?a_b?=)", "(a b)");
    assertInputDecodesToExpected("(=?ISO-8859-1?Q?a?= =?ISO-8859-2?Q?_b?=)", "(a b)");
}
private void assertInputDecodesToExpected(String input, String expected) {
String decodedText = DecoderUtil.decodeEncodedWords(input, null);
assertEquals(expected, decodedText);
} }
} }