Make expand_tabs() UTF-8 aware.

Previously, expand_tabs() counted every *byte* rather than every *character*
to determine the proper tab stops.  This change fixes that by skipping over
UTF-8 continuation bytes (bytes of the form 10xxxxxx), which are the only
types of bytes in valid UTF-8 that do not start a character.

This code does not skip over combining characters.
This commit is contained in:
Mark Lodato 2014-09-20 02:26:20 -04:00
parent ebb1a34d0f
commit 23a8c1933b

View file

@@ -2662,13 +2662,22 @@ is_ref(const uint8_t *data, size_t beg, size_t end, size_t *last, struct link_re
static void expand_tabs(hoedown_buffer *ob, const uint8_t *line, size_t size)
{
/* This code makes two assumptions:
* - Input is valid UTF-8. (Any byte with top two bits 10 is skipped,
* whether or not it is a valid UTF-8 continuation byte.)
* - Input contains no combining characters. (Combining characters
* should be skipped but are not.)
*/
size_t i = 0, tab = 0;
while (i < size) {
size_t org = i;
while (i < size && line[i] != '\t') {
i++; tab++;
i++;
/* ignore UTF-8 continuation bytes */
if ((line[i] & 0xc0) != 0x80)
tab++;
}
if (i > org)