Index: src/OFMutableString.m ================================================================== --- src/OFMutableString.m +++ src/OFMutableString.m @@ -35,11 +35,11 @@ apply_table(id self, Class isa, char **string, unsigned int *length, BOOL is_utf8, const of_unichar_t* const table[], const size_t table_size) { of_unichar_t c, tc; of_unichar_t *ustr; - size_t ulen, nlen; + size_t ulen, nlen, clen; size_t i, j, d; char *nstr; if (!is_utf8) { assert(table_size >= 1); @@ -56,17 +56,18 @@ ulen = [self length]; ustr = [self allocMemoryForNItems: [self length] withSize: ulen]; + i = 0; j = 0; nlen = 0; - for (i = 0; i < *length; i++) { - c = of_string_utf8_to_unicode(*string + i, *length - i); + while (i < *length) { + clen = of_string_utf8_to_unicode(*string + i, *length - i, &c); - if (c == OF_INVALID_UNICHAR || c > 0x10FFFF) { + if (clen == 0 || c > 0x10FFFF) { [self freeMemory: ustr]; @throw [OFInvalidEncodingException newWithClass: isa]; } if (c >> 8 < table_size) { @@ -87,21 +88,11 @@ else { [self freeMemory: ustr]; @throw [OFInvalidEncodingException newWithClass: isa]; } - if (c < 0x80); - else if (c < 0x800) - i++; - else if (c < 0x10000) - i += 2; - else if (c < 0x110000) - i += 3; - else { - [self freeMemory: ustr]; - @throw [OFInvalidEncodingException newWithClass: isa]; - } + i += clen; } @try { nstr = [self allocMemoryWithSize: nlen + 1]; } @catch (OFException *e) { Index: src/OFString.h ================================================================== --- src/OFString.h +++ src/OFString.h @@ -13,12 +13,10 @@ #include #import "OFObject.h" #import "OFArray.h" -#define OF_INVALID_UNICHAR UINT32_MAX - typedef uint32_t of_unichar_t; enum of_string_encoding { OF_STRING_ENCODING_UTF_8, OF_STRING_ENCODING_ISO_8859_1, @@ -26,11 +24,11 @@ OF_STRING_ENCODING_WINDOWS_1252 }; extern int of_string_check_utf8(const char*, size_t); extern size_t of_string_unicode_to_utf8(of_unichar_t, char*); -extern of_unichar_t of_string_utf8_to_unicode(const char*, size_t); +extern size_t of_string_utf8_to_unicode(const char*, size_t, of_unichar_t*); extern size_t of_string_position_to_index(const char*, size_t); extern size_t of_string_index_to_position(const char*, size_t, size_t); /** * A class for managing strings. Index: src/OFString.m ================================================================== --- src/OFString.m +++ src/OFString.m @@ -138,42 +138,47 @@ } return 0; } -of_unichar_t -of_string_utf8_to_unicode(const char *buf_, size_t len) +size_t +of_string_utf8_to_unicode(const char *buf_, size_t len, of_unichar_t *ret) { const uint8_t *buf = (const uint8_t*)buf_; - if (!(*buf & 0x80)) - return buf[0]; + if (!(*buf & 0x80)) { + *ret = buf[0]; + return 1; + } if ((*buf & 0xE0) == 0xC0) { if (OF_UNLIKELY(len < 2)) - return OF_INVALID_UNICHAR; + return 0; - return ((buf[0] & 0x1F) << 6) | (buf[1] & 0x3F); + *ret = ((buf[0] & 0x1F) << 6) | (buf[1] & 0x3F); + return 2; } if ((*buf & 0xF0) == 0xE0) { if (OF_UNLIKELY(len < 3)) - return OF_INVALID_UNICHAR; + return 0; - return ((buf[0] & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) | + *ret = ((buf[0] & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) | (buf[2] & 0x3F); + return 3; } if ((*buf & 0xF8) == 0xF0) { if (OF_UNLIKELY(len < 4)) - return OF_INVALID_UNICHAR; + return 0; - return ((buf[0] & 0x07) << 18) | ((buf[1] & 0x3F) << 12) | + *ret = ((buf[0] & 0x07) << 18) | ((buf[1] & 0x3F) << 12) | ((buf[2] & 0x3F) << 6) | (buf[3] & 0x3F); + return 4; } - return OF_INVALID_UNICHAR; + return 0; } size_t of_string_position_to_index(const char *str, size_t pos) { @@ -613,12 +618,11 @@ index = of_string_index_to_position(string, index, length); if (index >= length) @throw [OFOutOfRangeException newWithClass: isa]; - if ((c = of_string_utf8_to_unicode(string + index, length - index)) == - OF_INVALID_UNICHAR) + if (!of_string_utf8_to_unicode(string + index, length - index, &c)) @throw [OFInvalidEncodingException newWithClass: isa]; return c; }