Index: src/OFString.h
==================================================================
--- src/OFString.h
+++ src/OFString.h
@@ -13,10 +13,13 @@
 #include
 
 #import "OFObject.h"
 #import "OFArray.h"
 
+/* Returned by of_string_utf8_to_unicode() when no character can be decoded */
+#define OF_INVALID_UNICHAR UINT32_MAX
+
 typedef uint32_t of_unichar_t;
 
 enum of_string_encoding {
 	OF_STRING_ENCODING_UTF_8,
 	OF_STRING_ENCODING_ISO_8859_1,
@@ -24,10 +27,11 @@
 	OF_STRING_ENCODING_WINDOWS_1252
 };
 
 extern int of_string_check_utf8(const char*, size_t);
 extern size_t of_string_unicode_to_utf8(of_unichar_t, char*);
+extern of_unichar_t of_string_utf8_to_unicode(const char*, size_t);
 extern size_t of_string_position_to_index(const char*, size_t);
 extern size_t of_string_index_to_position(const char*, size_t, size_t);
 
 /**
  * A class for managing strings.
@@ -210,10 +214,19 @@
  * \return An integer which is the result of the comparison, see for example
  *	   strcmp
  */
 - (int)compare: (id)obj;
 
+/**
+ * Throws an OFOutOfRangeException if index is out of range and an
+ * OFInvalidEncodingException if the character cannot be decoded.
+ *
+ * \param index The index of the Unicode character to return
+ * \return The Unicode character at the specified index
+ */
+- (of_unichar_t)characterAtIndex: (size_t)index;
+
 /**
  * \param str The string to search
  * \return The index of the first occurrence of the string or SIZE_MAX if it
  *	   wasn't found
  */
Index: src/OFString.m
==================================================================
--- src/OFString.m
+++ src/OFString.m
@@ -137,10 +137,59 @@
 			return 4;
 	}
 
 	return 0;
 }
+
+/*
+ * Decodes the UTF-8 sequence at the start of buf_ into a Unicode character.
+ *
+ * len is the number of bytes readable from buf_. OF_INVALID_UNICHAR is
+ * returned if len is 0, if the lead byte cannot start a sequence of 1 to 4
+ * bytes, if len is shorter than the sequence, or if a trailing byte is not a
+ * continuation byte (10xxxxxx). Overlong forms and surrogates are not checked.
+ */
+of_unichar_t
+of_string_utf8_to_unicode(const char *buf_, size_t len)
+{
+	const uint8_t *buf = (const uint8_t*)buf_;
+	size_t i, seq_len;
+	of_unichar_t c;
+
+	if (OF_UNLIKELY(len == 0))
+		return OF_INVALID_UNICHAR;
+
+	if (buf[0] < 0x80)
+		return buf[0];
+
+	if ((buf[0] & 0xE0) == 0xC0) {
+		seq_len = 2;
+		c = buf[0] & 0x1F;
+	} else if ((buf[0] & 0xF0) == 0xE0) {
+		seq_len = 3;
+		c = buf[0] & 0x0F;
+	} else if ((buf[0] & 0xF8) == 0xF0) {
+		seq_len = 4;
+		c = buf[0] & 0x07;
+	} else {
+		/* Stray continuation byte (0x80-0xBF) or lead byte >= 0xF8 */
+		return OF_INVALID_UNICHAR;
+	}
+
+	if (OF_UNLIKELY(len < seq_len))
+		return OF_INVALID_UNICHAR;
+
+	for (i = 1; i < seq_len; i++) {
+		/* Every trailing byte must have the form 10xxxxxx */
+		if (OF_UNLIKELY((buf[i] & 0xC0) != 0x80))
+			return OF_INVALID_UNICHAR;
+
+		c = (c << 6) | (buf[i] & 0x3F);
+	}
+
+	return c;
+}
 
 size_t
 of_string_position_to_index(const char *str, size_t pos)
 {
 	size_t i, idx = pos;
@@ -555,10 +604,27 @@
 		OF_HASH_ADD(hash, string[i]);
 	OF_HASH_FINALIZE(hash);
 
 	return hash;
 }
+
+- (of_unichar_t)characterAtIndex: (size_t)index
+{
+	of_unichar_t c;
+
+	/* From here on, index is used as an offset into string */
+	index = of_string_index_to_position(string, index, length);
+
+	if (index >= length)
+		@throw [OFOutOfRangeException newWithClass: isa];
+
+	if ((c = of_string_utf8_to_unicode(string + index, length - index)) ==
+	    OF_INVALID_UNICHAR)
+		@throw [OFInvalidEncodingException newWithClass: isa];
+
+	return c;
+}
 
 - (size_t)indexOfFirstOccurrenceOfString: (OFString*)str
 {
 	const char *str_c = [str cString];
 	size_t str_len = [str cStringLength];
Index: tests/string.m
==================================================================
--- tests/string.m
+++ tests/string.m
@@ -64,10 +64,25 @@
 	    [[s[0] appendString: s[1]] isEqual: @"täs€1𝄞3"])
 
 	TEST(@"-[length]", [s[0] length] == 7)
 	TEST(@"-[cStringLength]", [s[0] cStringLength] == 13)
 	TEST(@"-[hash]", [s[0] hash] == 0x8AC1EEF6)
+
+	TEST(@"-[characterAtIndex:]", [s[0] characterAtIndex: 0] == 't' &&
+	    [s[0] characterAtIndex: 1] == 0xE4 &&
+	    [s[0] characterAtIndex: 3] == 0x20AC &&
+	    [s[0] characterAtIndex: 5] == 0x1D11E)
+
+	EXPECT_EXCEPTION(@"Detect out of range in -[characterAtIndex:]",
+	    OFOutOfRangeException, [s[0] characterAtIndex: 7])
+
+	TEST(@"of_string_utf8_to_unicode() rejecting invalid sequences",
+	    of_string_utf8_to_unicode("\xF8\x80\x80\x80", 4) ==
+	    OF_INVALID_UNICHAR &&
+	    of_string_utf8_to_unicode("\xC3\x28", 2) == OF_INVALID_UNICHAR &&
+	    of_string_utf8_to_unicode("", 0) == OF_INVALID_UNICHAR)
+
 	TEST(@"-[reverse]", [[s[0] reverse] isEqual: @"3𝄞1€sät"])
 
 	s[0] = [OFMutableString stringWithString: @"321tset"];
 	TEST(@"-[upper]", [[s[0] upper] isEqual: @"321TSET"])
 	TEST(@"-[lower]", [[s[0] lower] isEqual: @"321tset"])