Index: src/OFString.h ================================================================== --- src/OFString.h +++ src/OFString.h @@ -41,10 +41,11 @@ extern size_t of_string_unicode_to_utf8(of_unichar_t, char*); extern size_t of_string_utf8_to_unicode(const char*, size_t, of_unichar_t*); extern size_t of_string_position_to_index(const char*, size_t); extern size_t of_string_index_to_position(const char*, size_t, size_t); extern size_t of_unicode_string_length(const of_unichar_t*); +extern size_t of_utf16_string_length(const uint16_t*); #ifdef __cplusplus } #endif @class OFArray; @@ -132,10 +133,29 @@ * \return A new autoreleased OFString */ + stringWithUnicodeString: (of_unichar_t*)string length: (size_t)length; +/** + * Creates a new OFString from a UTF-16 encoded string. + * + * \param string The zero-terminated UTF-16 string + * \return A new autoreleased OFString + */ ++ stringWithUTF16String: (uint16_t*)string; + +/** + * Creates a new OFString from a UTF-16 encoded string with the specified + * length. + * + * \param string The UTF-16 string + * \param length The length of the UTF-16 string + * \return A new autoreleased OFString + */ ++ stringWithUTF16String: (uint16_t*)string + length: (size_t)length; + /** * Creates a new OFString from a format string. * See printf for the format syntax. * * \param format A string used as format to initialize the OFString @@ -263,10 +283,29 @@ * \return An initialized OFString */ - initWithUnicodeString: (of_unichar_t*)string length: (size_t)length; +/** + * Initializes an already allocated OFString with a UTF-16 string. + * + * \param string The zero-terminated UTF-16 string + * \return An initialized OFString + */ +- initWithUTF16String: (uint16_t*)string; + +/** + * Initializes an already allocated OFString with a UTF-16 string with the + * specified length. 
+ * + * \param string The UTF-16 string + * \param length The length of the UTF-16 string + * \return An initialized OFString + */ +- initWithUTF16String: (uint16_t*)string + length: (size_t)length; + /** * Initializes an already allocated OFString with a format string. * See printf for the format syntax. * * \param format A string used as format to initialize the OFString Index: src/OFString.m ================================================================== --- src/OFString.m +++ src/OFString.m @@ -253,10 +253,21 @@ while (*string_ != 0) string_++; return (size_t)(string_ - string); } + +size_t +of_utf16_string_length(const uint16_t *string) +{ + const uint16_t *string_ = string; + + while (*string_ != 0) + string_++; + + return (size_t)(string_ - string); +} @implementation OFString + string { return [[[self alloc] init] autorelease]; @@ -304,10 +315,22 @@ length: (size_t)length { return [[[self alloc] initWithUnicodeString: string length: length] autorelease]; } + ++ stringWithUTF16String: (uint16_t*)string +{ + return [[[self alloc] initWithUTF16String: string] autorelease]; +} + ++ stringWithUTF16String: (uint16_t*)string + length: (size_t)length +{ + return [[[self alloc] initWithUTF16String: string + length: length] autorelease]; +} + stringWithFormat: (OFString*)format, ... { id ret; va_list arguments; @@ -581,10 +604,129 @@ for (i = 0; i < length_; i++) { size_t characterLen = of_string_unicode_to_utf8( (swap ? 
of_bswap32(string_[i]) : string_[i]), buffer); + + switch (characterLen) { + case 1: + string[j++] = buffer[0]; + break; + case 2: + isUTF8 = YES; + length++; + + memcpy(string + j, buffer, 2); + j += 2; + + break; + case 3: + isUTF8 = YES; + length += 2; + + memcpy(string + j, buffer, 3); + j += 3; + + break; + case 4: + isUTF8 = YES; + length += 3; + + memcpy(string + j, buffer, 4); + j += 4; + + break; + default: + @throw [OFInvalidEncodingException + newWithClass: isa]; + } + } + + string[j] = '\0'; + + @try { + string = [self resizeMemory: string + toSize: length + 1]; + } @catch (OFOutOfMemoryException *e) { + /* We don't care, as we only tried to make it smaller */ + [e release]; + } + } @catch (id e) { + [self release]; + @throw e; + } + + return self; +} + +- initWithUTF16String: (uint16_t*)string_ +{ + return [self initWithUTF16String: string_ + length: of_utf16_string_length(string_)]; +} + +- initWithUTF16String: (uint16_t*)string_ + length: (size_t)length_ +{ + self = [super init]; + + @try { + char buffer[4]; + size_t i, j = 0; + BOOL swap = NO; + + /* Strip the BOM; if it is byte-swapped, so is the string */ + if (length_ > 0 && *string_ == 0xFEFF) { + string_++; + length_--; + } + + if (length_ > 0 && *string_ == 0xFFFE) { + swap = YES; + string_++; + length_--; + } + + length = length_; + string = [self allocMemoryWithSize: (length * 4) + 1]; + + for (i = 0; i < length_; i++) { + of_unichar_t character = + (swap ? of_bswap16(string_[i]) : string_[i]); + size_t characterLen; + + /* Missed the high surrogate */ + if ((character & 0xFC00) == 0xDC00) + @throw [OFInvalidEncodingException + newWithClass: isa]; + + if ((character & 0xFC00) == 0xD800) { + uint16_t nextCharacter; + + if (length_ <= i + 1) + @throw [OFInvalidEncodingException + newWithClass: isa]; + + nextCharacter = (swap + ? 
of_bswap16(string_[i + 1]) + : string_[i + 1]); + + /* The second unit must be a low surrogate */ + if ((nextCharacter & 0xFC00) != 0xDC00) + @throw [OFInvalidEncodingException + newWithClass: isa]; + + character = (((character & 0x3FF) << 10) | + (nextCharacter & 0x3FF)) + 0x10000; + + i++; + /* Both units were counted, but form one character */ + length--; + } + + characterLen = of_string_unicode_to_utf8( + character, buffer); switch (characterLen) { case 1: string[j++] = buffer[0]; break; Index: tests/OFStringTests.m ================================================================== --- tests/OFStringTests.m +++ tests/OFStringTests.m @@ -42,10 +42,17 @@ }; static of_unichar_t sucstr[] = { 0xFFFE0000, 0x66000000, 0xF6000000, 0xF6000000, 0x62000000, 0xE4000000, 0x72000000, 0x3AF00100, 0 }; +static uint16_t utf16str[] = { + 0xFEFF, 'f', 0xF6, 0xF6, 'b', 0xE4, 'r', 0xD83C, 0xDC3A, 0 +}; +static uint16_t sutf16str[] = { + 0xFFFE, 0x6600, 0xF600, 0xF600, 0x6200, 0xE400, 0x7200, 0x3CD8, 0x3ADC, + 0 +}; @interface EntityHandler: OFObject @end @implementation EntityHandler @@ -141,10 +148,16 @@ (s[1] = [OFString stringWithUnicodeString: ucstr]) && [s[1] isEqual: @"fööbär🀺"] && (s[1] = [OFString stringWithUnicodeString: sucstr]) && [s[1] isEqual: @"fööbär🀺"]) + TEST(@"+[stringWithUTF16String:]", + (s[1] = [OFString stringWithUTF16String: utf16str]) && + [s[1] isEqual: @"fööbär🀺"] && + (s[1] = [OFString stringWithUTF16String: sutf16str]) && + [s[1] isEqual: @"fööbär🀺"]) + TEST(@"+[stringWithContentsOfFile:encoding]", (s[1] = [OFString stringWithContentsOfFile: @"testfile.txt" encoding: OF_STRING_ENCODING_ISO_8859_1]) && [s[1] isEqual: @"testäöü"])