Index: src/OFString.h ================================================================== --- src/OFString.h +++ src/OFString.h @@ -203,10 +203,29 @@ */ + (instancetype)stringWithUTF16String: (const uint16_t*)string length: (size_t)length byteOrder: (of_byte_order_t)byteOrder; +/*! + * @brief Creates a new OFString from a UTF-32 encoded string. + * + * @param string The zero-terminated UTF-32 string + * @return A new autoreleased OFString + */ ++ (instancetype)stringWithUTF32String: (const uint32_t*)string; + +/*! + * @brief Creates a new OFString from a UTF-32 encoded string, assuming the + * specified byte order if no BOM is found. + * + * @param string The zero-terminated UTF-32 string + * @param byteOrder The byte order to assume if there is no BOM + * @return A new autoreleased OFString + */ ++ (instancetype)stringWithUTF32String: (const uint32_t*)string + byteOrder: (of_byte_order_t)byteOrder; + /*! * @brief Creates a new OFString from a format string. * * See printf for the format syntax. As an addition, %@ is available as format * specifier for objects. @@ -402,10 +421,29 @@ */ - initWithUTF16String: (const uint16_t*)string length: (size_t)length byteOrder: (of_byte_order_t)byteOrder; +/*! + * @brief Initializes an already allocated OFString with a UTF-32 string. + * + * @param string The zero-terminated UTF-32 string + * @return An initialized OFString + */ +- initWithUTF32String: (const uint32_t*)string; + +/*! + * @brief Initializes an already allocated OFString with a UTF-32 string, + * assuming the specified byte order if no BOM is found. + * + * @param string The zero-terminated UTF-32 string + * @param byteOrder The byte order to assume if there is no BOM + * @return An initialized OFString + */ +- initWithUTF32String: (const uint32_t*)string + byteOrder: (of_byte_order_t)byteOrder; + /*! * @brief Initializes an already allocated OFString with a format string. * * See printf for the format syntax. As an addition, %@ is available as format * specifier for objects. 
@@ -902,10 +940,34 @@ * * @return The length of string in UTF-16 characters */ - (size_t)UTF16StringLength; +/*! + * @brief Returns the string in UTF-32 encoding with native byte order. + * + * The result is valid until the autorelease pool is released. If you want to + * use the result outside the scope of the current autorelease pool, you have to + * copy it. + * + * @return The zero-terminated string in UTF-32 encoding with native byte order + */ +- (const of_unichar_t*)UTF32String OF_RETURNS_INNER_POINTER; + +/*! + * @brief Returns the string in UTF-32 encoding with the specified byte order. + * + * The result is valid until the autorelease pool is released. If you want to + * use the result outside the scope of the current autorelease pool, you have to + * copy it. + * + * @param byteOrder The byte order for the UTF-32 encoding + * @return The zero-terminated string in UTF-32 encoding with the specified byte order + */ +- (const of_unichar_t*)UTF32StringWithByteOrder: (of_byte_order_t)byteOrder + OF_RETURNS_INNER_POINTER; + /*! * @brief Writes the string into the specified file using UTF-8 encoding. 
* * @param path The path of the file to write to */ @@ -939,8 +1001,9 @@ extern "C" { #endif extern size_t of_string_utf8_encode(of_unichar_t, char*); extern size_t of_string_utf8_decode(const char*, size_t, of_unichar_t*); extern size_t of_string_utf16_length(const uint16_t*); +extern size_t of_string_utf32_length(const uint32_t*); /* number of elements before the terminating zero */ #ifdef __cplusplus } #endif Index: src/OFString.m ================================================================== --- src/OFString.m +++ src/OFString.m @@ -150,10 +150,21 @@ while (*string++ != 0) length++; return length; } + +size_t +of_string_utf32_length(const uint32_t *string) +{ + size_t length = 0; + + while (*string++ != 0) /* stop at the first zero element; it is not counted */ + length++; + + return length; +} static OFString* standardize_path(OFArray *components, OFString *currentDirectory, OFString *parentDirectory, OFString *joinString) { @@ -337,10 +348,22 @@ { return (id)[[OFString_UTF8 alloc] initWithUTF16String: string length: length byteOrder: byteOrder]; } + +- initWithUTF32String: (const uint32_t*)string +{ + return (id)[[OFString_UTF8 alloc] initWithUTF32String: string]; /* the result is a newly allocated OFString_UTF8, not the receiver */ +} + +- initWithUTF32String: (const uint32_t*)string + byteOrder: (of_byte_order_t)byteOrder +{ + return (id)[[OFString_UTF8 alloc] initWithUTF32String: string + byteOrder: byteOrder]; /* same hand-off as above, forwarding the caller's byte order */ +} - initWithFormat: (OFConstantString*)format, ... { id ret; va_list arguments; @@ -536,10 +559,22 @@ { return [[[self alloc] initWithUTF16String: string length: length byteOrder: byteOrder] autorelease]; } + ++ (instancetype)stringWithUTF32String: (const uint32_t*)string +{ + return [[[self alloc] initWithUTF32String: string] autorelease]; /* alloc, init, then autorelease for the caller */ +} + ++ (instancetype)stringWithUTF32String: (const uint32_t*)string + byteOrder: (of_byte_order_t)byteOrder +{ + return [[[self alloc] initWithUTF32String: string + byteOrder: byteOrder] autorelease]; /* alloc, init with the given byte order, then autorelease */ +} + (instancetype)stringWithFormat: (OFConstantString*)format, ... 
{ id ret; va_list arguments; @@ -711,10 +746,25 @@ } @catch (id e) { [self release]; @throw e; } } + +- initWithUTF32String: (const uint32_t*)string +{ + return [self initWithCharacters: string + length: of_string_utf32_length(string) /* string must be zero-terminated: its length is found by scanning for the first zero */ + byteOrder: OF_BYTE_ORDER_NATIVE]; +} + +- initWithUTF32String: (const uint32_t*)string + byteOrder: (of_byte_order_t)byteOrder +{ + return [self initWithCharacters: string + length: of_string_utf32_length(string) /* string must be zero-terminated: its length is found by scanning for the first zero */ + byteOrder: byteOrder]; +} - initWithFormat: (OFConstantString*)format, ... { id ret; va_list arguments; @@ -2116,10 +2166,37 @@ if (characters[i] > 0xFFFF) UTF16StringLength++; return UTF16StringLength; } + +- (const of_unichar_t*)UTF32String +{ + return [self UTF32StringWithByteOrder: OF_BYTE_ORDER_NATIVE]; /* native byte order is the default */ +} + +- (const of_unichar_t*)UTF32StringWithByteOrder: (of_byte_order_t)byteOrder +{ + OFObject *object = [[[OFObject alloc] init] autorelease]; /* NOTE(review): the returned buffer is allocated through this autoreleased object; presumably it is freed together with it - confirm */ + size_t length = [self length]; + of_unichar_t *ret; + + ret = [object allocMemoryWithSize: sizeof(of_unichar_t) + count: length + 1]; /* one extra element for the terminating zero */ + [self getCharacters: ret + inRange: of_range(0, length)]; + ret[length] = 0; /* terminating zero */ + + if (byteOrder != OF_BYTE_ORDER_NATIVE) { + size_t i; + + for (i = 0; i < length; i++) + ret[i] = OF_BSWAP32(ret[i]); /* only the length characters are swapped, in place; the terminating zero is not touched */ + } + + return ret; +} - (void)writeToFile: (OFString*)path { void *pool = objc_autoreleasePoolPush(); OFFile *file; Index: src/OFString_UTF8.m ================================================================== --- src/OFString_UTF8.m +++ src/OFString_UTF8.m @@ -529,10 +529,15 @@ exceptionWithClass: [self class]]; nextCharacter = (swap ? 
OF_BSWAP16(string[i + 1]) : string[i + 1]); + + if ((nextCharacter & 0xFC00) != 0xDC00) /* reject a second UTF-16 unit outside 0xDC00-0xDFFF before it is combined below */ + @throw [OFInvalidEncodingException + exceptionWithClass: [self class]]; + character = (((character & 0x3FF) << 10) | (nextCharacter & 0x3FF)) + 0x10000; i++; s->cStringLength--; @@ -1270,10 +1275,44 @@ exceptionWithClass: [self class]]; ret[j++] = c; i += cLen; } + + return ret; +} + +- (const of_unichar_t*)UTF32StringWithByteOrder: (of_byte_order_t)byteOrder +{ + OFObject *object = [[[OFObject alloc] init] autorelease]; /* NOTE(review): the returned buffer is allocated through this autoreleased object; presumably it is freed together with it - confirm */ + of_unichar_t *ret; + size_t i, j; + + ret = [object allocMemoryWithSize: sizeof(of_unichar_t) + count: s->length + 1]; /* assumes s->length is the number of characters the loop below decodes, plus one element for the terminating zero - TODO confirm */ + + i = j = 0; + + while (i < s->cStringLength) { + of_unichar_t c; + size_t cLen; + + cLen = of_string_utf8_decode(s->cString + i, + s->cStringLength - i, &c); + + if (cLen == 0 || c > 0x10FFFF) /* throw when the decoder reports a length of zero or the value is above 0x10FFFF */ + @throw [OFInvalidEncodingException + exceptionWithClass: [self class]]; + + if (byteOrder != OF_BYTE_ORDER_NATIVE) + ret[j++] = OF_BSWAP32(c); /* swap each character as it is stored */ + else + ret[j++] = c; + + i += cLen; + } + ret[j] = 0; /* terminating zero after the last decoded character */ return ret; } #ifdef OF_HAVE_BLOCKS Index: tests/OFStringTests.m ================================================================== --- tests/OFStringTests.m +++ tests/OFStringTests.m @@ -38,15 +38,15 @@ static OFString* whitespace[] = { @" \r \t\n\t \tasd \t \t\t\r\n", @" \t\t \t\t \t \t" }; static of_unichar_t ucstr[] = { - 0xFEFF, 'f', 0xF6, 0xF6, 'b', 0xE4, 'r', 0x1F03A + 0xFEFF, 'f', 0xF6, 0xF6, 'b', 0xE4, 'r', 0x1F03A, 0 /* terminating zero: +stringWithUTF32String: takes no length */ }; static of_unichar_t sucstr[] = { 0xFFFE0000, 0x66000000, 0xF6000000, 0xF6000000, 0x62000000, 0xE4000000, - 0x72000000, 0x3AF00100 + 0x72000000, 0x3AF00100, 0 /* terminating zero: +stringWithUTF32String: takes no length */ }; static uint16_t utf16str[] = { 0xFEFF, 'f', 0xF6, 0xF6, 'b', 0xE4, 'r', 0xD83C, 0xDC3A, 0 }; static uint16_t sutf16str[] = { @@ -155,25 +155,21 @@ (s[0] = [OFMutableString stringWithUTF8String: "\xEF\xBB\xBF" "foobar" length: 6]) && [s[0] isEqual: @"foo"]) - TEST(@"+[stringWithCharacters:length:]", - (is = [OFString stringWithCharacters: ucstr - length: sizeof(ucstr) / 
- sizeof(*ucstr)]) && - [is isEqual: @"fööbär🀺"] && - (is = [OFString stringWithCharacters: sucstr - length: sizeof(sucstr) / - sizeof(*sucstr)]) && - [is isEqual: @"fööbär🀺"]) - TEST(@"+[stringWithUTF16String:]", (is = [OFString stringWithUTF16String: utf16str]) && [is isEqual: @"fööbär🀺"] && (is = [OFString stringWithUTF16String: sutf16str]) && [is isEqual: @"fööbär🀺"]) + + TEST(@"+[stringWithUTF32String:]", + (is = [OFString stringWithUTF32String: ucstr]) && + [is isEqual: @"fööbär🀺"] && + (is = [OFString stringWithUTF32String: sucstr]) && + [is isEqual: @"fööbär🀺"]) TEST(@"+[stringWithContentsOfFile:encoding]", (is = [OFString stringWithContentsOfFile: @"testfile.txt" encoding: OF_STRING_ENCODING_ISO_8859_1]) && [is isEqual: @"testäöü"]) @@ -406,24 +402,33 @@ [@"0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF" @"0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF" hexadecimalValue]) TEST(@"-[characters]", (ua = [@"fööbär🀺" characters]) && - !memcmp(ua, ucstr + 1, sizeof(ucstr) / sizeof(*ucstr))) + !memcmp(ua, ucstr + 1, sizeof(ucstr) - 2 * sizeof(*ucstr))) /* two elements are left out: the leading BOM (skipped via ucstr + 1) and the terminating zero */ TEST(@"-[UTF16String]", (u16a = [@"fööbär🀺" UTF16String]) && - !memcmp(u16a, utf16str + 1, sizeof(utf16str) - sizeof(uint16_t))) - - TEST(@"-[UTF16String]", (u16a = [@"fööbär🀺" + !memcmp(u16a, utf16str + 1, of_string_utf16_length(utf16str) * 2) && /* the length counts the BOM, so starting at utf16str + 1 the comparison also covers the terminating zero */ + (u16a = [@"fööbär🀺" #ifdef OF_BIG_ENDIAN UTF16StringWithByteOrder: OF_BYTE_ORDER_LITTLE_ENDIAN]) && #else UTF16StringWithByteOrder: OF_BYTE_ORDER_BIG_ENDIAN]) && #endif - !memcmp(u16a, sutf16str + 1, sizeof(sutf16str) - sizeof(uint16_t))) + !memcmp(u16a, sutf16str + 1, of_string_utf16_length(sutf16str) * 2)) TEST(@"-[UTF16StringLength]", [@"fööbär🀺" UTF16StringLength] == 8) + + TEST(@"-[UTF32String]", (ua = [@"fööbär🀺" UTF32String]) && + !memcmp(ua, ucstr + 1, of_string_utf32_length(ucstr) * 4) && /* the length counts the BOM, so starting at ucstr + 1 the comparison also covers the terminating zero */ + (ua = [@"fööbär🀺" +#ifdef OF_BIG_ENDIAN + UTF32StringWithByteOrder: OF_BYTE_ORDER_LITTLE_ENDIAN]) && +#else + UTF32StringWithByteOrder: OF_BYTE_ORDER_BIG_ENDIAN]) && +#endif + 
!memcmp(ua, sucstr + 1, of_string_utf32_length(sucstr) * 4)) /* the length counts sucstr's first element, so starting at sucstr + 1 the comparison also covers the terminating zero */ TEST(@"-[MD5Hash]", [[@"asdfoobar" MD5Hash] isEqual: @"184dce2ec49b5422c7cfd8728864db4c"]) TEST(@"-[SHA1Hash]", [[@"asdfoobar" SHA1Hash]