Index: src/OFString.h
==================================================================
--- src/OFString.h
+++ src/OFString.h
@@ -584,10 +584,26 @@
  */
 - (size_t)getCString: (char*)cString
 	   maxLength: (size_t)maxLength
 	    encoding: (of_string_encoding_t)encoding;
 
+/*!
+ * @brief Writes the OFString into the specified C string with the specified
+ *	  encoding, replacing characters that cannot be represented in the
+ *	  specified encoding with a question mark.
+ *
+ * @param cString The C string to write into
+ * @param maxLength The maximum number of bytes to write into the C string,
+ *		    including the terminating zero
+ * @param encoding The encoding to use for writing into the C string
+ * @return The number of bytes written into the C string, without the
+ *	   terminating zero
+ */
+- (size_t)getLossyCString: (char*)cString
+		maxLength: (size_t)maxLength
+		 encoding: (of_string_encoding_t)encoding;
+
 /*!
  * @brief Returns the OFString as a C string in the specified encoding.
  *
  * The result is valid until the autorelease pool is released. If you want to
  * use the result outside the scope of the current autorelease pool, you have to
@@ -597,10 +613,25 @@
  * @return The OFString as a C string in the specified encoding
  */
 - (const char*)cStringWithEncoding: (of_string_encoding_t)encoding
     OF_RETURNS_INNER_POINTER;
 
+/*!
+ * @brief Returns the OFString as a C string in the specified encoding,
+ *	  replacing characters that cannot be represented in the specified
+ *	  encoding with a question mark.
+ *
+ * The result is valid until the autorelease pool is released. If you want to
+ * use the result outside the scope of the current autorelease pool, you have to
+ * copy it.
+ *
+ * @param encoding The encoding for the C string
+ * @return The OFString as a C string in the specified encoding
+ */
+- (const char*)lossyCStringWithEncoding: (of_string_encoding_t)encoding
+    OF_RETURNS_INNER_POINTER;
+
 /*!
  * @brief Returns the OFString as a UTF-8 encoded C string.
  *
  * The result is valid until the autorelease pool is released. If you want to
  * use the result outside the scope of the current autorelease pool, you have to

Index: src/OFString.m
==================================================================
--- src/OFString.m
+++ src/OFString.m
@@ -63,10 +63,19 @@
  * However, the MinGW version __strtod seems to be ok.
  */
 #ifdef _WIN32
 # define strtod __strtod
 #endif
+
+@interface OFString (OF_PRIVATE_CATEGORY)
+- (size_t)OF_getCString: (char*)cString
+	      maxLength: (size_t)maxLength
+	       encoding: (of_string_encoding_t)encoding
+		  lossy: (bool)lossy;
+- (const char*)OF_cStringWithEncoding: (of_string_encoding_t)encoding
+				lossy: (bool)lossy;
+@end
 
 /* References for static linking */
 void _references_to_categories_of_OFString(void)
 {
 	_OFString_Hashing_reference = 1;
@@ -959,13 +968,14 @@
 	}
 
 	return self;
 }
 
-- (size_t)getCString: (char*)cString
-	   maxLength: (size_t)maxLength
-	    encoding: (of_string_encoding_t)encoding
+- (size_t)OF_getCString: (char*)cString
+	      maxLength: (size_t)maxLength
+	       encoding: (of_string_encoding_t)encoding
+		  lossy: (bool)lossy
 {
 	const of_unichar_t *characters = [self characters];
 	size_t i, length = [self length];
 
 	switch (encoding) {
@@ -995,11 +1005,17 @@
 				memcpy(cString + j, buffer, len);
 				j += len;
 
 				break;
 			default:
-				@throw [OFInvalidEncodingException exception];
+				if (lossy)
+					cString[j++] = '?';
+				else
+					@throw [OFInvalidEncodingException
+					    exception];
+
+				break;
 			}
 		}
 
 		cString[j] = '\0';
 
@@ -1007,14 +1023,18 @@
 	case OF_STRING_ENCODING_ASCII:
 		if (length + 1 > maxLength)
 			@throw [OFOutOfRangeException exception];
 
 		for (i = 0; i < length; i++) {
-			if OF_UNLIKELY (characters[i] > 0x80)
-				@throw [OFInvalidEncodingException exception];
-
-			cString[i] = (char)characters[i];
+			if OF_UNLIKELY (characters[i] > 0x7F) {
+				if (lossy)
+					cString[i] = '?';
+				else
+					@throw [OFInvalidEncodingException
+					    exception];
+			} else
+				cString[i] = (char)characters[i];
 		}
 
 		cString[i] = '\0';
 
 		return length;
@@ -1021,14 +1041,18 @@
 	case OF_STRING_ENCODING_ISO_8859_1:
 		if (length + 1 > maxLength)
 			@throw [OFOutOfRangeException exception];
 
 		for (i = 0; i < length; i++) {
-			if OF_UNLIKELY (characters[i] > 0xFF)
-				@throw [OFInvalidEncodingException exception];
-
-			cString[i] = (uint8_t)characters[i];
+			if OF_UNLIKELY (characters[i] > 0xFF) {
+				if (lossy)
+					cString[i] = '?';
+				else
+					@throw [OFInvalidEncodingException
+					    exception];
+			} else
+				cString[i] = (uint8_t)characters[i];
 		}
 
 		cString[i] = '\0';
 
 		return length;
@@ -1046,11 +1070,17 @@
 			case 0xB4:
 			case 0xB8:
 			case 0xBC:
 			case 0xBD:
 			case 0xBE:
-				@throw [OFInvalidEncodingException exception];
+				if (!lossy)
+					@throw [OFInvalidEncodingException
+					    exception];
+
+				/* Skip the plain store after the switch */
+				cString[i] = '?';
+				continue;
 			}
 
 			if OF_UNLIKELY (c > 0xFF) {
 				switch (c) {
 				case 0x20AC:
@@ -1076,12 +1106,18 @@
 					break;
 				case 0x178:
 					cString[i] = 0xBE;
 					break;
 				default:
-					@throw [OFInvalidEncodingException
-					    exception];
+					if (lossy)
+						cString[i] = '?';
+					else
+						@throw
+						    [OFInvalidEncodingException
+						    exception];
+
+					break;
 				}
 			} else
 				cString[i] = (uint8_t)c;
 		}
 
@@ -1093,12 +1129,19 @@
 			@throw [OFOutOfRangeException exception];
 
 		for (i = 0; i < length; i++) {
 			of_unichar_t c = characters[i];
 
-			if OF_UNLIKELY (c >= 0x80 && c <= 0x9F)
-				@throw [OFInvalidEncodingException exception];
+			if OF_UNLIKELY (c >= 0x80 && c <= 0x9F) {
+				if (!lossy)
+					@throw [OFInvalidEncodingException
+					    exception];
+
+				/* Skip the plain store further below */
+				cString[i] = '?';
+				continue;
+			}
 
 			if OF_UNLIKELY (c > 0xFF) {
 				switch (c) {
 				case 0x20AC:
 					cString[i] = 0x80;
@@ -1180,12 +1223,18 @@
 					break;
 				case 0x178:
 					cString[i] = 0x9F;
 					break;
 				default:
-					@throw [OFInvalidEncodingException
-					    exception];
+					if (lossy)
+						cString[i] = '?';
+					else
+						@throw
+						    [OFInvalidEncodingException
+						    exception];
+
+					break;
 				}
 			} else
 				cString[i] = (uint8_t)c;
 		}
 
@@ -1196,11 +1245,32 @@
 		@throw [OFNotImplementedException exceptionWithSelector: _cmd
 								 object: self];
 	}
 }
 
-- (const char*)cStringWithEncoding: (of_string_encoding_t)encoding
+- (size_t)getCString: (char*)cString
+	   maxLength: (size_t)maxLength
+	    encoding: (of_string_encoding_t)encoding
+{
+	return [self OF_getCString: cString
+			 maxLength: maxLength
+			  encoding: encoding
+			     lossy: false];
+}
+
+- (size_t)getLossyCString: (char*)cString
+		maxLength: (size_t)maxLength
+		 encoding: (of_string_encoding_t)encoding
+{
+	return [self OF_getCString: cString
+			 maxLength: maxLength
+			  encoding: encoding
+			     lossy: true];
+}
+
+- (const char*)OF_cStringWithEncoding: (of_string_encoding_t)encoding
+				lossy: (bool)lossy
 {
 	OFObject *object = [[[OFObject alloc] init] autorelease];
 	size_t length = [self length];
 	char *cString;
 
@@ -1208,13 +1278,14 @@
 	case OF_STRING_ENCODING_UTF_8:;
 		size_t cStringLength;
 
 		cString = [object allocMemoryWithSize: (length * 4) + 1];
 
-		cStringLength = [self getCString: cString
-				       maxLength: (length * 4) + 1
-					encoding: OF_STRING_ENCODING_UTF_8];
+		cStringLength = [self OF_getCString: cString
+					  maxLength: (length * 4) + 1
+					   encoding: OF_STRING_ENCODING_UTF_8
+					      lossy: lossy];
 
 		@try {
 			cString = [object resizeMemory: cString
 						  size: cStringLength + 1];
 		} @catch (OFOutOfMemoryException *e) {
@@ -1226,22 +1297,35 @@
 	case OF_STRING_ENCODING_ISO_8859_1:
 	case OF_STRING_ENCODING_ISO_8859_15:
 	case OF_STRING_ENCODING_WINDOWS_1252:
 		cString = [object allocMemoryWithSize: length + 1];
 
-		[self getCString: cString
-		       maxLength: length + 1
-			encoding: encoding];
+		[self OF_getCString: cString
+			  maxLength: length + 1
+			   encoding: encoding
+			      lossy: lossy];
 
 		break;
 	default:
 		@throw [OFNotImplementedException exceptionWithSelector: _cmd
 								 object: self];
 	}
 
 	return cString;
 }
+
+- (const char*)cStringWithEncoding: (of_string_encoding_t)encoding
+{
+	return [self OF_cStringWithEncoding: encoding
+				      lossy: false];
+}
+
+- (const char*)lossyCStringWithEncoding: (of_string_encoding_t)encoding
+{
+	return [self OF_cStringWithEncoding: encoding
+				      lossy: true];
+}
 
 - (const char*)UTF8String
 {
 	return [self cStringWithEncoding: OF_STRING_ENCODING_UTF_8];
 }

Index: tests/OFStringTests.m
==================================================================
--- tests/OFStringTests.m
+++ tests/OFStringTests.m
@@ -217,10 +217,67 @@
 	    "\x94\x95\x96\x97\x98\x99\x9A\x9B"
 	    "\x9C\x9E\x9F"
 	    encoding: OF_STRING_ENCODING_WINDOWS_1252] isEqual:
 	    @"€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ"])
 
+	TEST(@"Conversion of Codepage 437 to UTF-8",
+	    [[OFString stringWithCString: "\xB0\xB1\xB2\xDB"
+				encoding: OF_STRING_ENCODING_CODEPAGE_437]
+	    isEqual: @"░▒▓█"])
+
+	TEST(@"Conversion of UTF-8 to ASCII #1",
+	    !strcmp([@"This is a test" cStringWithEncoding:
+	    OF_STRING_ENCODING_ASCII], "This is a test"))
+
+	EXPECT_EXCEPTION(@"Conversion of UTF-8 to ASCII #2",
+	    OFInvalidEncodingException,
+	    [@"This is a tést" cStringWithEncoding: OF_STRING_ENCODING_ASCII])
+
+	TEST(@"Conversion of UTF-8 to ISO-8859-1 #1",
+	    !strcmp([@"This is ä test" cStringWithEncoding:
+	    OF_STRING_ENCODING_ISO_8859_1], "This is \xE4 test"))
+
+	EXPECT_EXCEPTION(@"Conversion of UTF-8 to ISO-8859-1 #2",
+	    OFInvalidEncodingException, [@"This is ä t€st" cStringWithEncoding:
+	    OF_STRING_ENCODING_ISO_8859_1])
+
+	TEST(@"Conversion of UTF-8 to ISO-8859-15 #1",
+	    !strcmp([@"This is ä t€st" cStringWithEncoding:
+	    OF_STRING_ENCODING_ISO_8859_15], "This is \xE4 t\xA4st"))
+
+	EXPECT_EXCEPTION(@"Conversion of UTF-8 to ISO-8859-15 #2",
+	    OFInvalidEncodingException, [@"This is ä t€st…" cStringWithEncoding:
+	    OF_STRING_ENCODING_ISO_8859_15])
+
+	TEST(@"Conversion of UTF-8 to Windows-1252 #1",
+	    !strcmp([@"This is ä t€st…" cStringWithEncoding:
+	    OF_STRING_ENCODING_WINDOWS_1252], "This is \xE4 t\x80st\x85"))
+
+	EXPECT_EXCEPTION(@"Conversion of UTF-8 to Windows-1252 #2",
+	    OFInvalidEncodingException, [@"This is ä t€st…‼"
+	    cStringWithEncoding: OF_STRING_ENCODING_WINDOWS_1252])
+
+	TEST(@"Lossy conversion of UTF-8 to ASCII",
+	    !strcmp([@"This is a tést" lossyCStringWithEncoding:
+	    OF_STRING_ENCODING_ASCII], "This is a t?st"))
+
+	TEST(@"Lossy conversion of UTF-8 to ISO-8859-1",
+	    !strcmp([@"This is ä t€st" lossyCStringWithEncoding:
+	    OF_STRING_ENCODING_ISO_8859_1], "This is \xE4 t?st"))
+
+	TEST(@"Lossy conversion of UTF-8 to ISO-8859-15",
+	    !strcmp([@"This is ä t€st…" lossyCStringWithEncoding:
+	    OF_STRING_ENCODING_ISO_8859_15], "This is \xE4 t\xA4st?"))
+
+	TEST(@"Lossy conversion of UTF-8 to ISO-8859-15 #2",
+	    !strcmp([@"½ of it" lossyCStringWithEncoding:
+	    OF_STRING_ENCODING_ISO_8859_15], "? of it"))
+
+	TEST(@"Lossy conversion of UTF-8 to Windows-1252",
+	    !strcmp([@"This is ä t€st…‼" lossyCStringWithEncoding:
+	    OF_STRING_ENCODING_WINDOWS_1252], "This is \xE4 t\x80st\x85?"))
+
 	TEST(@"+[stringWithFormat:]",
 	    [(s[0] = [OFMutableString stringWithFormat: @"%@:%d", @"test",
 	    123]) isEqual: @"test:123"])
 
 	TEST(@"-[appendFormat:]",