Make OFString UTF-8 aware

* OFCharsetConversionFailedException becomes OFInvalidEncodingException.
* OFString gets an is_utf8 flag. C strings are validated with check_utf8()
  when a string is created from or extended by a C string; malformed input
  raises OFInvalidEncodingException.
* -reverse first reverses all bytes and then restores the byte order inside
  every multi-byte sequence. -upper and -lower refuse UTF-8 strings.
* The test program covers two malformed inputs and a UTF-8 reversal.

Review notes
* The truncation checks compare with <= so that index i + n is rejected when
  it equals the length, instead of reading the byte at that index first.
* -initFromCString: assigns the same defaults as -init before looking at str.
* -initFromCString: passes self to the exception after [super free], and the
  exception's -cString appears to send a message to that object. Left as is
  and marked FIXME, because the fix depends on API this patch does not show.
* Line breaks, indentation and blank lines were rebuilt from the hunk counts,
  and the header names of the bare "#import" context lines are missing.
  Restore both from the target files before applying.

Index: src/OFExceptions.h
==================================================================
--- src/OFExceptions.h
+++ src/OFExceptions.h
@@ -132,13 +132,13 @@
  */
 - (const char*)cString;
 @end
 
 /**
- * An OFException indicating that the conversation between two charsets failed.
+ * An OFException indicating that the encoding is invalid for this object.
  */
-@interface OFCharsetConversionFailedException: OFException {}
+@interface OFInvalidEncodingException: OFException {}
 /**
  * \return An error message for the exception as a C String
  */
 - (const char*)cString;
 @end
Index: src/OFExceptions.m
==================================================================
--- src/OFExceptions.m
+++ src/OFExceptions.m
@@ -139,17 +139,17 @@
 
 	return string;
 }
 @end
 
-@implementation OFCharsetConversionFailedException
+@implementation OFInvalidEncodingException
 - (const char*)cString
 {
 	if (string != NULL)
 		return string;
 
-	asprintf(&string, "Charset conversion failed in object of classs %s!",
+	asprintf(&string, "The encoding is invalid for object of class %s!",
 	    [object name]);
 
 	return string;
 }
 @end
Index: src/OFString.h
==================================================================
--- src/OFString.h
+++ src/OFString.h
@@ -16,10 +16,11 @@
  */
 @interface OFString: OFObject
 {
 	char *string;
 	size_t length;
+	BOOL is_utf8;
 }
 
 /**
  * Creates a new OFString.
  *
Index: src/OFString.m
==================================================================
--- src/OFString.m
+++ src/OFString.m
@@ -15,10 +15,77 @@
 #import
 #import
 
 #import "OFString.h"
 #import "OFExceptions.h"
+#import "OFMacros.h"
+
+/*
+ * Checks the start and continuation bytes of the first len bytes of str.
+ *
+ * Returns 1 if at least one multi-byte UTF-8 sequence was found, 0 if all
+ * bytes are plain ASCII and -1 if a sequence is malformed or cut off by the
+ * end of the buffer.
+ */
+static OF_INLINE int
+check_utf8(const char *str, size_t len)
+{
+	size_t i;
+	BOOL utf8;
+
+	utf8 = NO;
+
+	for (i = 0; i < len; i++) {
+		/* No sign of UTF-8 here */
+		if (OF_LIKELY(~str[i] & 0x80))
+			continue;
+
+		utf8 = YES;
+
+		/* We're missing a start byte here */
+		if (OF_UNLIKELY(~str[i] & 0x40))
+			return -1;
+
+		/* We have at minimum a 2 byte character -> check next byte */
+		if (OF_UNLIKELY(len <= i + 1 || ~str[i + 1] & 0x80 ||
+		    str[i + 1] & 0x40))
+			return -1;
+
+		/* Check if we have at minimum a 3 byte character */
+		if (OF_LIKELY(~str[i] & 0x20)) {
+			i++;
+			continue;
+		}
+
+		/* We have at minimum a 3 byte char -> check second next byte */
+		if (OF_UNLIKELY(len <= i + 2 || ~str[i + 2] & 0x80 ||
+		    str[i + 2] & 0x40))
+			return -1;
+
+		/* Check if we have a 4 byte character */
+		if (OF_LIKELY(~str[i] & 0x10)) {
+			i += 2;
+			continue;
+		}
+
+		/* We have a 4 byte character -> check third next byte */
+		if (OF_UNLIKELY(len <= i + 3 || ~str[i + 3] & 0x80 ||
+		    str[i + 3] & 0x40))
+			return -1;
+
+		/*
+		 * Just in case, check if there's a 5th character, which is
+		 * forbidden by UTF-8
+		 */
+		if (OF_UNLIKELY(str[i] & 0x08))
+			return -1;
+
+		i += 3;
+	}
+
+	return (utf8 ? 1 : 0);
+}
 
 @implementation OFString
 + new
 {
 	return [[self alloc] init];
@@ -32,23 +99,42 @@
 - init
 {
 	if ((self = [super init])) {
 		length = 0;
 		string = NULL;
+		is_utf8 = NO;
 	}
 
 	return self;
 }
 
 - initFromCString: (const char*)str
 {
 	if ((self = [super init])) {
-		if (str == NULL) {
-			length = 0;
-			string = NULL;
-		} else {
+		/* Same defaults as -init, whatever str turns out to be */
+		length = 0;
+		string = NULL;
+		is_utf8 = NO;
+
+		if (str != NULL) {
 			length = strlen(str);
+
+			switch (check_utf8(str, length)) {
+			case 1:
+				is_utf8 = YES;
+				break;
+			case -1:
+				/*
+				 * FIXME(review): self is still handed to the
+				 * exception after [super free] - confirm that
+				 * this is safe.
+				 */
+				[super free];
+				@throw [OFInvalidEncodingException
+					newWithObject: self];
+			}
+
 			string = [self getMemWithSize: length + 1];
 			memcpy(string, str, length + 1);
 		}
 	}
 
@@ -93,12 +179,20 @@
 
 	if (string == NULL)
 		return [self setTo: [OFString newFromCString: str]];
 
 	strlength = strlen(str);
-	newlen = length + strlength;
+
+	switch (check_utf8(str, strlength)) {
+	case 1:
+		is_utf8 = YES;
+		break;
+	case -1:
+		@throw [OFInvalidEncodingException newWithObject: self];
+	}
 
+	newlen = length + strlength;
 	newstr = [self resizeMem: string
 			  toSize: newlen + 1];
 
 	memcpy(newstr + length, str, strlength + 1);
 
@@ -110,23 +204,89 @@
 
 - reverse
 {
 	size_t i, j, len = length / 2;
 
+	/* We reverse all bytes and restore UTF-8 later, if necessary */
 	for (i = 0, j = length - 1; i < len; i++, j--) {
 		string[i] ^= string[j];
 		string[j] ^= string[i];
 		string[i] ^= string[j];
 	}
+
+	if (!is_utf8)
+		return self;
+
+	for (i = 0; i < length; i++) {
+		/* ASCII */
+		if (OF_LIKELY(~string[i] & 0x80))
+			continue;
+
+		/* A start byte can't happen first as we reversed everything */
+		if (OF_UNLIKELY(string[i] & 0x40))
+			@throw [OFInvalidEncodingException newWithObject: self];
+
+		/* Next byte must not be ASCII */
+		if (OF_UNLIKELY(length <= i + 1 || ~string[i + 1] & 0x80))
+			@throw [OFInvalidEncodingException newWithObject: self];
+
+		/* Next byte is the start byte */
+		if (OF_LIKELY(string[i + 1] & 0x40)) {
+			string[i] ^= string[i + 1];
+			string[i + 1] ^= string[i];
+			string[i] ^= string[i + 1];
+
+			i++;
+			continue;
+		}
+
+		/* Second next byte must not be ASCII */
+		if (OF_UNLIKELY(length <= i + 2 || ~string[i + 2] & 0x80))
+			@throw [OFInvalidEncodingException newWithObject: self];
+
+		/* Second next byte is the start byte */
+		if (OF_LIKELY(string[i + 2] & 0x40)) {
+			string[i] ^= string[i + 2];
+			string[i + 2] ^= string[i];
+			string[i] ^= string[i + 2];
+
+			i += 2;
+			continue;
+		}
+
+		/* Third next byte must not be ASCII */
+		if (OF_UNLIKELY(length <= i + 3 || ~string[i + 3] & 0x80))
+			@throw [OFInvalidEncodingException newWithObject: self];
+
+		/* Third next byte is the start byte */
+		if (OF_LIKELY(string[i + 3] & 0x40)) {
+			string[i] ^= string[i + 3];
+			string[i + 3] ^= string[i];
+			string[i] ^= string[i + 3];
+
+			string[i + 1] ^= string[i + 2];
+			string[i + 2] ^= string[i + 1];
+			string[i + 1] ^= string[i + 2];
+
+			i += 3;
+			continue;
+		}
+
+		/* UTF-8 does not allow more than 4 bytes per character */
+		@throw [OFInvalidEncodingException newWithObject: self];
+	}
 
 	return self;
 }
 
 - upper
 {
 	size_t i = length;
 
+	if (is_utf8)
+		@throw [OFInvalidEncodingException newWithObject: self];
+
 	while (i--)
 		string[i] = toupper(string[i]);
 
 	return self;
 }
@@ -133,11 +293,14 @@
 
 - lower
 {
 	size_t i = length;
 
+	if (is_utf8)
+		@throw [OFInvalidEncodingException newWithObject: self];
+
 	while (i--)
 		string[i] = tolower(string[i]);
 
 	return self;
 }
 @end
Index: tests/OFString/OFString.m
==================================================================
--- tests/OFString/OFString.m
+++ tests/OFString/OFString.m
@@ -13,10 +13,11 @@
 
 #import
 #import
 
 #import "OFString.h"
+#import "OFExceptions.h"
 
 int
 main()
 {
 	OFString *s1 = [OFString newFromCString: "test"];
@@ -81,8 +82,35 @@
 	/* Also clears all the memory of the returned C strings */
 	[s1 free];
 	[s2 free];
 	[s3 free];
 	[s4 free];
+
+	/* UTF-8 tests */
+	@try {
+		s1 = [OFString newFromCString: "\xE0\x80"];
+
+		puts("First invalid UTF-8 not detected!");
+		return 1;
+	} @catch (OFInvalidEncodingException *e) {
+		puts("First invalid UTF-8 successfully detected!");
+	}
+
+	@try {
+		s1 = [OFString newFromCString: "\xF0\x80\x80\xC0"];
+
+		puts("Second invalid UTF-8 not detected!");
+		return 1;
+	} @catch (OFInvalidEncodingException *e) {
+		puts("Second invalid UTF-8 successfully detected!");
+	}
+
+	s1 = [OFString newFromCString: "äöü€𝄞"];
+	if (!strcmp([[s1 reverse] cString], "𝄞€üöä"))
+		puts("Reversed UTF-8 string is expected string! GOOD!");
+	else {
+		puts("Reversed UTF-8 string is NOT expected string!");
+		return 1;
+	}
 
 	return 0;
 }