Index: src/OFConstantString.m ================================================================== --- src/OFConstantString.m +++ src/OFConstantString.m @@ -53,11 +53,12 @@ memset(ivars, 0, sizeof(*ivars)); ivars->cString = (char*)s; ivars->cStringLength = initialized; - switch (of_string_check_utf8(ivars->cString, ivars->cStringLength)) { + switch (of_string_check_utf8(ivars->cString, ivars->cStringLength, + &ivars->length)) { case 1: ivars->isUTF8 = YES; break; case -1: free(ivars); Index: src/OFMutableString.h ================================================================== --- src/OFMutableString.h +++ src/OFMutableString.h @@ -57,34 +57,10 @@ */ - (void)appendCString: (const char*)cString withEncoding: (of_string_encoding_t)encoding length: (size_t)cStringLength; -/** - * \brief Appends a UTF-8 encoded C string to the OFMutableString without - * checking whether it is valid UTF-8. - * - * Only use this if you are 100% sure the string you append is either ASCII or - * UTF-8! - * - * \param cString A UTF-8 encoded C string to append - */ -- (void)appendCStringWithoutUTF8Checking: (const char*)cString; - -/** - * \brief Appends a UTF-8 encoded C string with the specified length to the - * OFMutableString without checking whether it is valid UTF-8. - * - * Only use this if you are 100% sure the string you append is either ASCII or - * UTF-8! - * - * \param cString A UTF-8 encoded C string to append - * \param cStringLength The length of the UTF-8 encoded C string - */ -- (void)appendCStringWithoutUTF8Checking: (const char*)cString - length: (size_t)cStringLength; - /** * \brief Appends another OFString to the OFMutableString. * * \param string An OFString to append */ Index: src/OFMutableString.m ================================================================== --- src/OFMutableString.m +++ src/OFMutableString.m @@ -129,22 +129,29 @@ [self freeMemory: unicodeString]; [self freeMemory: s->cString]; s->cString = newCString; s->cStringLength = newCStringLength; + + /* + * Even though cStringLength can change, length can not, therefore no + * need to change it. + */ } - (void)setToCString: (const char*)newCString { size_t newCStringLength = strlen(newCString); + size_t newLength; if (newCStringLength >= 3 && !memcmp(newCString, "\xEF\xBB\xBF", 3)) { newCString += 3; newCStringLength -= 3; } - switch (of_string_check_utf8(newCString, newCStringLength)) { + switch (of_string_check_utf8(newCString, newCStringLength, + &newLength)) { case 0: s->isUTF8 = NO; break; case 1: s->isUTF8 = YES; @@ -154,24 +161,27 @@ } [self freeMemory: s->cString]; s->cStringLength = newCStringLength; + s->length = newLength; + s->cString = [self allocMemoryWithSize: newCStringLength + 1]; memcpy(s->cString, newCString, newCStringLength + 1); } - (void)appendCString: (const char*)cString { size_t cStringLength = strlen(cString); + size_t length; if (cStringLength >= 3 && !memcmp(cString, "\xEF\xBB\xBF", 3)) { cString += 3; cStringLength -= 3; } - switch (of_string_check_utf8(cString, cStringLength)) { + switch (of_string_check_utf8(cString, cStringLength, &length)) { case 1: s->isUTF8 = YES; break; case -1: @throw [OFInvalidEncodingException newWithClass: isa]; @@ -178,22 +188,26 @@ } s->cString = [self resizeMemory: s->cString toSize: s->cStringLength + cStringLength + 1]; memcpy(s->cString + s->cStringLength, cString, cStringLength + 1); + s->cStringLength += cStringLength; + s->length += length; } - (void)appendCString: (const char*)cString withLength: (size_t)cStringLength { + size_t length; + if (cStringLength >= 3 && !memcmp(cString, "\xEF\xBB\xBF", 3)) { cString += 3; cStringLength -= 3; } - switch (of_string_check_utf8(cString, cStringLength)) { + switch (of_string_check_utf8(cString, cStringLength, &length)) { case 1: s->isUTF8 = YES; break; case -1: @throw [OFInvalidEncodingException newWithClass: isa]; @@ -200,11 +214,14 @@ } s->cString = [self resizeMemory: s->cString toSize: s->cStringLength + cStringLength + 1]; memcpy(s->cString + s->cStringLength, cString, cStringLength); + s->cStringLength += cStringLength; + s->length += length; + s->cString[s->cStringLength] = 0; } - (void)appendCString: (const char*)cString withEncoding: (of_string_encoding_t)encoding @@ -221,39 +238,29 @@ length: cStringLength]]; [pool release]; } } -- (void)appendCStringWithoutUTF8Checking: (const char*)cString +- (void)appendString: (OFString*)string { size_t cStringLength; - cStringLength = strlen(cString); - s->cString = [self resizeMemory: s->cString - toSize: s->cStringLength + cStringLength + 1]; - memcpy(s->cString + s->cStringLength, cString, cStringLength + 1); - s->cStringLength += cStringLength; -} - -- (void)appendCStringWithoutUTF8Checking: (const char*)cString - length: (size_t)cStringLength -{ - s->cString = [self resizeMemory: s->cString - toSize: s->cStringLength + cStringLength + 1]; - memcpy(s->cString + s->cStringLength, cString, cStringLength); - s->cStringLength += cStringLength; - s->cString[s->cStringLength] = 0; -} - -- (void)appendString: (OFString*)string -{ if (string == nil) @throw [OFInvalidArgumentException newWithClass: isa selector: _cmd]; - [self appendCStringWithoutUTF8Checking: [string cString] - length: [string cStringLength]]; + cStringLength = [string cStringLength]; + + s->cString = [self resizeMemory: s->cString + toSize: s->cStringLength + cStringLength + 1]; + memcpy(s->cString + s->cStringLength, string->s->cString, + cStringLength); + + s->cStringLength += cStringLength; + s->length += string->s->length; + + s->cString[s->cStringLength] = 0; if (string->s->isUTF8) s->isUTF8 = YES; } @@ -403,46 +410,50 @@ - (void)insertString: (OFString*)string atIndex: (size_t)index { size_t newCStringLength; + if (index > s->length) + @throw [OFOutOfRangeException newWithClass: isa]; + if (s->isUTF8) index = of_string_index_to_position(s->cString, index, s->cStringLength); - if (index > s->cStringLength) - @throw [OFOutOfRangeException newWithClass: isa]; - newCStringLength = s->cStringLength + [string cStringLength]; s->cString = [self resizeMemory: s->cString toSize: newCStringLength + 1]; - memmove(s->cString + index + [string cStringLength], s->cString + index, - s->cStringLength - index); - memcpy(s->cString + index, [string cString], [string cStringLength]); + memmove(s->cString + index + string->s->cStringLength, + s->cString + index, s->cStringLength - index); + memcpy(s->cString + index, string->s->cString, + string->s->cStringLength); s->cString[newCStringLength] = '\0'; s->cStringLength = newCStringLength; + s->length += string->s->length; } - (void)deleteCharactersFromIndex: (size_t)start toIndex: (size_t)end { + if (start > end) + @throw [OFInvalidArgumentException newWithClass: isa + selector: _cmd]; + + if (end > s->length) + @throw [OFOutOfRangeException newWithClass: isa]; + + s->length -= end - start; + if (s->isUTF8) { start = of_string_index_to_position(s->cString, start, s->cStringLength); end = of_string_index_to_position(s->cString, end, s->cStringLength); } - if (start > end) - @throw [OFInvalidArgumentException newWithClass: isa - selector: _cmd]; - - if (end > s->cStringLength) - @throw [OFOutOfRangeException newWithClass: isa]; - memmove(s->cString + start, s->cString + end, s->cStringLength - end); s->cStringLength -= end - start; s->cString[s->cStringLength] = 0; @try { @@ -462,38 +473,41 @@ - (void)replaceCharactersFromIndex: (size_t)start toIndex: (size_t)end withString: (OFString*)replacement { - size_t newCStringLength; + size_t newCStringLength, newLength; + + if (start > end) + @throw [OFInvalidArgumentException newWithClass: isa + selector: _cmd]; + + if (end > s->length) + @throw [OFOutOfRangeException newWithClass: isa]; + + newLength = s->length - (end - start) + [replacement length]; if (s->isUTF8) { start = of_string_index_to_position(s->cString, start, s->cStringLength); end = of_string_index_to_position(s->cString, end, s->cStringLength); } - if (start > end) - @throw [OFInvalidArgumentException newWithClass: isa - selector: _cmd]; - - if (end > s->cStringLength) - @throw [OFOutOfRangeException newWithClass: isa]; - newCStringLength = s->cStringLength - (end - start) + - [replacement cStringLength]; + replacement->s->cStringLength; s->cString = [self resizeMemory: s->cString toSize: newCStringLength + 1]; memmove(s->cString + end, s->cString + start + - [replacement cStringLength], s->cStringLength - end); - memcpy(s->cString + start, [replacement cString], - [replacement cStringLength]); + replacement->s->cStringLength, s->cStringLength - end); + memcpy(s->cString + start, replacement->s->cString, + replacement->s->cStringLength); s->cString[newCStringLength] = '\0'; s->cStringLength = newCStringLength; + s->length = newLength; } - (void)replaceCharactersInRange: (of_range_t)range withString: (OFString*)replacement { @@ -505,20 +519,21 @@ - (void)replaceOccurrencesOfString: (OFString*)string withString: (OFString*)replacement { const char *cString = [string cString]; const char *replacementCString = [replacement cString]; - size_t cStringLength = [string cStringLength]; - size_t replacementCStringLength = [replacement cStringLength]; - size_t i, last, newCStringLength; + size_t cStringLength = string->s->cStringLength; + size_t replacementCStringLength = replacement->s->cStringLength; + size_t i, last, newCStringLength, newLength; char *newCString; if (cStringLength > s->cStringLength) return; newCString = NULL; newCStringLength = 0; + newLength = s->length; for (i = 0, last = 0; i <= s->cStringLength - cStringLength; i++) { if (memcmp(s->cString + i, cString, cStringLength)) continue; @@ -533,11 +548,15 @@ } memcpy(newCString + newCStringLength, s->cString + last, i - last); memcpy(newCString + newCStringLength + i - last, replacementCString, replacementCStringLength); + newCStringLength += i - last + replacementCStringLength; + newLength = newLength - string->s->length + + replacement->s->length; + i += cStringLength - 1; last = i + 1; } @try { @@ -555,10 +574,11 @@ newCString[newCStringLength] = 0; [self freeMemory: s->cString]; s->cString = newCString; s->cStringLength = newCStringLength; + s->length = newLength; } - (void)deleteLeadingWhitespaces { size_t i; @@ -567,10 +587,12 @@ if (s->cString[i] != ' ' && s->cString[i] != '\t' && s->cString[i] != '\n' && s->cString[i] != '\r') break; s->cStringLength -= i; + s->length -= i; + memmove(s->cString, s->cString + i, s->cStringLength); s->cString[s->cStringLength] = '\0'; @try { s->cString = [self resizeMemory: s->cString @@ -594,10 +616,11 @@ *p = '\0'; d++; } s->cStringLength -= d; + s->length -= d; @try { s->cString = [self resizeMemory: s->cString toSize: s->cStringLength + 1]; } @catch (OFOutOfMemoryException *e) { @@ -619,17 +642,20 @@ *p = '\0'; d++; } s->cStringLength -= d; + s->length -= d; for (i = 0; i < s->cStringLength; i++) if (s->cString[i] != ' ' && s->cString[i] != '\t' && s->cString[i] != '\n' && s->cString[i] != '\r') break; s->cStringLength -= i; + s->length -= i; + memmove(s->cString, s->cString + i, s->cStringLength); s->cString[s->cStringLength] = '\0'; @try { s->cString = [self resizeMemory: s->cString Index: src/OFString.h ================================================================== --- src/OFString.h +++ src/OFString.h @@ -37,11 +37,11 @@ } of_string_encoding_t; #ifdef __cplusplus extern "C" { #endif -extern int of_string_check_utf8(const char*, size_t); +extern int of_string_check_utf8(const char*, size_t, size_t*); extern size_t of_string_unicode_to_utf8(of_unichar_t, char*); extern size_t of_string_utf8_to_unicode(const char*, size_t, of_unichar_t*); extern size_t of_string_position_to_index(const char*, size_t); extern size_t of_string_index_to_position(const char*, size_t, size_t); extern size_t of_unicode_string_length(const of_unichar_t*); @@ -76,10 +76,11 @@ */ struct of_string_ivars { char *cString; size_t cStringLength; BOOL isUTF8; + size_t length; } *restrict s; /* * Unused in OFString, however, OFConstantString sets this to SIZE_MAX * once it allocated and initialized the struct. */ Index: src/OFString.m ================================================================== --- src/OFString.m +++ src/OFString.m @@ -80,82 +80,88 @@ return OF_ORDERED_SAME; } int -of_string_check_utf8(const char *string, size_t length) +of_string_check_utf8(const char *cString, size_t cStringLength, size_t *length) { - size_t i; + size_t i, tmpLength = cStringLength; int isUTF8 = 0; - madvise((void*)string, length, MADV_SEQUENTIAL); + madvise((void*)cString, cStringLength, MADV_SEQUENTIAL); - for (i = 0; i < length; i++) { + for (i = 0; i < cStringLength; i++) { /* No sign of UTF-8 here */ - if (OF_LIKELY(!(string[i] & 0x80))) + if (OF_LIKELY(!(cString[i] & 0x80))) continue; isUTF8 = 1; /* We're missing a start byte here */ - if (OF_UNLIKELY(!(string[i] & 0x40))) { - madvise((void*)string, length, MADV_NORMAL); + if (OF_UNLIKELY(!(cString[i] & 0x40))) { + madvise((void*)cString, cStringLength, MADV_NORMAL); return -1; } /* 2 byte sequences for code points 0 - 127 are forbidden */ - if (OF_UNLIKELY((string[i] & 0x7E) == 0x40)) { - madvise((void*)string, length, MADV_NORMAL); + if (OF_UNLIKELY((cString[i] & 0x7E) == 0x40)) { + madvise((void*)cString, cStringLength, MADV_NORMAL); return -1; } /* We have at minimum a 2 byte character -> check next byte */ - if (OF_UNLIKELY(length <= i + 1 || - (string[i + 1] & 0xC0) != 0x80)) { - madvise((void*)string, length, MADV_NORMAL); + if (OF_UNLIKELY(cStringLength <= i + 1 || + (cString[i + 1] & 0xC0) != 0x80)) { + madvise((void*)cString, cStringLength, MADV_NORMAL); return -1; } /* Check if we have at minimum a 3 byte character */ - if (OF_LIKELY(!(string[i] & 0x20))) { + if (OF_LIKELY(!(cString[i] & 0x20))) { i++; + tmpLength--; continue; } /* We have at minimum a 3 byte char -> check second next byte */ - if (OF_UNLIKELY(length <= i + 2 || - (string[i + 2] & 0xC0) != 0x80)) { - madvise((void*)string, length, MADV_NORMAL); + if (OF_UNLIKELY(cStringLength <= i + 2 || + (cString[i + 2] & 0xC0) != 0x80)) { + madvise((void*)cString, cStringLength, MADV_NORMAL); return -1; } /* Check if we have a 4 byte character */ - if (OF_LIKELY(!(string[i] & 0x10))) { + if (OF_LIKELY(!(cString[i] & 0x10))) { i += 2; + tmpLength -= 2; continue; } /* We have a 4 byte character -> check third next byte */ - if (OF_UNLIKELY(length <= i + 3 || - (string[i + 3] & 0xC0) != 0x80)) { - madvise((void*)string, length, MADV_NORMAL); + if (OF_UNLIKELY(cStringLength <= i + 3 || + (cString[i + 3] & 0xC0) != 0x80)) { + madvise((void*)cString, cStringLength, MADV_NORMAL); return -1; } /* * Just in case, check if there's a 5th character, which is * forbidden by UTF-8 */ - if (OF_UNLIKELY(string[i] & 0x08)) { - madvise((void*)string, length, MADV_NORMAL); + if (OF_UNLIKELY(cString[i] & 0x08)) { + madvise((void*)cString, cStringLength, MADV_NORMAL); return -1; } i += 3; + tmpLength -= 3; } - madvise((void*)string, length, MADV_NORMAL); + madvise((void*)cString, cStringLength, MADV_NORMAL); + + if (length != NULL) + *length = tmpLength; return isUTF8; } size_t @@ -474,11 +480,12 @@ s->cString = [self allocMemoryWithSize: cStringLength + 1]; s->cStringLength = cStringLength; if (encoding == OF_STRING_ENCODING_UTF_8) { - switch (of_string_check_utf8(cString, cStringLength)) { + switch (of_string_check_utf8(cString, cStringLength, + &s->length)) { case 1: s->isUTF8 = YES; break; case -1: @throw [OFInvalidEncodingException @@ -488,10 +495,13 @@ memcpy(s->cString, cString, cStringLength); s->cString[cStringLength] = 0; return self; } + + /* All other encodings we support are single byte encodings */ + s->length = cStringLength; if (encoding == OF_STRING_ENCODING_ISO_8859_1) { for (i = j = 0; i < cStringLength; i++) { char buffer[4]; size_t bytes; @@ -589,15 +599,20 @@ @try { s = [self allocMemoryWithSize: sizeof(*s)]; memset(s, 0, sizeof(*s)); + /* + * We need one call to make sure it's initialized (in case it's + * a constant string). + */ s->cStringLength = [string cStringLength]; s->isUTF8 = string->s->isUTF8; + s->length = string->s->length; s->cString = [self allocMemoryWithSize: s->cStringLength + 1]; - memcpy(s->cString, [string cString], s->cStringLength + 1); + memcpy(s->cString, string->s->cString, s->cStringLength + 1); } @catch (id e) { [self release]; @throw e; } @@ -650,10 +665,11 @@ s = [self allocMemoryWithSize: sizeof(*s)]; memset(s, 0, sizeof(*s)); s->cStringLength = length; s->cString = [self allocMemoryWithSize: (length * 4) + 1]; + s->length = length; for (i = 0; i < length; i++) { char buffer[4]; size_t characterLen = of_string_unicode_to_utf8( (swap ? of_bswap32(string[i]) : string[i]), @@ -756,10 +772,11 @@ s = [self allocMemoryWithSize: sizeof(*s)]; memset(s, 0, sizeof(*s)); s->cStringLength = length; s->cString = [self allocMemoryWithSize: (length * 4) + 1]; + s->length = length; for (i = 0; i < length; i++) { char buffer[4]; of_unichar_t character = (swap ? of_bswap16(string[i]) : string[i]); @@ -783,10 +800,11 @@ character = (((character & 0x3FF) << 10) | (nextCharacter & 0x3FF)) + 0x10000; i++; s->cStringLength--; + s->length--; } characterLen = of_string_unicode_to_utf8( character, buffer); @@ -875,11 +893,11 @@ s->cStringLength = cStringLength; @try { switch (of_string_check_utf8(s->cString, - cStringLength)) { + cStringLength, &s->length)) { case 1: s->isUTF8 = YES; break; case -1: @throw [OFInvalidEncodingException @@ -928,17 +946,18 @@ * First needs to be a call to be sure it is initialized, in * case it's a constant string. */ s->cStringLength = [firstComponent cStringLength]; s->isUTF8 = firstComponent->s->isUTF8; + s->length = firstComponent->s->length; /* Calculate length and see if we need UTF-8 */ va_copy(argumentsCopy, arguments); while ((component = va_arg(argumentsCopy, OFString*)) != nil) { /* First needs to be a call, see above */ - cStringLength = [component cStringLength]; - s->cStringLength += 1 + cStringLength; + s->cStringLength += 1 + [component cStringLength]; + s->length += 1 + component->s->length; if (component->s->isUTF8) s->isUTF8 = YES; } @@ -1110,32 +1129,32 @@ return s->cString; } - (size_t)length { - /* FIXME: Maybe cache this in an ivar? */ - - if (!s->isUTF8) - return s->cStringLength; - - return of_string_position_to_index(s->cString, s->cStringLength); + return s->length; } - (size_t)cStringLength { return s->cStringLength; } - (BOOL)isEqual: (id)object { + OFString *otherString; + if (![object isKindOfClass: [OFString class]]) return NO; - if ([object cStringLength] != s->cStringLength) + otherString = object; + + if ([otherString cStringLength] != s->cStringLength || + otherString->s->length != s->length) return NO; - if (strcmp(s->cString, [object cString])) + if (strcmp(s->cString, otherString->s->cString)) return NO; return YES; } @@ -1303,23 +1322,19 @@ - (of_unichar_t)characterAtIndex: (size_t)index { of_unichar_t character; - if (!s->isUTF8) { - if (index >= s->cStringLength) - @throw [OFOutOfRangeException newWithClass: isa]; + if (index >= s->length) + @throw [OFOutOfRangeException newWithClass: isa]; + if (!s->isUTF8) return s->cString[index]; - } index = of_string_index_to_position(s->cString, index, s->cStringLength); - if (index >= s->cStringLength) - @throw [OFOutOfRangeException newWithClass: isa]; - if (!of_string_utf8_to_unicode(s->cString + index, s->cStringLength - index, &character)) @throw [OFInvalidEncodingException newWithClass: isa]; return character; @@ -1366,11 +1381,11 @@ } - (BOOL)containsString: (OFString*)string { const char *cString = [string cString]; - size_t i, cStringLength = [string cStringLength]; + size_t i, cStringLength = string->s->cStringLength; if (cStringLength == 0) return YES; if (cStringLength > s->cStringLength) @@ -1384,24 +1399,24 @@ } - (OFString*)substringFromIndex: (size_t)start toIndex: (size_t)end { + if (start > end) + @throw [OFInvalidArgumentException newWithClass: isa + selector: _cmd]; + + if (end > s->length) + @throw [OFOutOfRangeException newWithClass: isa]; + if (s->isUTF8) { start = of_string_index_to_position(s->cString, start, s->cStringLength); end = of_string_index_to_position(s->cString, end, s->cStringLength); } - if (start > end) - @throw [OFInvalidArgumentException newWithClass: isa - selector: _cmd]; - - if (end > s->cStringLength) - @throw [OFOutOfRangeException newWithClass: isa]; - return [OFString stringWithCString: s->cString + start length: end - start]; } - (OFString*)substringWithRange: (of_range_t)range @@ -1860,11 +1875,11 @@ { OFObject *object = [[[OFObject alloc] init] autorelease]; of_unichar_t *ret; size_t i, j; - ret = [object allocMemoryForNItems: [self length] + 2 + ret = [object allocMemoryForNItems: s->length + 2 withSize: sizeof(of_unichar_t)]; i = 0; j = 0;