Artifact 9492a9e3d81d1ea24988020118c2be5f4453f59f688dde68ab23c6cfb52e6d22:
- File
src/OFMutableString_UTF8.m
— part of check-in
[4d892e0db1]
at
2013-07-04 20:49:52
on branch trunk
— Fix -[replaceCharactersInRange:withString:].
The resizing is now done before the memmove() if the new string is
bigger and after the memmove() if the new string is shorter. The added
comment explains why this is necessary.This also adds a test for -[replaceCharactersInRange:withString:] that
makes the string bigger and another one that makes it smaller again. (user: js, size: 20151) [annotate] [blame] [check-ins using]
/* * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 * Jonathan Schleifer <js@webkeks.org> * * All rights reserved. * * This file is part of ObjFW. It may be distributed under the terms of the * Q Public License 1.0, which can be found in the file LICENSE.QPL included in * the packaging of this file. * * Alternatively, it may be distributed under the terms of the GNU General * Public License, either version 2 or 3, which can be found in the file * LICENSE.GPLv2 or LICENSE.GPLv3 respectively included in the packaging of this * file. */ #include "config.h" #include <stdarg.h> #include <stdlib.h> #include <string.h> #include <assert.h> #import "OFString.h" #import "OFString_UTF8.h" #import "OFMutableString_UTF8.h" #import "OFInvalidArgumentException.h" #import "OFInvalidEncodingException.h" #import "OFInvalidFormatException.h" #import "OFOutOfMemoryException.h" #import "OFOutOfRangeException.h" #import "autorelease.h" #import "macros.h" #import "of_asprintf.h" #import "unicode.h" @implementation OFMutableString_UTF8 + (void)initialize { if (self == [OFMutableString_UTF8 class]) [self inheritMethodsFromClass: [OFString_UTF8 class]]; } - initWithUTF8StringNoCopy: (char*)UTF8String freeWhenDone: (bool)freeWhenDone { @try { self = [self initWithUTF8String: UTF8String]; } @finally { if (freeWhenDone) free(UTF8String); } return self; } - (void)OF_convertWithWordStartTable: (const of_unichar_t *const[])startTable wordMiddleTable: (const of_unichar_t *const[])middleTable wordStartTableSize: (size_t)startTableSize wordMiddleTableSize: (size_t)middleTableSize { of_unichar_t *unicodeString; size_t unicodeLen, newCStringLength; size_t i, j; char *newCString; bool isStart = true; if (!_s->isUTF8) { uint8_t t; const of_unichar_t *const *table; assert(startTableSize >= 1 && middleTableSize >= 1); _s->hashed = false; for (i = 0; i < _s->cStringLength; i++) { if (isStart) table = startTable; else table = middleTable; switch (_s->cString[i]) { case ' ': case '\t': case '\n': case '\r': isStart = true; break; default: isStart = false; break; } if ((t = table[0][(uint8_t)_s->cString[i]]) != 0) _s->cString[i] = t; } return; } unicodeLen = [self length]; unicodeString = [self allocMemoryWithSize: sizeof(of_unichar_t) count: unicodeLen]; i = j = 0; newCStringLength = 0; while (i < _s->cStringLength) { const of_unichar_t *const *table; size_t tableSize; of_unichar_t c; size_t cLen; if (isStart) { table = startTable; tableSize = middleTableSize; } else { table = middleTable; tableSize = middleTableSize; } cLen = of_string_utf8_decode(_s->cString + i, _s->cStringLength - i, &c); if (cLen == 0 || c > 0x10FFFF) { [self freeMemory: unicodeString]; @throw [OFInvalidEncodingException exception]; } switch (c) { case ' ': case '\t': case '\n': case '\r': isStart = true; break; default: isStart = false; break; } if (c >> 8 < tableSize) { of_unichar_t tc = table[c >> 8][c & 0xFF]; if (tc) c = tc; } unicodeString[j++] = c; if (c < 0x80) newCStringLength++; else if (c < 0x800) newCStringLength += 2; else if (c < 0x10000) newCStringLength += 3; else if (c < 0x110000) newCStringLength += 4; else { [self freeMemory: unicodeString]; @throw [OFInvalidEncodingException exception]; } i += cLen; } @try { newCString = [self allocMemoryWithSize: newCStringLength + 1]; } @catch (id e) { [self freeMemory: unicodeString]; @throw e; } j = 0; for (i = 0; i < unicodeLen; i++) { size_t d; if ((d = of_string_utf8_encode(unicodeString[i], newCString + j)) == 0) { [self freeMemory: unicodeString]; [self freeMemory: newCString]; @throw [OFInvalidEncodingException exception]; } j += d; } assert(j == newCStringLength); newCString[j] = 0; [self freeMemory: unicodeString]; [self freeMemory: _s->cString]; _s->hashed = false; _s->cString = newCString; _s->cStringLength = newCStringLength; /* * Even though cStringLength can change, length cannot, therefore no * need to change it. */ } - (void)setCharacter: (of_unichar_t)character atIndex: (size_t)index { char buffer[4]; of_unichar_t c; size_t lenNew, lenOld; if (_s->isUTF8) index = of_string_utf8_get_position(_s->cString, index, _s->cStringLength); if (index > _s->cStringLength) @throw [OFOutOfRangeException exception]; /* Shortcut if old and new character both are ASCII */ if (!(character & 0x80) && !(_s->cString[index] & 0x80)) { _s->hashed = false; _s->cString[index] = character; return; } if ((lenNew = of_string_utf8_encode(character, buffer)) == 0) @throw [OFInvalidEncodingException exception]; if ((lenOld = of_string_utf8_decode(_s->cString + index, _s->cStringLength - index, &c)) == 0) @throw [OFInvalidEncodingException exception]; _s->hashed = false; if (lenNew == lenOld) memcpy(_s->cString + index, buffer, lenNew); else if (lenNew > lenOld) { _s->cString = [self resizeMemory: _s->cString size: _s->cStringLength - lenOld + lenNew + 1]; memmove(_s->cString + index + lenNew, _s->cString + index + lenOld, _s->cStringLength - index - lenOld); memcpy(_s->cString + index, buffer, lenNew); _s->cStringLength -= lenOld; _s->cStringLength += lenNew; _s->cString[_s->cStringLength] = '\0'; if (character & 0x80) _s->isUTF8 = true; } else if (lenNew < lenOld) { memmove(_s->cString + index + lenNew, _s->cString + index + lenOld, _s->cStringLength - index - lenOld); memcpy(_s->cString + index, buffer, lenNew); _s->cStringLength -= lenOld; _s->cStringLength += lenNew; _s->cString[_s->cStringLength] = '\0'; @try { _s->cString = [self resizeMemory: _s->cString size: _s->cStringLength + 1]; } @catch (OFOutOfMemoryException *e) { /* We don't really care, as we only made it smaller */ } } else assert(0); } - (void)appendUTF8String: (const char*)UTF8String { size_t UTF8StringLength = strlen(UTF8String); size_t length; if (UTF8StringLength >= 3 && !memcmp(UTF8String, "\xEF\xBB\xBF", 3)) { UTF8String += 3; UTF8StringLength -= 3; } switch (of_string_utf8_check(UTF8String, UTF8StringLength, &length)) { case 1: _s->isUTF8 = true; break; case -1: @throw [OFInvalidEncodingException exception]; } _s->hashed = false; _s->cString = [self resizeMemory: _s->cString size: _s->cStringLength + UTF8StringLength + 1]; memcpy(_s->cString + _s->cStringLength, UTF8String, UTF8StringLength + 1); _s->cStringLength += UTF8StringLength; _s->length += length; } - (void)appendUTF8String: (const char*)UTF8String length: (size_t)UTF8StringLength { size_t length; if (UTF8StringLength >= 3 && !memcmp(UTF8String, "\xEF\xBB\xBF", 3)) { UTF8String += 3; UTF8StringLength -= 3; } switch (of_string_utf8_check(UTF8String, UTF8StringLength, &length)) { case 1: _s->isUTF8 = true; break; case -1: @throw [OFInvalidEncodingException exception]; } _s->hashed = false; _s->cString = [self resizeMemory: _s->cString size: _s->cStringLength + UTF8StringLength + 1]; memcpy(_s->cString + _s->cStringLength, UTF8String, UTF8StringLength); _s->cStringLength += UTF8StringLength; _s->length += length; _s->cString[_s->cStringLength] = 0; } - (void)appendCString: (const char*)cString encoding: (of_string_encoding_t)encoding { return [self appendCString: cString encoding: encoding length: strlen(cString)]; } - (void)appendCString: (const char*)cString encoding: (of_string_encoding_t)encoding length: (size_t)cStringLength { if (encoding == OF_STRING_ENCODING_UTF_8) [self appendUTF8String: cString length: cStringLength]; else { void *pool = objc_autoreleasePoolPush(); [self appendString: [OFString stringWithCString: cString encoding: encoding length: cStringLength]]; objc_autoreleasePoolPop(pool); } } - (void)appendString: (OFString*)string { size_t UTF8StringLength; if (string == nil) @throw [OFInvalidArgumentException exception]; UTF8StringLength = [string UTF8StringLength]; _s->hashed = false; _s->cString = [self resizeMemory: _s->cString size: _s->cStringLength + UTF8StringLength + 1]; memcpy(_s->cString + _s->cStringLength, [string UTF8String], UTF8StringLength); _s->cStringLength += UTF8StringLength; _s->length += [string length]; _s->cString[_s->cStringLength] = 0; if ([string isKindOfClass: [OFString_UTF8 class]] || [string isKindOfClass: [OFMutableString_UTF8 class]]) { if (((OFString_UTF8*)string)->_s->isUTF8) _s->isUTF8 = true; } else _s->isUTF8 = true; } - (void)appendCharacters: (of_unichar_t*)characters length: (size_t)length { char *tmp; tmp = [self allocMemoryWithSize: (length * 4) + 1]; @try { size_t i, j = 0; bool isUTF8 = false; for (i = 0; i < length; i++) { char buffer[4]; switch (of_string_utf8_encode(characters[i], buffer)) { case 1: tmp[j++] = buffer[0]; break; case 2: isUTF8 = true; memcpy(tmp + j, buffer, 2); j += 2; break; case 3: isUTF8 = true; memcpy(tmp + j, buffer, 3); j += 3; break; case 4: isUTF8 = true; memcpy(tmp + j, buffer, 4); j += 4; break; default: @throw [OFInvalidEncodingException exception]; } } tmp[j] = '\0'; _s->hashed = false; _s->cString = [self resizeMemory: _s->cString size: _s->cStringLength + j + 1]; memcpy(_s->cString + _s->cStringLength, tmp, j + 1); _s->cStringLength += j; _s->length += length; if (isUTF8) _s->isUTF8 = true; } @finally { [self freeMemory: tmp]; } } - (void)appendFormat: (OFConstantString*)format arguments: (va_list)arguments { char *UTF8String; int UTF8StringLength; if (format == nil) @throw [OFInvalidArgumentException exception]; if ((UTF8StringLength = of_vasprintf(&UTF8String, [format UTF8String], arguments)) == -1) @throw [OFInvalidFormatException exception]; @try { [self appendUTF8String: UTF8String length: UTF8StringLength]; } @finally { free(UTF8String); } } - (void)reverse { size_t i, j; _s->hashed = false; /* We reverse all bytes and restore UTF-8 later, if necessary */ for (i = 0, j = _s->cStringLength - 1; i < _s->cStringLength / 2; i++, j--) { _s->cString[i] ^= _s->cString[j]; _s->cString[j] ^= _s->cString[i]; _s->cString[i] ^= _s->cString[j]; } if (!_s->isUTF8) return; for (i = 0; i < _s->cStringLength; i++) { /* ASCII */ if OF_LIKELY (!(_s->cString[i] & 0x80)) continue; /* A start byte can't happen first as we reversed everything */ if OF_UNLIKELY (_s->cString[i] & 0x40) @throw [OFInvalidEncodingException exception]; /* Next byte must not be ASCII */ if OF_UNLIKELY (_s->cStringLength < i + 1 || !(_s->cString[i + 1] & 0x80)) @throw [OFInvalidEncodingException exception]; /* Next byte is the start byte */ if OF_LIKELY (_s->cString[i + 1] & 0x40) { _s->cString[i] ^= _s->cString[i + 1]; _s->cString[i + 1] ^= _s->cString[i]; _s->cString[i] ^= _s->cString[i + 1]; i++; continue; } /* Second next byte must not be ASCII */ if OF_UNLIKELY (_s->cStringLength < i + 2 || !(_s->cString[i + 2] & 0x80)) @throw [OFInvalidEncodingException exception]; /* Second next byte is the start byte */ if OF_LIKELY (_s->cString[i + 2] & 0x40) { _s->cString[i] ^= _s->cString[i + 2]; _s->cString[i + 2] ^= _s->cString[i]; _s->cString[i] ^= _s->cString[i + 2]; i += 2; continue; } /* Third next byte must not be ASCII */ if OF_UNLIKELY (_s->cStringLength < i + 3 || !(_s->cString[i + 3] & 0x80)) @throw [OFInvalidEncodingException exception]; /* Third next byte is the start byte */ if OF_LIKELY (_s->cString[i + 3] & 0x40) { _s->cString[i] ^= _s->cString[i + 3]; _s->cString[i + 3] ^= _s->cString[i]; _s->cString[i] ^= _s->cString[i + 3]; _s->cString[i + 1] ^= _s->cString[i + 2]; _s->cString[i + 2] ^= _s->cString[i + 1]; _s->cString[i + 1] ^= _s->cString[i + 2]; i += 3; continue; } /* UTF-8 does not allow more than 4 bytes per character */ @throw [OFInvalidEncodingException exception]; } } - (void)insertString: (OFString*)string atIndex: (size_t)index { size_t newCStringLength; if (index > _s->length) @throw [OFOutOfRangeException exception]; if (_s->isUTF8) index = of_string_utf8_get_position(_s->cString, index, _s->cStringLength); newCStringLength = _s->cStringLength + [string UTF8StringLength]; _s->hashed = false; _s->cString = [self resizeMemory: _s->cString size: newCStringLength + 1]; memmove(_s->cString + index + [string UTF8StringLength], _s->cString + index, _s->cStringLength - index); memcpy(_s->cString + index, [string UTF8String], [string UTF8StringLength]); _s->cString[newCStringLength] = '\0'; _s->cStringLength = newCStringLength; _s->length += [string length]; if ([string isKindOfClass: [OFString_UTF8 class]] || [string isKindOfClass: [OFMutableString_UTF8 class]]) { if (((OFString_UTF8*)string)->_s->isUTF8) _s->isUTF8 = true; } else _s->isUTF8 = true; } - (void)deleteCharactersInRange: (of_range_t)range { size_t start = range.location; size_t end = range.location + range.length; if (range.length > SIZE_MAX - range.location || end > _s->length) @throw [OFOutOfRangeException exception]; if (_s->isUTF8) { start = of_string_utf8_get_position(_s->cString, start, _s->cStringLength); end = of_string_utf8_get_position(_s->cString, end, _s->cStringLength); } memmove(_s->cString + start, _s->cString + end, _s->cStringLength - end); _s->hashed = false; _s->length -= range.length; _s->cStringLength -= end - start; _s->cString[_s->cStringLength] = 0; @try { _s->cString = [self resizeMemory: _s->cString size: _s->cStringLength + 1]; } @catch (OFOutOfMemoryException *e) { /* We don't really care, as we only made it smaller */ } } - (void)replaceCharactersInRange: (of_range_t)range withString: (OFString*)replacement { size_t start = range.location; size_t end = range.location + range.length; size_t newCStringLength, newLength; if (range.length > SIZE_MAX - range.location || end > _s->length) @throw [OFOutOfRangeException exception]; newLength = _s->length - range.length + [replacement length]; if (_s->isUTF8) { start = of_string_utf8_get_position(_s->cString, start, _s->cStringLength); end = of_string_utf8_get_position(_s->cString, end, _s->cStringLength); } newCStringLength = _s->cStringLength - (end - start) + [replacement UTF8StringLength]; _s->hashed = false; /* * If the new string is bigger, we need to resize it first so we can * memmove() the rest of the string to the end. * * We must not resize the string if the new string is smaller, because * then we can't memove() the rest of the string forward as the rest is * lost due to the resize! */ if (newCStringLength > _s->cStringLength) _s->cString = [self resizeMemory: _s->cString size: newCStringLength + 1]; memmove(_s->cString + start + [replacement UTF8StringLength], _s->cString + end, _s->cStringLength - end); memcpy(_s->cString + start, [replacement UTF8String], [replacement UTF8StringLength]); _s->cString[newCStringLength] = '\0'; /* * If the new string is smaller, we can safely resize it now as we're * done with memmove(). */ if (newCStringLength < _s->cStringLength) _s->cString = [self resizeMemory: _s->cString size: newCStringLength + 1]; _s->cStringLength = newCStringLength; _s->length = newLength; } - (void)replaceOccurrencesOfString: (OFString*)string withString: (OFString*)replacement options: (int)options range: (of_range_t)range { const char *searchString = [string UTF8String]; const char *replacementString = [replacement UTF8String]; size_t searchLength = [string UTF8StringLength]; size_t replacementLength = [replacement UTF8StringLength]; size_t i, last, newCStringLength, newLength; char *newCString; if (range.length > SIZE_MAX - range.location || range.location + range.length > [self length]) @throw [OFOutOfRangeException exception]; if (_s->isUTF8) { range.location = of_string_utf8_get_position(_s->cString, range.location, _s->cStringLength); range.length = of_string_utf8_get_position( _s->cString + range.location, range.length, _s->cStringLength - range.location); } if ([string UTF8StringLength] > range.length) return; newCString = NULL; newCStringLength = 0; newLength = _s->length; last = 0; for (i = range.location; i <= range.length - searchLength; i++) { if (memcmp(_s->cString + i, searchString, searchLength)) continue; @try { newCString = [self resizeMemory: newCString size: newCStringLength + i - last + replacementLength + 1]; } @catch (id e) { [self freeMemory: newCString]; @throw e; } memcpy(newCString + newCStringLength, _s->cString + last, i - last); memcpy(newCString + newCStringLength + i - last, replacementString, replacementLength); newCStringLength += i - last + replacementLength; newLength = newLength - [string length] + [replacement length]; i += searchLength - 1; last = i + 1; } @try { newCString = [self resizeMemory: newCString size: newCStringLength + _s->cStringLength - last + 1]; } @catch (id e) { [self freeMemory: newCString]; @throw e; } memcpy(newCString + newCStringLength, _s->cString + last, _s->cStringLength - last); newCStringLength += _s->cStringLength - last; newCString[newCStringLength] = 0; [self freeMemory: _s->cString]; _s->hashed = false; _s->cString = newCString; _s->cStringLength = newCStringLength; _s->length = newLength; } - (void)deleteLeadingWhitespaces { size_t i; for (i = 0; i < _s->cStringLength; i++) if (_s->cString[i] != ' ' && _s->cString[i] != '\t' && _s->cString[i] != '\n' && _s->cString[i] != '\r' && _s->cString[i] != '\f') break; _s->hashed = false; _s->cStringLength -= i; _s->length -= i; memmove(_s->cString, _s->cString + i, _s->cStringLength); _s->cString[_s->cStringLength] = '\0'; @try { _s->cString = [self resizeMemory: _s->cString size: _s->cStringLength + 1]; } @catch (OFOutOfMemoryException *e) { /* We don't really care, as we only made it smaller */ } } - (void)deleteTrailingWhitespaces { size_t d; char *p; _s->hashed = false; d = 0; for (p = _s->cString + _s->cStringLength - 1; p >= _s->cString; p--) { if (*p != ' ' && *p != '\t' && *p != '\n' && *p != '\r' && *p != '\f') break; *p = '\0'; d++; } _s->cStringLength -= d; _s->length -= d; @try { _s->cString = [self resizeMemory: _s->cString size: _s->cStringLength + 1]; } @catch (OFOutOfMemoryException *e) { /* We don't really care, as we only made it smaller */ } } - (void)deleteEnclosingWhitespaces { size_t d, i; char *p; _s->hashed = false; d = 0; for (p = _s->cString + _s->cStringLength - 1; p >= _s->cString; p--) { if (*p != ' ' && *p != '\t' && *p != '\n' && *p != '\r' && *p != '\f') break; *p = '\0'; d++; } _s->cStringLength -= d; _s->length -= d; for (i = 0; i < _s->cStringLength; i++) if (_s->cString[i] != ' ' && _s->cString[i] != '\t' && _s->cString[i] != '\n' && _s->cString[i] != '\r' && _s->cString[i] != '\f') break; _s->cStringLength -= i; _s->length -= i; memmove(_s->cString, _s->cString + i, _s->cStringLength); _s->cString[_s->cStringLength] = '\0'; @try { _s->cString = [self resizeMemory: _s->cString size: _s->cStringLength + 1]; } @catch (OFOutOfMemoryException *e) { /* We don't really care, as we only made it smaller */ } } - (void)makeImmutable { object_setClass(self, [OFString_UTF8 class]); } @end