ObjFW  Artifact [9492a9e3d8]

Artifact 9492a9e3d81d1ea24988020118c2be5f4453f59f688dde68ab23c6cfb52e6d22:

  • File src/OFMutableString_UTF8.m — part of check-in [4d892e0db1] at 2013-07-04 20:49:52 on branch trunk — Fix -[replaceCharactersInRange:withString:].

    The resizing is now done before the memmove() if the new string is
    bigger and after the memmove() if the new string is shorter. The added
    comment explains why this is necessary.

    This also adds a test for -[replaceCharactersInRange:withString:] that
    makes the string bigger and another one that makes it smaller again. (user: js, size: 20151) [annotate] [blame] [check-ins using]


/*
 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013
 *   Jonathan Schleifer <js@webkeks.org>
 *
 * All rights reserved.
 *
 * This file is part of ObjFW. It may be distributed under the terms of the
 * Q Public License 1.0, which can be found in the file LICENSE.QPL included in
 * the packaging of this file.
 *
 * Alternatively, it may be distributed under the terms of the GNU General
 * Public License, either version 2 or 3, which can be found in the file
 * LICENSE.GPLv2 or LICENSE.GPLv3 respectively included in the packaging of this
 * file.
 */

#include "config.h"

#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#import "OFString.h"
#import "OFString_UTF8.h"
#import "OFMutableString_UTF8.h"

#import "OFInvalidArgumentException.h"
#import "OFInvalidEncodingException.h"
#import "OFInvalidFormatException.h"
#import "OFOutOfMemoryException.h"
#import "OFOutOfRangeException.h"

#import "autorelease.h"
#import "macros.h"

#import "of_asprintf.h"
#import "unicode.h"

@implementation OFMutableString_UTF8
+ (void)initialize
{
	if (self == [OFMutableString_UTF8 class])
		[self inheritMethodsFromClass: [OFString_UTF8 class]];
}

- initWithUTF8StringNoCopy: (char*)UTF8String
	      freeWhenDone: (bool)freeWhenDone
{
	@try {
		self = [self initWithUTF8String: UTF8String];
	} @finally {
		if (freeWhenDone)
			free(UTF8String);
	}

	return self;
}

- (void)OF_convertWithWordStartTable: (const of_unichar_t *const[])startTable
		     wordMiddleTable: (const of_unichar_t *const[])middleTable
		  wordStartTableSize: (size_t)startTableSize
		 wordMiddleTableSize: (size_t)middleTableSize
{
	of_unichar_t *unicodeString;
	size_t unicodeLen, newCStringLength;
	size_t i, j;
	char *newCString;
	bool isStart = true;

	if (!_s->isUTF8) {
		uint8_t t;
		const of_unichar_t *const *table;

		assert(startTableSize >= 1 && middleTableSize >= 1);

		_s->hashed = false;

		for (i = 0; i < _s->cStringLength; i++) {
			if (isStart)
				table = startTable;
			else
				table = middleTable;

			switch (_s->cString[i]) {
			case ' ':
			case '\t':
			case '\n':
			case '\r':
				isStart = true;
				break;
			default:
				isStart = false;
				break;
			}

			if ((t = table[0][(uint8_t)_s->cString[i]]) != 0)
				_s->cString[i] = t;
		}

		return;
	}

	unicodeLen = [self length];
	unicodeString = [self allocMemoryWithSize: sizeof(of_unichar_t)
					    count: unicodeLen];

	i = j = 0;
	newCStringLength = 0;

	while (i < _s->cStringLength) {
		const of_unichar_t *const *table;
		size_t tableSize;
		of_unichar_t c;
		size_t cLen;

		if (isStart) {
			table = startTable;
			tableSize = middleTableSize;
		} else {
			table = middleTable;
			tableSize = middleTableSize;
		}

		cLen = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c);

		if (cLen == 0 || c > 0x10FFFF) {
			[self freeMemory: unicodeString];
			@throw [OFInvalidEncodingException exception];
		}

		switch (c) {
		case ' ':
		case '\t':
		case '\n':
		case '\r':
			isStart = true;
			break;
		default:
			isStart = false;
			break;
		}

		if (c >> 8 < tableSize) {
			of_unichar_t tc = table[c >> 8][c & 0xFF];

			if (tc)
				c = tc;
		}
		unicodeString[j++] = c;

		if (c < 0x80)
			newCStringLength++;
		else if (c < 0x800)
			newCStringLength += 2;
		else if (c < 0x10000)
			newCStringLength += 3;
		else if (c < 0x110000)
			newCStringLength += 4;
		else {
			[self freeMemory: unicodeString];
			@throw [OFInvalidEncodingException exception];
		}

		i += cLen;
	}

	@try {
		newCString = [self allocMemoryWithSize: newCStringLength + 1];
	} @catch (id e) {
		[self freeMemory: unicodeString];
		@throw e;
	}

	j = 0;

	for (i = 0; i < unicodeLen; i++) {
		size_t d;

		if ((d = of_string_utf8_encode(unicodeString[i],
		    newCString + j)) == 0) {
			[self freeMemory: unicodeString];
			[self freeMemory: newCString];
			@throw [OFInvalidEncodingException exception];
		}
		j += d;
	}

	assert(j == newCStringLength);
	newCString[j] = 0;
	[self freeMemory: unicodeString];

	[self freeMemory: _s->cString];
	_s->hashed = false;
	_s->cString = newCString;
	_s->cStringLength = newCStringLength;

	/*
	 * Even though cStringLength can change, length cannot, therefore no
	 * need to change it.
	 */
}

- (void)setCharacter: (of_unichar_t)character
	     atIndex: (size_t)index
{
	char buffer[4];
	of_unichar_t c;
	size_t lenNew, lenOld;

	if (_s->isUTF8)
		index = of_string_utf8_get_position(_s->cString, index,
		    _s->cStringLength);

	if (index > _s->cStringLength)
		@throw [OFOutOfRangeException exception];

	/* Shortcut if old and new character both are ASCII */
	if (!(character & 0x80) && !(_s->cString[index] & 0x80)) {
		_s->hashed = false;
		_s->cString[index] = character;
		return;
	}

	if ((lenNew = of_string_utf8_encode(character, buffer)) == 0)
		@throw [OFInvalidEncodingException exception];

	if ((lenOld = of_string_utf8_decode(_s->cString + index,
	    _s->cStringLength - index, &c)) == 0)
		@throw [OFInvalidEncodingException exception];

	_s->hashed = false;

	if (lenNew == lenOld)
		memcpy(_s->cString + index, buffer, lenNew);
	else if (lenNew > lenOld) {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength -
						  lenOld + lenNew + 1];

		memmove(_s->cString + index + lenNew,
		    _s->cString + index + lenOld,
		    _s->cStringLength - index - lenOld);
		memcpy(_s->cString + index, buffer, lenNew);

		_s->cStringLength -= lenOld;
		_s->cStringLength += lenNew;
		_s->cString[_s->cStringLength] = '\0';

		if (character & 0x80)
			_s->isUTF8 = true;
	} else if (lenNew < lenOld) {
		memmove(_s->cString + index + lenNew,
		    _s->cString + index + lenOld,
		    _s->cStringLength - index - lenOld);
		memcpy(_s->cString + index, buffer, lenNew);

		_s->cStringLength -= lenOld;
		_s->cStringLength += lenNew;
		_s->cString[_s->cStringLength] = '\0';

		@try {
			_s->cString = [self
			    resizeMemory: _s->cString
				    size: _s->cStringLength + 1];
		} @catch (OFOutOfMemoryException *e) {
			/* We don't really care, as we only made it smaller */
		}
	} else
		assert(0);
}

- (void)appendUTF8String: (const char*)UTF8String
{
	size_t UTF8StringLength = strlen(UTF8String);
	size_t length;

	if (UTF8StringLength >= 3 && !memcmp(UTF8String, "\xEF\xBB\xBF", 3)) {
		UTF8String += 3;
		UTF8StringLength -= 3;
	}

	switch (of_string_utf8_check(UTF8String, UTF8StringLength, &length)) {
	case 1:
		_s->isUTF8 = true;
		break;
	case -1:
		@throw [OFInvalidEncodingException exception];
	}

	_s->hashed = false;
	_s->cString = [self resizeMemory: _s->cString
				    size: _s->cStringLength +
					  UTF8StringLength + 1];
	memcpy(_s->cString + _s->cStringLength, UTF8String,
	    UTF8StringLength + 1);

	_s->cStringLength += UTF8StringLength;
	_s->length += length;
}

- (void)appendUTF8String: (const char*)UTF8String
		  length: (size_t)UTF8StringLength
{
	size_t length;

	if (UTF8StringLength >= 3 && !memcmp(UTF8String, "\xEF\xBB\xBF", 3)) {
		UTF8String += 3;
		UTF8StringLength -= 3;
	}

	switch (of_string_utf8_check(UTF8String, UTF8StringLength, &length)) {
	case 1:
		_s->isUTF8 = true;
		break;
	case -1:
		@throw [OFInvalidEncodingException exception];
	}

	_s->hashed = false;
	_s->cString = [self resizeMemory: _s->cString
				    size: _s->cStringLength +
					  UTF8StringLength + 1];
	memcpy(_s->cString + _s->cStringLength, UTF8String, UTF8StringLength);

	_s->cStringLength += UTF8StringLength;
	_s->length += length;

	_s->cString[_s->cStringLength] = 0;
}

- (void)appendCString: (const char*)cString
	     encoding: (of_string_encoding_t)encoding
{
	return [self appendCString: cString
			  encoding: encoding
			    length: strlen(cString)];
}

- (void)appendCString: (const char*)cString
	     encoding: (of_string_encoding_t)encoding
	       length: (size_t)cStringLength
{
	if (encoding == OF_STRING_ENCODING_UTF_8)
		[self appendUTF8String: cString
				length: cStringLength];
	else {
		void *pool = objc_autoreleasePoolPush();
		[self appendString:
		    [OFString stringWithCString: cString
				       encoding: encoding
					 length: cStringLength]];
		objc_autoreleasePoolPop(pool);
	}
}

- (void)appendString: (OFString*)string
{
	size_t UTF8StringLength;

	if (string == nil)
		@throw [OFInvalidArgumentException exception];

	UTF8StringLength = [string UTF8StringLength];

	_s->hashed = false;
	_s->cString = [self resizeMemory: _s->cString
				    size: _s->cStringLength +
					  UTF8StringLength + 1];
	memcpy(_s->cString + _s->cStringLength, [string UTF8String],
	    UTF8StringLength);

	_s->cStringLength += UTF8StringLength;
	_s->length += [string length];

	_s->cString[_s->cStringLength] = 0;

	if ([string isKindOfClass: [OFString_UTF8 class]] ||
	    [string isKindOfClass: [OFMutableString_UTF8 class]]) {
		if (((OFString_UTF8*)string)->_s->isUTF8)
			_s->isUTF8 = true;
	} else
		_s->isUTF8 = true;
}

- (void)appendCharacters: (of_unichar_t*)characters
		  length: (size_t)length
{
	char *tmp;

	tmp = [self allocMemoryWithSize: (length * 4) + 1];
	@try {
		size_t i, j = 0;
		bool isUTF8 = false;

		for (i = 0; i < length; i++) {
			char buffer[4];

			switch (of_string_utf8_encode(characters[i], buffer)) {
			case 1:
				tmp[j++] = buffer[0];
				break;
			case 2:
				isUTF8 = true;

				memcpy(tmp + j, buffer, 2);
				j += 2;

				break;
			case 3:
				isUTF8 = true;

				memcpy(tmp + j, buffer, 3);
				j += 3;

				break;
			case 4:
				isUTF8 = true;

				memcpy(tmp + j, buffer, 4);
				j += 4;

				break;
			default:
				@throw [OFInvalidEncodingException exception];
			}
		}

		tmp[j] = '\0';

		_s->hashed = false;
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength + j + 1];
		memcpy(_s->cString + _s->cStringLength, tmp, j + 1);

		_s->cStringLength += j;
		_s->length += length;

		if (isUTF8)
			_s->isUTF8 = true;
	} @finally {
		[self freeMemory: tmp];
	}
}

- (void)appendFormat: (OFConstantString*)format
	   arguments: (va_list)arguments
{
	char *UTF8String;
	int UTF8StringLength;

	if (format == nil)
		@throw [OFInvalidArgumentException exception];

	if ((UTF8StringLength = of_vasprintf(&UTF8String, [format UTF8String],
	    arguments)) == -1)
		@throw [OFInvalidFormatException exception];

	@try {
		[self appendUTF8String: UTF8String
				length: UTF8StringLength];
	} @finally {
		free(UTF8String);
	}
}

- (void)reverse
{
	size_t i, j;

	_s->hashed = false;

	/* We reverse all bytes and restore UTF-8 later, if necessary */
	for (i = 0, j = _s->cStringLength - 1; i < _s->cStringLength / 2;
	    i++, j--) {
		_s->cString[i] ^= _s->cString[j];
		_s->cString[j] ^= _s->cString[i];
		_s->cString[i] ^= _s->cString[j];
	}

	if (!_s->isUTF8)
		return;

	for (i = 0; i < _s->cStringLength; i++) {
		/* ASCII */
		if OF_LIKELY (!(_s->cString[i] & 0x80))
			continue;

		/* A start byte can't happen first as we reversed everything */
		if OF_UNLIKELY (_s->cString[i] & 0x40)
			@throw [OFInvalidEncodingException exception];

		/* Next byte must not be ASCII */
		if OF_UNLIKELY (_s->cStringLength < i + 1 ||
		    !(_s->cString[i + 1] & 0x80))
			@throw [OFInvalidEncodingException exception];

		/* Next byte is the start byte */
		if OF_LIKELY (_s->cString[i + 1] & 0x40) {
			_s->cString[i] ^= _s->cString[i + 1];
			_s->cString[i + 1] ^= _s->cString[i];
			_s->cString[i] ^= _s->cString[i + 1];

			i++;
			continue;
		}

		/* Second next byte must not be ASCII */
		if OF_UNLIKELY (_s->cStringLength < i + 2 ||
		    !(_s->cString[i + 2] & 0x80))
			@throw [OFInvalidEncodingException exception];

		/* Second next byte is the start byte */
		if OF_LIKELY (_s->cString[i + 2] & 0x40) {
			_s->cString[i] ^= _s->cString[i + 2];
			_s->cString[i + 2] ^= _s->cString[i];
			_s->cString[i] ^= _s->cString[i + 2];

			i += 2;
			continue;
		}

		/* Third next byte must not be ASCII */
		if OF_UNLIKELY (_s->cStringLength < i + 3 ||
		    !(_s->cString[i + 3] & 0x80))
			@throw [OFInvalidEncodingException exception];

		/* Third next byte is the start byte */
		if OF_LIKELY (_s->cString[i + 3] & 0x40) {
			_s->cString[i] ^= _s->cString[i + 3];
			_s->cString[i + 3] ^= _s->cString[i];
			_s->cString[i] ^= _s->cString[i + 3];

			_s->cString[i + 1] ^= _s->cString[i + 2];
			_s->cString[i + 2] ^= _s->cString[i + 1];
			_s->cString[i + 1] ^= _s->cString[i + 2];

			i += 3;
			continue;
		}

		/* UTF-8 does not allow more than 4 bytes per character */
		@throw [OFInvalidEncodingException exception];
	}
}

- (void)insertString: (OFString*)string
	     atIndex: (size_t)index
{
	size_t newCStringLength;

	if (index > _s->length)
		@throw [OFOutOfRangeException exception];

	if (_s->isUTF8)
		index = of_string_utf8_get_position(_s->cString, index,
		    _s->cStringLength);

	newCStringLength = _s->cStringLength + [string UTF8StringLength];
	_s->hashed = false;
	_s->cString = [self resizeMemory: _s->cString
				    size: newCStringLength + 1];

	memmove(_s->cString + index + [string UTF8StringLength],
	    _s->cString + index, _s->cStringLength - index);
	memcpy(_s->cString + index, [string UTF8String],
	    [string UTF8StringLength]);
	_s->cString[newCStringLength] = '\0';

	_s->cStringLength = newCStringLength;
	_s->length += [string length];

	if ([string isKindOfClass: [OFString_UTF8 class]] ||
	    [string isKindOfClass: [OFMutableString_UTF8 class]]) {
		if (((OFString_UTF8*)string)->_s->isUTF8)
			_s->isUTF8 = true;
	} else
		_s->isUTF8 = true;
}

- (void)deleteCharactersInRange: (of_range_t)range
{
	size_t start = range.location;
	size_t end = range.location + range.length;

	if (range.length > SIZE_MAX - range.location || end > _s->length)
		@throw [OFOutOfRangeException exception];

	if (_s->isUTF8) {
		start = of_string_utf8_get_position(_s->cString, start,
		    _s->cStringLength);
		end = of_string_utf8_get_position(_s->cString, end,
		    _s->cStringLength);
	}

	memmove(_s->cString + start, _s->cString + end,
	    _s->cStringLength - end);
	_s->hashed = false;
	_s->length -= range.length;
	_s->cStringLength -= end - start;
	_s->cString[_s->cStringLength] = 0;

	@try {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength + 1];
	} @catch (OFOutOfMemoryException *e) {
		/* We don't really care, as we only made it smaller */
	}
}

- (void)replaceCharactersInRange: (of_range_t)range
		      withString: (OFString*)replacement
{
	size_t start = range.location;
	size_t end = range.location + range.length;
	size_t newCStringLength, newLength;

	if (range.length > SIZE_MAX - range.location || end > _s->length)
		@throw [OFOutOfRangeException exception];

	newLength = _s->length - range.length + [replacement length];

	if (_s->isUTF8) {
		start = of_string_utf8_get_position(_s->cString, start,
		    _s->cStringLength);
		end = of_string_utf8_get_position(_s->cString, end,
		    _s->cStringLength);
	}

	newCStringLength = _s->cStringLength - (end - start) +
	    [replacement UTF8StringLength];
	_s->hashed = false;

	/*
	 * If the new string is bigger, we need to resize it first so we can
	 * memmove() the rest of the string to the end.
	 *
	 * We must not resize the string if the new string is smaller, because
	 * then we can't memove() the rest of the string forward as the rest is
	 * lost due to the resize!
	 */
	if (newCStringLength > _s->cStringLength)
		_s->cString = [self resizeMemory: _s->cString
					    size: newCStringLength + 1];

	memmove(_s->cString + start + [replacement UTF8StringLength],
	    _s->cString + end, _s->cStringLength - end);
	memcpy(_s->cString + start, [replacement UTF8String],
	    [replacement UTF8StringLength]);
	_s->cString[newCStringLength] = '\0';

	/*
	 * If the new string is smaller, we can safely resize it now as we're
	 * done with memmove().
	 */
	if (newCStringLength < _s->cStringLength)
		_s->cString = [self resizeMemory: _s->cString
					    size: newCStringLength + 1];

	_s->cStringLength = newCStringLength;
	_s->length = newLength;
}

- (void)replaceOccurrencesOfString: (OFString*)string
			withString: (OFString*)replacement
			   options: (int)options
			     range: (of_range_t)range
{
	const char *searchString = [string UTF8String];
	const char *replacementString = [replacement UTF8String];
	size_t searchLength = [string UTF8StringLength];
	size_t replacementLength = [replacement UTF8StringLength];
	size_t i, last, newCStringLength, newLength;
	char *newCString;

	if (range.length > SIZE_MAX - range.location ||
	    range.location + range.length > [self length])
		@throw [OFOutOfRangeException exception];

	if (_s->isUTF8) {
		range.location = of_string_utf8_get_position(_s->cString,
		    range.location, _s->cStringLength);
		range.length = of_string_utf8_get_position(
		    _s->cString + range.location, range.length,
		    _s->cStringLength - range.location);
	}

	if ([string UTF8StringLength] > range.length)
		return;

	newCString = NULL;
	newCStringLength = 0;
	newLength = _s->length;

	last = 0;
	for (i = range.location; i <= range.length - searchLength; i++) {
		if (memcmp(_s->cString + i, searchString, searchLength))
			continue;

		@try {
			newCString = [self
			    resizeMemory: newCString
				    size: newCStringLength + i - last +
					  replacementLength + 1];
		} @catch (id e) {
			[self freeMemory: newCString];
			@throw e;
		}
		memcpy(newCString + newCStringLength, _s->cString + last,
		    i - last);
		memcpy(newCString + newCStringLength + i - last,
		    replacementString, replacementLength);

		newCStringLength += i - last + replacementLength;
		newLength = newLength - [string length] + [replacement length];

		i += searchLength - 1;
		last = i + 1;
	}

	@try {
		newCString = [self resizeMemory: newCString
					   size: newCStringLength +
						 _s->cStringLength - last + 1];
	} @catch (id e) {
		[self freeMemory: newCString];
		@throw e;
	}
	memcpy(newCString + newCStringLength, _s->cString + last,
	    _s->cStringLength - last);
	newCStringLength += _s->cStringLength - last;
	newCString[newCStringLength] = 0;

	[self freeMemory: _s->cString];
	_s->hashed = false;
	_s->cString = newCString;
	_s->cStringLength = newCStringLength;
	_s->length = newLength;
}

- (void)deleteLeadingWhitespaces
{
	size_t i;

	for (i = 0; i < _s->cStringLength; i++)
		if (_s->cString[i] != ' '  && _s->cString[i] != '\t' &&
		    _s->cString[i] != '\n' && _s->cString[i] != '\r' &&
		    _s->cString[i] != '\f')
			break;

	_s->hashed = false;
	_s->cStringLength -= i;
	_s->length -= i;

	memmove(_s->cString, _s->cString + i, _s->cStringLength);
	_s->cString[_s->cStringLength] = '\0';

	@try {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength + 1];
	} @catch (OFOutOfMemoryException *e) {
		/* We don't really care, as we only made it smaller */
	}
}

- (void)deleteTrailingWhitespaces
{
	size_t d;
	char *p;

	_s->hashed = false;

	d = 0;
	for (p = _s->cString + _s->cStringLength - 1; p >= _s->cString; p--) {
		if (*p != ' ' && *p != '\t' && *p != '\n' && *p != '\r' &&
		    *p != '\f')
			break;

		*p = '\0';
		d++;
	}

	_s->cStringLength -= d;
	_s->length -= d;

	@try {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength + 1];
	} @catch (OFOutOfMemoryException *e) {
		/* We don't really care, as we only made it smaller */
	}
}

- (void)deleteEnclosingWhitespaces
{
	size_t d, i;
	char *p;

	_s->hashed = false;

	d = 0;
	for (p = _s->cString + _s->cStringLength - 1; p >= _s->cString; p--) {
		if (*p != ' ' && *p != '\t' && *p != '\n' && *p != '\r' &&
		    *p != '\f')
			break;

		*p = '\0';
		d++;
	}

	_s->cStringLength -= d;
	_s->length -= d;

	for (i = 0; i < _s->cStringLength; i++)
		if (_s->cString[i] != ' '  && _s->cString[i] != '\t' &&
		    _s->cString[i] != '\n' && _s->cString[i] != '\r' &&
		    _s->cString[i] != '\f')
			break;

	_s->cStringLength -= i;
	_s->length -= i;

	memmove(_s->cString, _s->cString + i, _s->cStringLength);
	_s->cString[_s->cStringLength] = '\0';

	@try {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength + 1];
	} @catch (OFOutOfMemoryException *e) {
		/* We don't really care, as we only made it smaller */
	}
}

- (void)makeImmutable
{
	object_setClass(self, [OFString_UTF8 class]);
}
@end