ObjFW  Artifact [7fb8de4d28]

Artifact 7fb8de4d2864368db8c69e5a65e58d1cdb6c9f986b8b8e7a1205d2749a2df37e:

  • File src/OFMutableString_UTF8.m — part of check-in [3d16a30f41] at 2013-06-22 12:12:36 on branch trunk — Rework exceptions.

    This mostly removes the argument for the class in which the exception
    occurred. As backtraces were recently added for all platforms, the
    passed class does not give any extra information on where the exception
    occurred anymore.

    This also removes a few other arguments which were not too helpful. In
    the past, the idea was to pass as many arguments as possible so that it
    is easier to find the origin of the exception. However, as backtraces
    are a much better way to find the origin, those are not useful anymore
    and just make the exception more cumbersome to use. The rule is now to
    only pass arguments that might help in recovering from the exception or
    provide information that is otherwise not easily accessible. (user: js, size: 19565) [annotate] [blame] [check-ins using]


/*
 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013
 *   Jonathan Schleifer <js@webkeks.org>
 *
 * All rights reserved.
 *
 * This file is part of ObjFW. It may be distributed under the terms of the
 * Q Public License 1.0, which can be found in the file LICENSE.QPL included in
 * the packaging of this file.
 *
 * Alternatively, it may be distributed under the terms of the GNU General
 * Public License, either version 2 or 3, which can be found in the file
 * LICENSE.GPLv2 or LICENSE.GPLv3 respectively included in the packaging of this
 * file.
 */

#include "config.h"

#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#import "OFString.h"
#import "OFString_UTF8.h"
#import "OFMutableString_UTF8.h"

#import "OFInvalidArgumentException.h"
#import "OFInvalidEncodingException.h"
#import "OFInvalidFormatException.h"
#import "OFOutOfMemoryException.h"
#import "OFOutOfRangeException.h"

#import "autorelease.h"
#import "macros.h"

#import "of_asprintf.h"
#import "unicode.h"

@implementation OFMutableString_UTF8
+ (void)initialize
{
	if (self == [OFMutableString_UTF8 class])
		[self inheritMethodsFromClass: [OFString_UTF8 class]];
}

- initWithUTF8StringNoCopy: (char*)UTF8String
	      freeWhenDone: (bool)freeWhenDone
{
	@try {
		self = [self initWithUTF8String: UTF8String];
	} @finally {
		if (freeWhenDone)
			free(UTF8String);
	}

	return self;
}

- (void)OF_convertWithWordStartTable: (const of_unichar_t *const[])startTable
		     wordMiddleTable: (const of_unichar_t *const[])middleTable
		  wordStartTableSize: (size_t)startTableSize
		 wordMiddleTableSize: (size_t)middleTableSize
{
	of_unichar_t *unicodeString;
	size_t unicodeLen, newCStringLength;
	size_t i, j;
	char *newCString;
	bool isStart = true;

	if (!_s->isUTF8) {
		uint8_t t;
		const of_unichar_t *const *table;

		assert(startTableSize >= 1 && middleTableSize >= 1);

		_s->hashed = false;

		for (i = 0; i < _s->cStringLength; i++) {
			if (isStart)
				table = startTable;
			else
				table = middleTable;

			switch (_s->cString[i]) {
			case ' ':
			case '\t':
			case '\n':
			case '\r':
				isStart = true;
				break;
			default:
				isStart = false;
				break;
			}

			if ((t = table[0][(uint8_t)_s->cString[i]]) != 0)
				_s->cString[i] = t;
		}

		return;
	}

	unicodeLen = [self length];
	unicodeString = [self allocMemoryWithSize: sizeof(of_unichar_t)
					    count: unicodeLen];

	i = j = 0;
	newCStringLength = 0;

	while (i < _s->cStringLength) {
		const of_unichar_t *const *table;
		size_t tableSize;
		of_unichar_t c;
		size_t cLen;

		if (isStart) {
			table = startTable;
			tableSize = middleTableSize;
		} else {
			table = middleTable;
			tableSize = middleTableSize;
		}

		cLen = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c);

		if (cLen == 0 || c > 0x10FFFF) {
			[self freeMemory: unicodeString];
			@throw [OFInvalidEncodingException exception];
		}

		switch (c) {
		case ' ':
		case '\t':
		case '\n':
		case '\r':
			isStart = true;
			break;
		default:
			isStart = false;
			break;
		}

		if (c >> 8 < tableSize) {
			of_unichar_t tc = table[c >> 8][c & 0xFF];

			if (tc)
				c = tc;
		}
		unicodeString[j++] = c;

		if (c < 0x80)
			newCStringLength++;
		else if (c < 0x800)
			newCStringLength += 2;
		else if (c < 0x10000)
			newCStringLength += 3;
		else if (c < 0x110000)
			newCStringLength += 4;
		else {
			[self freeMemory: unicodeString];
			@throw [OFInvalidEncodingException exception];
		}

		i += cLen;
	}

	@try {
		newCString = [self allocMemoryWithSize: newCStringLength + 1];
	} @catch (id e) {
		[self freeMemory: unicodeString];
		@throw e;
	}

	j = 0;

	for (i = 0; i < unicodeLen; i++) {
		size_t d;

		if ((d = of_string_utf8_encode(unicodeString[i],
		    newCString + j)) == 0) {
			[self freeMemory: unicodeString];
			[self freeMemory: newCString];
			@throw [OFInvalidEncodingException exception];
		}
		j += d;
	}

	assert(j == newCStringLength);
	newCString[j] = 0;
	[self freeMemory: unicodeString];

	[self freeMemory: _s->cString];
	_s->hashed = false;
	_s->cString = newCString;
	_s->cStringLength = newCStringLength;

	/*
	 * Even though cStringLength can change, length cannot, therefore no
	 * need to change it.
	 */
}

- (void)setCharacter: (of_unichar_t)character
	     atIndex: (size_t)index
{
	char buffer[4];
	of_unichar_t c;
	size_t lenNew, lenOld;

	if (_s->isUTF8)
		index = of_string_utf8_get_position(_s->cString, index,
		    _s->cStringLength);

	if (index > _s->cStringLength)
		@throw [OFOutOfRangeException exception];

	/* Shortcut if old and new character both are ASCII */
	if (!(character & 0x80) && !(_s->cString[index] & 0x80)) {
		_s->hashed = false;
		_s->cString[index] = character;
		return;
	}

	if ((lenNew = of_string_utf8_encode(character, buffer)) == 0)
		@throw [OFInvalidEncodingException exception];

	if ((lenOld = of_string_utf8_decode(_s->cString + index,
	    _s->cStringLength - index, &c)) == 0)
		@throw [OFInvalidEncodingException exception];

	_s->hashed = false;

	if (lenNew == lenOld)
		memcpy(_s->cString + index, buffer, lenNew);
	else if (lenNew > lenOld) {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength -
						  lenOld + lenNew + 1];

		memmove(_s->cString + index + lenNew,
		    _s->cString + index + lenOld,
		    _s->cStringLength - index - lenOld);
		memcpy(_s->cString + index, buffer, lenNew);

		_s->cStringLength -= lenOld;
		_s->cStringLength += lenNew;
		_s->cString[_s->cStringLength] = '\0';

		if (character & 0x80)
			_s->isUTF8 = true;
	} else if (lenNew < lenOld) {
		memmove(_s->cString + index + lenNew,
		    _s->cString + index + lenOld,
		    _s->cStringLength - index - lenOld);
		memcpy(_s->cString + index, buffer, lenNew);

		_s->cStringLength -= lenOld;
		_s->cStringLength += lenNew;
		_s->cString[_s->cStringLength] = '\0';

		@try {
			_s->cString = [self
			    resizeMemory: _s->cString
				    size: _s->cStringLength + 1];
		} @catch (OFOutOfMemoryException *e) {
			/* We don't really care, as we only made it smaller */
		}
	} else
		assert(0);
}

- (void)appendUTF8String: (const char*)UTF8String
{
	size_t UTF8StringLength = strlen(UTF8String);
	size_t length;

	if (UTF8StringLength >= 3 && !memcmp(UTF8String, "\xEF\xBB\xBF", 3)) {
		UTF8String += 3;
		UTF8StringLength -= 3;
	}

	switch (of_string_utf8_check(UTF8String, UTF8StringLength, &length)) {
	case 1:
		_s->isUTF8 = true;
		break;
	case -1:
		@throw [OFInvalidEncodingException exception];
	}

	_s->hashed = false;
	_s->cString = [self resizeMemory: _s->cString
				    size: _s->cStringLength +
					  UTF8StringLength + 1];
	memcpy(_s->cString + _s->cStringLength, UTF8String,
	    UTF8StringLength + 1);

	_s->cStringLength += UTF8StringLength;
	_s->length += length;
}

- (void)appendUTF8String: (const char*)UTF8String
		  length: (size_t)UTF8StringLength
{
	size_t length;

	if (UTF8StringLength >= 3 && !memcmp(UTF8String, "\xEF\xBB\xBF", 3)) {
		UTF8String += 3;
		UTF8StringLength -= 3;
	}

	switch (of_string_utf8_check(UTF8String, UTF8StringLength, &length)) {
	case 1:
		_s->isUTF8 = true;
		break;
	case -1:
		@throw [OFInvalidEncodingException exception];
	}

	_s->hashed = false;
	_s->cString = [self resizeMemory: _s->cString
				    size: _s->cStringLength +
					  UTF8StringLength + 1];
	memcpy(_s->cString + _s->cStringLength, UTF8String, UTF8StringLength);

	_s->cStringLength += UTF8StringLength;
	_s->length += length;

	_s->cString[_s->cStringLength] = 0;
}

- (void)appendCString: (const char*)cString
	     encoding: (of_string_encoding_t)encoding
{
	return [self appendCString: cString
			  encoding: encoding
			    length: strlen(cString)];
}

- (void)appendCString: (const char*)cString
	     encoding: (of_string_encoding_t)encoding
	       length: (size_t)cStringLength
{
	if (encoding == OF_STRING_ENCODING_UTF_8)
		[self appendUTF8String: cString
				length: cStringLength];
	else {
		void *pool = objc_autoreleasePoolPush();
		[self appendString:
		    [OFString stringWithCString: cString
				       encoding: encoding
					 length: cStringLength]];
		objc_autoreleasePoolPop(pool);
	}
}

- (void)appendString: (OFString*)string
{
	size_t UTF8StringLength;

	if (string == nil)
		@throw [OFInvalidArgumentException exception];

	UTF8StringLength = [string UTF8StringLength];

	_s->hashed = false;
	_s->cString = [self resizeMemory: _s->cString
				    size: _s->cStringLength +
					  UTF8StringLength + 1];
	memcpy(_s->cString + _s->cStringLength, [string UTF8String],
	    UTF8StringLength);

	_s->cStringLength += UTF8StringLength;
	_s->length += [string length];

	_s->cString[_s->cStringLength] = 0;

	if ([string isKindOfClass: [OFString_UTF8 class]] ||
	    [string isKindOfClass: [OFMutableString_UTF8 class]]) {
		if (((OFString_UTF8*)string)->_s->isUTF8)
			_s->isUTF8 = true;
	} else
		_s->isUTF8 = true;
}

- (void)appendCharacters: (of_unichar_t*)characters
		  length: (size_t)length
{
	char *tmp;

	tmp = [self allocMemoryWithSize: (length * 4) + 1];
	@try {
		size_t i, j = 0;
		bool isUTF8 = false;

		for (i = 0; i < length; i++) {
			char buffer[4];

			switch (of_string_utf8_encode(characters[i], buffer)) {
			case 1:
				tmp[j++] = buffer[0];
				break;
			case 2:
				isUTF8 = true;

				memcpy(tmp + j, buffer, 2);
				j += 2;

				break;
			case 3:
				isUTF8 = true;

				memcpy(tmp + j, buffer, 3);
				j += 3;

				break;
			case 4:
				isUTF8 = true;

				memcpy(tmp + j, buffer, 4);
				j += 4;

				break;
			default:
				@throw [OFInvalidEncodingException exception];
			}
		}

		tmp[j] = '\0';

		_s->hashed = false;
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength + j + 1];
		memcpy(_s->cString + _s->cStringLength, tmp, j + 1);

		_s->cStringLength += j;
		_s->length += length;

		if (isUTF8)
			_s->isUTF8 = true;
	} @finally {
		[self freeMemory: tmp];
	}
}

- (void)appendFormat: (OFConstantString*)format
	   arguments: (va_list)arguments
{
	char *UTF8String;
	int UTF8StringLength;

	if (format == nil)
		@throw [OFInvalidArgumentException exception];

	if ((UTF8StringLength = of_vasprintf(&UTF8String, [format UTF8String],
	    arguments)) == -1)
		@throw [OFInvalidFormatException exception];

	@try {
		[self appendUTF8String: UTF8String
				length: UTF8StringLength];
	} @finally {
		free(UTF8String);
	}
}

- (void)reverse
{
	size_t i, j;

	_s->hashed = false;

	/* We reverse all bytes and restore UTF-8 later, if necessary */
	for (i = 0, j = _s->cStringLength - 1; i < _s->cStringLength / 2;
	    i++, j--) {
		_s->cString[i] ^= _s->cString[j];
		_s->cString[j] ^= _s->cString[i];
		_s->cString[i] ^= _s->cString[j];
	}

	if (!_s->isUTF8)
		return;

	for (i = 0; i < _s->cStringLength; i++) {
		/* ASCII */
		if OF_LIKELY (!(_s->cString[i] & 0x80))
			continue;

		/* A start byte can't happen first as we reversed everything */
		if OF_UNLIKELY (_s->cString[i] & 0x40)
			@throw [OFInvalidEncodingException exception];

		/* Next byte must not be ASCII */
		if OF_UNLIKELY (_s->cStringLength < i + 1 ||
		    !(_s->cString[i + 1] & 0x80))
			@throw [OFInvalidEncodingException exception];

		/* Next byte is the start byte */
		if OF_LIKELY (_s->cString[i + 1] & 0x40) {
			_s->cString[i] ^= _s->cString[i + 1];
			_s->cString[i + 1] ^= _s->cString[i];
			_s->cString[i] ^= _s->cString[i + 1];

			i++;
			continue;
		}

		/* Second next byte must not be ASCII */
		if OF_UNLIKELY (_s->cStringLength < i + 2 ||
		    !(_s->cString[i + 2] & 0x80))
			@throw [OFInvalidEncodingException exception];

		/* Second next byte is the start byte */
		if OF_LIKELY (_s->cString[i + 2] & 0x40) {
			_s->cString[i] ^= _s->cString[i + 2];
			_s->cString[i + 2] ^= _s->cString[i];
			_s->cString[i] ^= _s->cString[i + 2];

			i += 2;
			continue;
		}

		/* Third next byte must not be ASCII */
		if OF_UNLIKELY (_s->cStringLength < i + 3 ||
		    !(_s->cString[i + 3] & 0x80))
			@throw [OFInvalidEncodingException exception];

		/* Third next byte is the start byte */
		if OF_LIKELY (_s->cString[i + 3] & 0x40) {
			_s->cString[i] ^= _s->cString[i + 3];
			_s->cString[i + 3] ^= _s->cString[i];
			_s->cString[i] ^= _s->cString[i + 3];

			_s->cString[i + 1] ^= _s->cString[i + 2];
			_s->cString[i + 2] ^= _s->cString[i + 1];
			_s->cString[i + 1] ^= _s->cString[i + 2];

			i += 3;
			continue;
		}

		/* UTF-8 does not allow more than 4 bytes per character */
		@throw [OFInvalidEncodingException exception];
	}
}

- (void)insertString: (OFString*)string
	     atIndex: (size_t)index
{
	size_t newCStringLength;

	if (index > _s->length)
		@throw [OFOutOfRangeException exception];

	if (_s->isUTF8)
		index = of_string_utf8_get_position(_s->cString, index,
		    _s->cStringLength);

	newCStringLength = _s->cStringLength + [string UTF8StringLength];
	_s->hashed = false;
	_s->cString = [self resizeMemory: _s->cString
				    size: newCStringLength + 1];

	memmove(_s->cString + index + [string UTF8StringLength],
	    _s->cString + index, _s->cStringLength - index);
	memcpy(_s->cString + index, [string UTF8String],
	    [string UTF8StringLength]);
	_s->cString[newCStringLength] = '\0';

	_s->cStringLength = newCStringLength;
	_s->length += [string length];

	if ([string isKindOfClass: [OFString_UTF8 class]] ||
	    [string isKindOfClass: [OFMutableString_UTF8 class]]) {
		if (((OFString_UTF8*)string)->_s->isUTF8)
			_s->isUTF8 = true;
	} else
		_s->isUTF8 = true;
}

- (void)deleteCharactersInRange: (of_range_t)range
{
	size_t start = range.location;
	size_t end = range.location + range.length;

	if (range.length > SIZE_MAX - range.location || end > _s->length)
		@throw [OFOutOfRangeException exception];

	if (_s->isUTF8) {
		start = of_string_utf8_get_position(_s->cString, start,
		    _s->cStringLength);
		end = of_string_utf8_get_position(_s->cString, end,
		    _s->cStringLength);
	}

	memmove(_s->cString + start, _s->cString + end,
	    _s->cStringLength - end);
	_s->hashed = false;
	_s->length -= range.length;
	_s->cStringLength -= end - start;
	_s->cString[_s->cStringLength] = 0;

	@try {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength + 1];
	} @catch (OFOutOfMemoryException *e) {
		/* We don't really care, as we only made it smaller */
	}
}

- (void)replaceCharactersInRange: (of_range_t)range
		      withString: (OFString*)replacement
{
	size_t start = range.location;
	size_t end = range.location + range.length;
	size_t newCStringLength, newLength;

	if (range.length > SIZE_MAX - range.location || end > _s->length)
		@throw [OFOutOfRangeException exception];

	newLength = _s->length - range.length + [replacement length];

	if (_s->isUTF8) {
		start = of_string_utf8_get_position(_s->cString, start,
		    _s->cStringLength);
		end = of_string_utf8_get_position(_s->cString, end,
		    _s->cStringLength);
	}

	newCStringLength = _s->cStringLength - (end - start) +
	    [replacement UTF8StringLength];
	_s->hashed = false;
	_s->cString = [self resizeMemory: _s->cString
				    size: newCStringLength + 1];

	memmove(_s->cString + start + [replacement UTF8StringLength],
	    _s->cString + end, _s->cStringLength - end);
	memcpy(_s->cString + start, [replacement UTF8String],
	    [replacement UTF8StringLength]);
	_s->cString[newCStringLength] = '\0';

	_s->cStringLength = newCStringLength;
	_s->length = newLength;
}

- (void)replaceOccurrencesOfString: (OFString*)string
			withString: (OFString*)replacement
			   options: (int)options
			     range: (of_range_t)range
{
	const char *searchString = [string UTF8String];
	const char *replacementString = [replacement UTF8String];
	size_t searchLength = [string UTF8StringLength];
	size_t replacementLength = [replacement UTF8StringLength];
	size_t i, last, newCStringLength, newLength;
	char *newCString;

	if (range.length > SIZE_MAX - range.location ||
	    range.location + range.length > [self length])
		@throw [OFOutOfRangeException exception];

	if (_s->isUTF8) {
		range.location = of_string_utf8_get_position(_s->cString,
		    range.location, _s->cStringLength);
		range.length = of_string_utf8_get_position(
		    _s->cString + range.location, range.length,
		    _s->cStringLength - range.location);
	}

	if ([string UTF8StringLength] > range.length)
		return;

	newCString = NULL;
	newCStringLength = 0;
	newLength = _s->length;

	last = 0;
	for (i = range.location; i <= range.length - searchLength; i++) {
		if (memcmp(_s->cString + i, searchString, searchLength))
			continue;

		@try {
			newCString = [self
			    resizeMemory: newCString
				    size: newCStringLength + i - last +
					  replacementLength + 1];
		} @catch (id e) {
			[self freeMemory: newCString];
			@throw e;
		}
		memcpy(newCString + newCStringLength, _s->cString + last,
		    i - last);
		memcpy(newCString + newCStringLength + i - last,
		    replacementString, replacementLength);

		newCStringLength += i - last + replacementLength;
		newLength = newLength - [string length] + [replacement length];

		i += searchLength - 1;
		last = i + 1;
	}

	@try {
		newCString = [self resizeMemory: newCString
					   size: newCStringLength +
						 _s->cStringLength - last + 1];
	} @catch (id e) {
		[self freeMemory: newCString];
		@throw e;
	}
	memcpy(newCString + newCStringLength, _s->cString + last,
	    _s->cStringLength - last);
	newCStringLength += _s->cStringLength - last;
	newCString[newCStringLength] = 0;

	[self freeMemory: _s->cString];
	_s->hashed = false;
	_s->cString = newCString;
	_s->cStringLength = newCStringLength;
	_s->length = newLength;
}

- (void)deleteLeadingWhitespaces
{
	size_t i;

	for (i = 0; i < _s->cStringLength; i++)
		if (_s->cString[i] != ' '  && _s->cString[i] != '\t' &&
		    _s->cString[i] != '\n' && _s->cString[i] != '\r' &&
		    _s->cString[i] != '\f')
			break;

	_s->hashed = false;
	_s->cStringLength -= i;
	_s->length -= i;

	memmove(_s->cString, _s->cString + i, _s->cStringLength);
	_s->cString[_s->cStringLength] = '\0';

	@try {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength + 1];
	} @catch (OFOutOfMemoryException *e) {
		/* We don't really care, as we only made it smaller */
	}
}

- (void)deleteTrailingWhitespaces
{
	size_t d;
	char *p;

	_s->hashed = false;

	d = 0;
	for (p = _s->cString + _s->cStringLength - 1; p >= _s->cString; p--) {
		if (*p != ' ' && *p != '\t' && *p != '\n' && *p != '\r' &&
		    *p != '\f')
			break;

		*p = '\0';
		d++;
	}

	_s->cStringLength -= d;
	_s->length -= d;

	@try {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength + 1];
	} @catch (OFOutOfMemoryException *e) {
		/* We don't really care, as we only made it smaller */
	}
}

- (void)deleteEnclosingWhitespaces
{
	size_t d, i;
	char *p;

	_s->hashed = false;

	d = 0;
	for (p = _s->cString + _s->cStringLength - 1; p >= _s->cString; p--) {
		if (*p != ' ' && *p != '\t' && *p != '\n' && *p != '\r' &&
		    *p != '\f')
			break;

		*p = '\0';
		d++;
	}

	_s->cStringLength -= d;
	_s->length -= d;

	for (i = 0; i < _s->cStringLength; i++)
		if (_s->cString[i] != ' '  && _s->cString[i] != '\t' &&
		    _s->cString[i] != '\n' && _s->cString[i] != '\r' &&
		    _s->cString[i] != '\f')
			break;

	_s->cStringLength -= i;
	_s->length -= i;

	memmove(_s->cString, _s->cString + i, _s->cStringLength);
	_s->cString[_s->cStringLength] = '\0';

	@try {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength + 1];
	} @catch (OFOutOfMemoryException *e) {
		/* We don't really care, as we only made it smaller */
	}
}

- (void)makeImmutable
{
	object_setClass(self, [OFString_UTF8 class]);
}
@end