ObjFW  Check-in [5e2ef97c35]

Overview
Comment:Change of_string_utf8_decode() API

It now returns <= 0 on error, with negative values being the number of
bytes it would have needed * -1.

This can be used to detect cut off and how many bytes are missing.

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 5e2ef97c35ba3911a76fe45afb629244b10f893f01be2e60163c1a82442548ce
User & Date: js on 2016-03-13 11:19:13
Other Links: manifest | tags
Context
2016-03-13
12:39
OFRunLoop: Tiny documentation improvement check-in: d4555b9c1a user: js tags: trunk
11:19
Change of_string_utf8_decode() API check-in: 5e2ef97c35 user: js tags: trunk
10:24
OFStdIOStream_Win32Console: Improve reading check-in: 566d4df603 user: js tags: trunk
Changes

Modified src/OFMutableString_UTF8.m from [3b7cce9749] to [f55475cd25].

105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
	i = j = 0;
	newCStringLength = 0;

	while (i < _s->cStringLength) {
		const of_unichar_t *const *table;
		size_t tableSize;
		of_unichar_t c;
		size_t cLen;

		if (isStart) {
			table = startTable;
			tableSize = middleTableSize;
		} else {
			table = middleTable;
			tableSize = middleTableSize;
		}

		cLen = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c);

		if (cLen == 0 || c > 0x10FFFF) {
			[self freeMemory: unicodeString];
			@throw [OFInvalidEncodingException exception];
		}

		switch (c) {
		case ' ':
		case '\t':







|












|







105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
	i = j = 0;
	newCStringLength = 0;

	while (i < _s->cStringLength) {
		const of_unichar_t *const *table;
		size_t tableSize;
		of_unichar_t c;
		ssize_t cLen;

		if (isStart) {
			table = startTable;
			tableSize = middleTableSize;
		} else {
			table = middleTable;
			tableSize = middleTableSize;
		}

		cLen = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c);

		if (cLen <= 0 || c > 0x10FFFF) {
			[self freeMemory: unicodeString];
			@throw [OFInvalidEncodingException exception];
		}

		switch (c) {
		case ' ':
		case '\t':
200
201
202
203
204
205
206
207

208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
}

- (void)setCharacter: (of_unichar_t)character
	     atIndex: (size_t)index
{
	char buffer[4];
	of_unichar_t c;
	size_t lenNew, lenOld;


	if (_s->isUTF8)
		index = of_string_utf8_get_position(_s->cString, index,
		    _s->cStringLength);

	if (index > _s->cStringLength)
		@throw [OFOutOfRangeException exception];

	/* Shortcut if old and new character both are ASCII */
	if (!(character & 0x80) && !(_s->cString[index] & 0x80)) {
		_s->hashed = false;
		_s->cString[index] = character;
		return;
	}

	if ((lenNew = of_string_utf8_encode(character, buffer)) == 0)
		@throw [OFInvalidEncodingException exception];

	if ((lenOld = of_string_utf8_decode(_s->cString + index,
	    _s->cStringLength - index, &c)) == 0)
		@throw [OFInvalidEncodingException exception];

	_s->hashed = false;

	if (lenNew == lenOld)
		memcpy(_s->cString + index, buffer, lenNew);
	else if (lenNew > lenOld) {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength -
						  lenOld + lenNew + 1];

		memmove(_s->cString + index + lenNew,
		    _s->cString + index + lenOld,
		    _s->cStringLength - index - lenOld);
		memcpy(_s->cString + index, buffer, lenNew);

		_s->cStringLength -= lenOld;
		_s->cStringLength += lenNew;
		_s->cString[_s->cStringLength] = '\0';

		if (character & 0x80)
			_s->isUTF8 = true;
	} else if (lenNew < lenOld) {
		memmove(_s->cString + index + lenNew,
		    _s->cString + index + lenOld,
		    _s->cStringLength - index - lenOld);
		memcpy(_s->cString + index, buffer, lenNew);

		_s->cStringLength -= lenOld;
		_s->cStringLength += lenNew;







|
>



















|




|

|















|







200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
}

- (void)setCharacter: (of_unichar_t)character
	     atIndex: (size_t)index
{
	char buffer[4];
	of_unichar_t c;
	size_t lenNew;
	ssize_t lenOld;

	if (_s->isUTF8)
		index = of_string_utf8_get_position(_s->cString, index,
		    _s->cStringLength);

	if (index > _s->cStringLength)
		@throw [OFOutOfRangeException exception];

	/* Shortcut if old and new character both are ASCII */
	if (!(character & 0x80) && !(_s->cString[index] & 0x80)) {
		_s->hashed = false;
		_s->cString[index] = character;
		return;
	}

	if ((lenNew = of_string_utf8_encode(character, buffer)) == 0)
		@throw [OFInvalidEncodingException exception];

	if ((lenOld = of_string_utf8_decode(_s->cString + index,
	    _s->cStringLength - index, &c)) <= 0)
		@throw [OFInvalidEncodingException exception];

	_s->hashed = false;

	if (lenNew == (size_t)lenOld)
		memcpy(_s->cString + index, buffer, lenNew);
	else if (lenNew > (size_t)lenOld) {
		_s->cString = [self resizeMemory: _s->cString
					    size: _s->cStringLength -
						  lenOld + lenNew + 1];

		memmove(_s->cString + index + lenNew,
		    _s->cString + index + lenOld,
		    _s->cStringLength - index - lenOld);
		memcpy(_s->cString + index, buffer, lenNew);

		_s->cStringLength -= lenOld;
		_s->cStringLength += lenNew;
		_s->cString[_s->cStringLength] = '\0';

		if (character & 0x80)
			_s->isUTF8 = true;
	} else if (lenNew < (size_t)lenOld) {
		memmove(_s->cString + index + lenNew,
		    _s->cString + index + lenOld,
		    _s->cStringLength - index - lenOld);
		memcpy(_s->cString + index, buffer, lenNew);

		_s->cStringLength -= lenOld;
		_s->cStringLength += lenNew;

Modified src/OFStdIOStream_Win32Console.m from [88dea4ced5] to [f55d77f8b8].

39
40
41
42
43
44
45

46
47
48
49
50
51
52

#define OF_STDIO_STREAM_WIN32_CONSOLE_M

#include "config.h"

#import "OFStdIOStream_Win32Console.h"
#import "OFStdIOStream+Private.h"

#import "OFDataArray.h"

#import "OFInvalidArgumentException.h"
#import "OFInvalidEncodingException.h"
#import "OFOutOfRangeException.h"
#import "OFReadFailedException.h"
#import "OFWriteFailedException.h"







>







39
40
41
42
43
44
45
46
47
48
49
50
51
52
53

#define OF_STDIO_STREAM_WIN32_CONSOLE_M

#include "config.h"

#import "OFStdIOStream_Win32Console.h"
#import "OFStdIOStream+Private.h"
#import "OFString.h"
#import "OFDataArray.h"

#import "OFInvalidArgumentException.h"
#import "OFInvalidEncodingException.h"
#import "OFOutOfRangeException.h"
#import "OFReadFailedException.h"
#import "OFWriteFailedException.h"
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
				  count: length * 2];
	@try {
		size_t i = 0, j = 0;
		DWORD written;

		while (i < length) {
			of_unichar_t c;
			size_t cLen;

			cLen = of_string_utf8_decode(buffer + i, length - i,
			    &c);

			if (cLen == 0 || c > 0x10FFFF)
				@throw [OFInvalidEncodingException exception];

			if (c > 0xFFFF) {
				c -= 0x10000;
				tmp[j++] = 0xD800 | (c >> 10);
				tmp[j++] = 0xDC00 | (c & 0x3FF);
			} else
				tmp[j++] = c;

			i += cLen;
		}

		if (!WriteConsoleW(_handle, tmp, j, &written, NULL) ||
		    written != j)
			@throw [OFWriteFailedException
			    exceptionWithObject: self
				requestedLength: j];
	} @finally {
		[self freeMemory: tmp];
	}
}
@end







|

|


|









|












220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
				  count: length * 2];
	@try {
		size_t i = 0, j = 0;
		DWORD written;

		while (i < length) {
			of_unichar_t c;
			size_t UTF8Len;

			UTF8Len = of_string_utf8_decode(buffer + i, length - i,
			    &c);

			if (UTF8Len <= 0 || c > 0x10FFFF)
				@throw [OFInvalidEncodingException exception];

			if (c > 0xFFFF) {
				c -= 0x10000;
				tmp[j++] = 0xD800 | (c >> 10);
				tmp[j++] = 0xDC00 | (c & 0x3FF);
			} else
				tmp[j++] = c;

			i += UTF8Len;
		}

		if (!WriteConsoleW(_handle, tmp, j, &written, NULL) ||
		    written != j)
			@throw [OFWriteFailedException
			    exceptionWithObject: self
				requestedLength: j];
	} @finally {
		[self freeMemory: tmp];
	}
}
@end

Modified src/OFString.h from [a580fc1ea1] to [f6c039afa4].

1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
#endif
@end

#ifdef __cplusplus
extern "C" {
#endif
extern size_t of_string_utf8_encode(of_unichar_t, char*);
extern size_t of_string_utf8_decode(const char*, size_t, of_unichar_t*);
extern size_t of_string_utf16_length(const of_char16_t*);
extern size_t of_string_utf32_length(const of_char32_t*);
#ifdef __cplusplus
}
#endif

OF_ASSUME_NONNULL_END







|







1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
#endif
@end

#ifdef __cplusplus
extern "C" {
#endif
extern size_t of_string_utf8_encode(of_unichar_t, char*);
extern ssize_t of_string_utf8_decode(const char*, size_t, of_unichar_t*);
extern size_t of_string_utf16_length(const of_char16_t*);
extern size_t of_string_utf32_length(const of_char32_t*);
#ifdef __cplusplus
}
#endif

OF_ASSUME_NONNULL_END

Modified src/OFString.m from [e3e6e72034] to [b713fffc4b].

124
125
126
127
128
129
130
131
132
133

134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
		buffer[i] = 0x80 | (character & 0x3F);
		return 4;
	}

	return 0;
}

size_t
of_string_utf8_decode(const char *buffer_, size_t length, of_unichar_t *ret)
{

	const uint8_t *buffer = (const uint8_t*)buffer_;

	if (!(*buffer & 0x80)) {
		*ret = buffer[0];
		return 1;
	}

	if ((*buffer & 0xE0) == 0xC0) {
		if OF_UNLIKELY (length < 2)
			return 0;

		*ret = ((buffer[0] & 0x1F) << 6) | (buffer[1] & 0x3F);
		return 2;
	}

	if ((*buffer & 0xF0) == 0xE0) {
		if OF_UNLIKELY (length < 3)
			return 0;

		*ret = ((buffer[0] & 0x0F) << 12) | ((buffer[1] & 0x3F) << 6) |
		    (buffer[2] & 0x3F);
		return 3;
	}

	if ((*buffer & 0xF8) == 0xF0) {
		if OF_UNLIKELY (length < 4)
			return 0;

		*ret = ((buffer[0] & 0x07) << 18) | ((buffer[1] & 0x3F) << 12) |
		    ((buffer[2] & 0x3F) << 6) | (buffer[3] & 0x3F);
		return 4;
	}

	return 0;







|


>









|







|








|







124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
		buffer[i] = 0x80 | (character & 0x3F);
		return 4;
	}

	return 0;
}

ssize_t
of_string_utf8_decode(const char *buffer_, size_t length, of_unichar_t *ret)
{
	/* FIXME: Check if the following bytes are indeed surrogates */
	const uint8_t *buffer = (const uint8_t*)buffer_;

	if (!(*buffer & 0x80)) {
		*ret = buffer[0];
		return 1;
	}

	if ((*buffer & 0xE0) == 0xC0) {
		if OF_UNLIKELY (length < 2)
			return -2;

		*ret = ((buffer[0] & 0x1F) << 6) | (buffer[1] & 0x3F);
		return 2;
	}

	if ((*buffer & 0xF0) == 0xE0) {
		if OF_UNLIKELY (length < 3)
			return -3;

		*ret = ((buffer[0] & 0x0F) << 12) | ((buffer[1] & 0x3F) << 6) |
		    (buffer[2] & 0x3F);
		return 3;
	}

	if ((*buffer & 0xF8) == 0xF0) {
		if OF_UNLIKELY (length < 4)
			return -4;

		*ret = ((buffer[0] & 0x07) << 18) | ((buffer[1] & 0x3F) << 12) |
		    ((buffer[2] & 0x3F) << 6) | (buffer[3] & 0x3F);
		return 4;
	}

	return 0;

Modified src/OFString_UTF8.m from [f9779f2392] to [c460dbfe91].

804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
			return OF_ORDERED_ASCENDING;
	}

	i = j = 0;

	while (i < _s->cStringLength && j < otherCStringLength) {
		of_unichar_t c1, c2;
		size_t l1, l2;

		l1 = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c1);
		l2 = of_string_utf8_decode(otherCString + j,
		    otherCStringLength - j, &c2);

		if (l1 == 0 || l2 == 0 || c1 > 0x10FFFF || c2 > 0x10FFFF)
			@throw [OFInvalidEncodingException exception];

		if (c1 >> 8 < OF_UNICODE_CASEFOLDING_TABLE_SIZE) {
			of_unichar_t tc =
			    of_unicode_casefolding_table[c1 >> 8][c1 & 0xFF];

			if (tc)







|






|







804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
			return OF_ORDERED_ASCENDING;
	}

	i = j = 0;

	while (i < _s->cStringLength && j < otherCStringLength) {
		of_unichar_t c1, c2;
		ssize_t l1, l2;

		l1 = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c1);
		l2 = of_string_utf8_decode(otherCString + j,
		    otherCStringLength - j, &c2);

		if (l1 <= 0 || l2 <= 0 || c1 > 0x10FFFF || c2 > 0x10FFFF)
			@throw [OFInvalidEncodingException exception];

		if (c1 >> 8 < OF_UNICODE_CASEFOLDING_TABLE_SIZE) {
			of_unichar_t tc =
			    of_unicode_casefolding_table[c1 >> 8][c1 & 0xFF];

			if (tc)
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
	if (_s->hashed)
		return _s->hash;

	OF_HASH_INIT(hash);

	for (size_t i = 0; i < _s->cStringLength; i++) {
		of_unichar_t c;
		size_t length;

		if ((length = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c)) == 0)
			@throw [OFInvalidEncodingException exception];

		OF_HASH_ADD(hash, (c & 0xFF0000) >> 16);
		OF_HASH_ADD(hash, (c & 0x00FF00) >>  8);
		OF_HASH_ADD(hash,  c & 0x0000FF);

		i += length - 1;







|


|







858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
	if (_s->hashed)
		return _s->hash;

	OF_HASH_INIT(hash);

	for (size_t i = 0; i < _s->cStringLength; i++) {
		of_unichar_t c;
		ssize_t length;

		if ((length = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c)) <= 0)
			@throw [OFInvalidEncodingException exception];

		OF_HASH_ADD(hash, (c & 0xFF0000) >> 16);
		OF_HASH_ADD(hash, (c & 0x00FF00) >>  8);
		OF_HASH_ADD(hash,  c & 0x0000FF);

		i += length - 1;
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907

	if (!_s->isUTF8)
		return _s->cString[index];

	index = of_string_utf8_get_position(_s->cString, index,
	    _s->cStringLength);

	if (!of_string_utf8_decode(_s->cString + index,
	    _s->cStringLength - index, &character))
		@throw [OFInvalidEncodingException exception];

	return character;
}

- (void)getCharacters: (of_unichar_t*)buffer
	      inRange: (of_range_t)range







|
|







892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907

	if (!_s->isUTF8)
		return _s->cString[index];

	index = of_string_utf8_get_position(_s->cString, index,
	    _s->cStringLength);

	if (of_string_utf8_decode(_s->cString + index,
	    _s->cStringLength - index, &character) <= 0)
		@throw [OFInvalidEncodingException exception];

	return character;
}

- (void)getCharacters: (of_unichar_t*)buffer
	      inRange: (of_range_t)range
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
	ret = [object allocMemoryWithSize: sizeof(of_unichar_t)
				    count: _s->length];

	i = j = 0;

	while (i < _s->cStringLength) {
		of_unichar_t c;
		size_t cLen;

		cLen = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c);

		if (cLen == 0 || c > 0x10FFFF)
			@throw [OFInvalidEncodingException exception];

		ret[j++] = c;
		i += cLen;
	}

	return ret;







|




|







1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
	ret = [object allocMemoryWithSize: sizeof(of_unichar_t)
				    count: _s->length];

	i = j = 0;

	while (i < _s->cStringLength) {
		of_unichar_t c;
		ssize_t cLen;

		cLen = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c);

		if (cLen <= 0 || c > 0x10FFFF)
			@throw [OFInvalidEncodingException exception];

		ret[j++] = c;
		i += cLen;
	}

	return ret;
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
	ret = [object allocMemoryWithSize: sizeof(of_unichar_t)
				    count: _s->length + 1];

	i = j = 0;

	while (i < _s->cStringLength) {
		of_unichar_t c;
		size_t cLen;

		cLen = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c);

		if (cLen == 0 || c > 0x10FFFF)
			@throw [OFInvalidEncodingException exception];

		if (byteOrder != OF_BYTE_ORDER_NATIVE)
			ret[j++] = OF_BSWAP32(c);
		else
			ret[j++] = c;








|




|







1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
	ret = [object allocMemoryWithSize: sizeof(of_unichar_t)
				    count: _s->length + 1];

	i = j = 0;

	while (i < _s->cStringLength) {
		of_unichar_t c;
		ssize_t cLen;

		cLen = of_string_utf8_decode(_s->cString + i,
		    _s->cStringLength - i, &c);

		if (cLen <= 0 || c > 0x10FFFF)
			@throw [OFInvalidEncodingException exception];

		if (byteOrder != OF_BYTE_ORDER_NATIVE)
			ret[j++] = OF_BSWAP32(c);
		else
			ret[j++] = c;