ObjFW  Check-in [9d3cd5e5fe]

Overview
Comment:OFString: Zero-terminate UTF-16 strings.

This partly reverts e8502c7.

The rationale behind this is that, on some OSes, native APIs (on e.g.
Windows) take UTF-16 strings that are zero-terminated.

However, forcing zero-termination for every string so that -[characters]
returns a zero-terminated UTF-32 string does not make sense. Therefore,
in the future, -[UTF32String] will be added, which will include
zero-termination. OFString subclasses can then just return their
internal representation if it includes a terminating zero or create a
copy that has a terminating zero if not.

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 9d3cd5e5fe7a26dc10fe5687da330eea1405737024203706b724c907b8535074
User & Date: js on 2013-01-07 22:07:58
Other Links: manifest | tags
Context
2013-01-07
23:42
Win32: Correctly handle Unicode in environment. check-in: 1fb00cc3b4 user: js tags: trunk
22:07
OFString: Zero-terminate UTF-16 strings. check-in: 9d3cd5e5fe user: js tags: trunk
14:49
OFProcess: Implement environment passing on Win32. check-in: f51bceaa35 user: js tags: trunk
Changes

Modified src/OFString.h from [ad8cbdf010] to [31ec44e41b].

158
159
160
161
162
163
164
165








166
167
168
169
170
171
172
173
174











175
176
177
178
179
180
181
 * @return A new autoreleased OFString
 */
+ (instancetype)stringWithCharacters: (const of_unichar_t*)characters
			      length: (size_t)length
			   byteOrder: (of_byte_order_t)byteOrder;

/*!
 * @brief Creates a new OFString from a big endian UTF-16 encoded string with








 *	  the specified length.
 *
 * @param string The UTF-16 string
 * @param length The length of the unicode string
 * @return A new autoreleased OFString
 */
+ (instancetype)stringWithUTF16String: (const uint16_t*)string
			       length: (size_t)length;












/*!
 * @brief Creates a new OFString from a UTF-16 encoded string with the
 *	  specified length, assuming the specified byte order if no BOM is
 *	  found.
 *
 * @param string The UTF-16 string
 * @param length The length of the unicode string







|
>
>
>
>
>
>
>
>
|








>
>
>
>
>
>
>
>
>
>
>







158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
 * @return A new autoreleased OFString
 */
+ (instancetype)stringWithCharacters: (const of_unichar_t*)characters
			      length: (size_t)length
			   byteOrder: (of_byte_order_t)byteOrder;

/*!
 * @brief Creates a new OFString from a UTF-16 encoded string.
 *
 * @param string The UTF-16 string
 * @return A new autoreleased OFString
 */
+ (instancetype)stringWithUTF16String: (const uint16_t*)string;

/*!
 * @brief Creates a new OFString from a UTF-16 encoded string with the specified
 *	  length.
 *
 * @param string The UTF-16 string
 * @param length The length of the unicode string
 * @return A new autoreleased OFString
 */
+ (instancetype)stringWithUTF16String: (const uint16_t*)string
			       length: (size_t)length;

/*!
 * @brief Creates a new OFString from a UTF-16 encoded string, assuming the
 *	  specified byte order if no BOM is found.
 *
 * @param string The UTF-16 string
 * @param byteOrder The byte order to assume if there is no BOM
 * @return A new autoreleased OFString
 */
+ (instancetype)stringWithUTF16String: (const uint16_t*)string
			    byteOrder: (of_byte_order_t)byteOrder;

/*!
 * @brief Creates a new OFString from a UTF-16 encoded string with the
 *	  specified length, assuming the specified byte order if no BOM is
 *	  found.
 *
 * @param string The UTF-16 string
 * @param length The length of the unicode string
337
338
339
340
341
342
343








344
345
346
347
348
349
350
351
352
353
354











355
356
357
358
359
360
361
 * @param byteOrder The byte order to assume if there is no BOM
 * @return An initialized OFString
 */
- initWithCharacters: (const of_unichar_t*)characters
	      length: (size_t)length
	   byteOrder: (of_byte_order_t)byteOrder;









/*!
 * @brief Initializes an already allocated OFString with a UTF-16 string with
 *	  the specified length.
 *
 * @param string The UTF-16 string
 * @param length The length of the UTF-16 string
 * @return An initialized OFString
 */
- initWithUTF16String: (const uint16_t*)string
	       length: (size_t)length;












/*!
 * @brief Initializes an already allocated OFString with a UTF-16 string with
 *	  the specified length, assuming the specified byte order if no BOM is
 *	  found.
 *
 * @param string The UTF-16 string
 * @param length The length of the UTF-16 string







>
>
>
>
>
>
>
>











>
>
>
>
>
>
>
>
>
>
>







356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
 * @param byteOrder The byte order to assume if there is no BOM
 * @return An initialized OFString
 */
- initWithCharacters: (const of_unichar_t*)characters
	      length: (size_t)length
	   byteOrder: (of_byte_order_t)byteOrder;

/*!
 * @brief Initializes an already allocated OFString with a UTF-16 string.
 *
 * @param string The UTF-16 string
 * @return An initialized OFString
 */
- initWithUTF16String: (const uint16_t*)string;

/*!
 * @brief Initializes an already allocated OFString with a UTF-16 string with
 *	  the specified length.
 *
 * @param string The UTF-16 string
 * @param length The length of the UTF-16 string
 * @return An initialized OFString
 */
- initWithUTF16String: (const uint16_t*)string
	       length: (size_t)length;

/*!
 * @brief Initializes an already allocated OFString with a UTF-16 string,
 *	  assuming the specified byte order if no BOM is found.
 *
 * @param string The UTF-16 string
 * @param byteOrder The byte order to assume if there is no BOM
 * @return An initialized OFString
 */
- initWithUTF16String: (const uint16_t*)string
	    byteOrder: (of_byte_order_t)byteOrder;

/*!
 * @brief Initializes an already allocated OFString with a UTF-16 string with
 *	  the specified length, assuming the specified byte order if no BOM is
 *	  found.
 *
 * @param string The UTF-16 string
 * @param length The length of the UTF-16 string
898
899
900
901
902
903
904

905
906
907
#endif

#ifdef __cplusplus
extern "C" {
#endif
extern size_t of_string_utf8_encode(of_unichar_t, char*);
extern size_t of_string_utf8_decode(const char*, size_t, of_unichar_t*);

#ifdef __cplusplus
}
#endif







>



936
937
938
939
940
941
942
943
944
945
946
#endif

#ifdef __cplusplus
extern "C" {
#endif
extern size_t of_string_utf8_encode(of_unichar_t, char*);
extern size_t of_string_utf8_decode(const char*, size_t, of_unichar_t*);
extern size_t of_string_utf16_length(const uint16_t*);
#ifdef __cplusplus
}
#endif

Modified src/OFString.m from [9b71ee7ecc] to [6044846057].

137
138
139
140
141
142
143











144
145
146
147
148
149
150
		*ret = ((buffer[0] & 0x07) << 18) | ((buffer[1] & 0x3F) << 12) |
		    ((buffer[2] & 0x3F) << 6) | (buffer[3] & 0x3F);
		return 4;
	}

	return 0;
}












static OFString*
standardize_path(OFArray *components, OFString *currentDirectory,
    OFString *parentDirectory, OFString *joinString)
{
	void *pool = objc_autoreleasePoolPush();
	OFMutableArray *array;







>
>
>
>
>
>
>
>
>
>
>







137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
		*ret = ((buffer[0] & 0x07) << 18) | ((buffer[1] & 0x3F) << 12) |
		    ((buffer[2] & 0x3F) << 6) | (buffer[3] & 0x3F);
		return 4;
	}

	return 0;
}

size_t
of_string_utf16_length(const uint16_t *string)
{
	size_t length = 0;

	while (*string++ != 0)
		length++;

	return length;
}

static OFString*
standardize_path(OFArray *components, OFString *currentDirectory,
    OFString *parentDirectory, OFString *joinString)
{
	void *pool = objc_autoreleasePoolPush();
	OFMutableArray *array;
296
297
298
299
300
301
302





303
304
305
306
307
308
309







310
311
312
313
314
315
316
	      length: (size_t)length
	   byteOrder: (of_byte_order_t)byteOrder
{
	return (id)[[OFString_UTF8 alloc] initWithCharacters: string
						      length: length
						   byteOrder: byteOrder];
}






- initWithUTF16String: (const uint16_t*)string
	       length: (size_t)length
{
	return (id)[[OFString_UTF8 alloc] initWithUTF16String: string
						       length: length];
}








- initWithUTF16String: (const uint16_t*)string
	       length: (size_t)length
	    byteOrder: (of_byte_order_t)byteOrder
{
	return (id)[[OFString_UTF8 alloc] initWithUTF16String: string
						       length: length







>
>
>
>
>







>
>
>
>
>
>
>







307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
	      length: (size_t)length
	   byteOrder: (of_byte_order_t)byteOrder
{
	return (id)[[OFString_UTF8 alloc] initWithCharacters: string
						      length: length
						   byteOrder: byteOrder];
}

- initWithUTF16String: (const uint16_t*)string
{
	return (id)[[OFString_UTF8 alloc] initWithUTF16String: string];
}

- initWithUTF16String: (const uint16_t*)string
	       length: (size_t)length
{
	return (id)[[OFString_UTF8 alloc] initWithUTF16String: string
						       length: length];
}

- initWithUTF16String: (const uint16_t*)string
	    byteOrder: (of_byte_order_t)byteOrder
{
	return (id)[[OFString_UTF8 alloc] initWithUTF16String: string
						    byteOrder: byteOrder];
}

- initWithUTF16String: (const uint16_t*)string
	       length: (size_t)length
	    byteOrder: (of_byte_order_t)byteOrder
{
	return (id)[[OFString_UTF8 alloc] initWithUTF16String: string
						       length: length
483
484
485
486
487
488
489





490
491
492
493
494
495
496







497
498
499
500
501
502
503
			      length: (size_t)length
			   byteOrder: (of_byte_order_t)byteOrder
{
	return [[[self alloc] initWithCharacters: string
					  length: length
				       byteOrder: byteOrder] autorelease];
}






+ (instancetype)stringWithUTF16String: (const uint16_t*)string
			       length: (size_t)length
{
	return [[[self alloc] initWithUTF16String: string
					   length: length] autorelease];
}








+ (instancetype)stringWithUTF16String: (const uint16_t*)string
			       length: (size_t)length
			    byteOrder: (of_byte_order_t)byteOrder
{
	return [[[self alloc] initWithUTF16String: string
					   length: length







>
>
>
>
>







>
>
>
>
>
>
>







506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
			      length: (size_t)length
			   byteOrder: (of_byte_order_t)byteOrder
{
	return [[[self alloc] initWithCharacters: string
					  length: length
				       byteOrder: byteOrder] autorelease];
}

+ (instancetype)stringWithUTF16String: (const uint16_t*)string
{
	return [[[self alloc] initWithUTF16String: string] autorelease];
}

+ (instancetype)stringWithUTF16String: (const uint16_t*)string
			       length: (size_t)length
{
	return [[[self alloc] initWithUTF16String: string
					   length: length] autorelease];
}

+ (instancetype)stringWithUTF16String: (const uint16_t*)string
			    byteOrder: (of_byte_order_t)byteOrder
{
	return [[[self alloc] initWithUTF16String: string
					byteOrder: byteOrder] autorelease];
}

+ (instancetype)stringWithUTF16String: (const uint16_t*)string
			       length: (size_t)length
			    byteOrder: (of_byte_order_t)byteOrder
{
	return [[[self alloc] initWithUTF16String: string
					   length: length
638
639
640
641
642
643
644







645
646
647
648
649
650
651
652








653
654
655
656
657
658
659
		[self doesNotRecognizeSelector: _cmd];
		abort();
	} @catch (id e) {
		[self release];
		@throw e;
	}
}








- initWithUTF16String: (const uint16_t*)string
	       length: (size_t)length
{
	return [self initWithUTF16String: string
				  length: length
			       byteOrder: OF_BYTE_ORDER_NATIVE];
}









- initWithUTF16String: (const uint16_t*)string
	       length: (size_t)length
	    byteOrder: (of_byte_order_t)byteOrder
{
	@try {
		[self doesNotRecognizeSelector: _cmd];







>
>
>
>
>
>
>








>
>
>
>
>
>
>
>







673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
		[self doesNotRecognizeSelector: _cmd];
		abort();
	} @catch (id e) {
		[self release];
		@throw e;
	}
}

- initWithUTF16String: (const uint16_t*)string
{
	return [self initWithUTF16String: string
				  length: of_string_utf16_length(string)
			       byteOrder: OF_BYTE_ORDER_NATIVE];
}

- initWithUTF16String: (const uint16_t*)string
	       length: (size_t)length
{
	return [self initWithUTF16String: string
				  length: length
			       byteOrder: OF_BYTE_ORDER_NATIVE];
}

- initWithUTF16String: (const uint16_t*)string
	    byteOrder: (of_byte_order_t)byteOrder
{
	return [self initWithUTF16String: string
				  length: of_string_utf16_length(string)
			       byteOrder: byteOrder];
}

- initWithUTF16String: (const uint16_t*)string
	       length: (size_t)length
	    byteOrder: (of_byte_order_t)byteOrder
{
	@try {
		[self doesNotRecognizeSelector: _cmd];
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
	size_t length = [self length];
	uint16_t *ret;
	size_t i, j;
	BOOL swap = (byteOrder != OF_BYTE_ORDER_NATIVE);

	/* Allocate memory for the worst case */
	ret = [object allocMemoryWithSize: sizeof(uint16_t)
				    count: length * 2];

	j = 0;

	for (i = 0; i < length; i++) {
		of_unichar_t c = characters[i];

		if (c > 0x10FFFF)







|







2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
	size_t length = [self length];
	uint16_t *ret;
	size_t i, j;
	BOOL swap = (byteOrder != OF_BYTE_ORDER_NATIVE);

	/* Allocate memory for the worst case */
	ret = [object allocMemoryWithSize: sizeof(uint16_t)
				    count: (length + 1) * 2];

	j = 0;

	for (i = 0; i < length; i++) {
		of_unichar_t c = characters[i];

		if (c > 0x10FFFF)
2036
2037
2038
2039
2040
2041
2042

2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
				c -= 0x10000;
				ret[j++] = 0xD800 | (c >> 10);
				ret[j++] = 0xDC00 | (c & 0x3FF);
			} else
				ret[j++] = c;
		}
	}


	@try {
		ret = [object resizeMemory: ret
				      size: sizeof(uint16_t)
				     count: j];
	} @catch (OFOutOfMemoryException *e) {
		/* We don't care, as we only tried to make it smaller */
	}

	objc_autoreleasePoolPop(pool);

	return ret;







>




|







2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
				c -= 0x10000;
				ret[j++] = 0xD800 | (c >> 10);
				ret[j++] = 0xDC00 | (c & 0x3FF);
			} else
				ret[j++] = c;
		}
	}
	ret[j] = 0;

	@try {
		ret = [object resizeMemory: ret
				      size: sizeof(uint16_t)
				     count: j + 1];
	} @catch (OFOutOfMemoryException *e) {
		/* We don't care, as we only tried to make it smaller */
	}

	objc_autoreleasePoolPop(pool);

	return ret;

Modified tests/OFStringTests.m from [101fcea2a5] to [7f9159fd57].

43
44
45
46
47
48
49
50
51
52
53

54
55
56
57
58
59
60
	0xFEFF, 'f', 0xF6, 0xF6, 'b', 0xE4, 'r', 0x1F03A
};
static of_unichar_t sucstr[] = {
	0xFFFE0000, 0x66000000, 0xF6000000, 0xF6000000, 0x62000000, 0xE4000000,
	0x72000000, 0x3AF00100
};
static uint16_t utf16str[] = {
	0xFEFF, 'f', 0xF6, 0xF6, 'b', 0xE4, 'r', 0xD83C, 0xDC3A
};
static uint16_t sutf16str[] = {
	0xFFFE, 0x6600, 0xF600, 0xF600, 0x6200, 0xE400, 0x7200, 0x3CD8, 0x3ADC

};

@interface EntityHandler: OFObject <OFStringXMLUnescapingDelegate>
@end

@implementation EntityHandler
-	   (OFString*)string: (OFString*)string







|


|
>







43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
	0xFEFF, 'f', 0xF6, 0xF6, 'b', 0xE4, 'r', 0x1F03A
};
static of_unichar_t sucstr[] = {
	0xFFFE0000, 0x66000000, 0xF6000000, 0xF6000000, 0x62000000, 0xE4000000,
	0x72000000, 0x3AF00100
};
static uint16_t utf16str[] = {
	0xFEFF, 'f', 0xF6, 0xF6, 'b', 0xE4, 'r', 0xD83C, 0xDC3A, 0
};
static uint16_t sutf16str[] = {
	0xFFFE, 0x6600, 0xF600, 0xF600, 0x6200, 0xE400, 0x7200, 0x3CD8, 0x3ADC,
	0
};

@interface EntityHandler: OFObject <OFStringXMLUnescapingDelegate>
@end

@implementation EntityHandler
-	   (OFString*)string: (OFString*)string
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
						  sizeof(*ucstr)]) &&
	    [is isEqual: @"fööbär🀺"] &&
	    (is = [OFString stringWithCharacters: sucstr
					  length: sizeof(sucstr) /
						  sizeof(*sucstr)]) &&
	    [is isEqual: @"fööbär🀺"])

	TEST(@"+[stringWithUTF16String:length:]",
	    (is = [OFString stringWithUTF16String: utf16str
					   length: sizeof(utf16str) /
						   sizeof(*utf16str)]) &&
	    [is isEqual: @"fööbär🀺"] &&
	    (is = [OFString stringWithUTF16String: sutf16str
					   length: sizeof(sutf16str) /
						   sizeof(*sutf16str)]) &&
	    [is isEqual: @"fööbär🀺"])

	TEST(@"+[stringWithContentsOfFile:encoding]", (is = [OFString
	    stringWithContentsOfFile: @"testfile.txt"
			    encoding: OF_STRING_ENCODING_ISO_8859_1]) &&
	    [is isEqual: @"testäöü"])








|
|
<
<

|
<
<







163
164
165
166
167
168
169
170
171


172
173


174
175
176
177
178
179
180
						  sizeof(*ucstr)]) &&
	    [is isEqual: @"fööbär🀺"] &&
	    (is = [OFString stringWithCharacters: sucstr
					  length: sizeof(sucstr) /
						  sizeof(*sucstr)]) &&
	    [is isEqual: @"fööbär🀺"])

	TEST(@"+[stringWithUTF16String:]",
	    (is = [OFString stringWithUTF16String: utf16str]) &&


	    [is isEqual: @"fööbär🀺"] &&
	    (is = [OFString stringWithUTF16String: sutf16str]) &&


	    [is isEqual: @"fööbär🀺"])

	TEST(@"+[stringWithContentsOfFile:encoding]", (is = [OFString
	    stringWithContentsOfFile: @"testfile.txt"
			    encoding: OF_STRING_ENCODING_ISO_8859_1]) &&
	    [is isEqual: @"testäöü"])