ObjFW  Check-in [0480a27d5e]

Overview
Comment: Full Unicode support for OFMutableString's -[upper] and -[lower].
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 0480a27d5eafbf913663519293f3b289cefd26cbad6c2031c5dad6cda7665e3a
User & Date: js on 2009-10-15 20:38:14
Other Links: manifest | tags
Context
2009-10-16
08:44
OFMutableString's -[upper] and -[lower]: Get rid of code duplication. check-in: 8389241a05 user: js tags: trunk
2009-10-15
20:38
Full Unicode support for OFMutableString's -[upper] and -[lower]. check-in: 0480a27d5e user: js tags: trunk
2009-10-12
16:57
Add generated Unicode tables. check-in: 0c8ad4fef3 user: js tags: trunk
Changes

Modified src/OFFile.m from [eabe2a45bb] to [12d8e10088].

159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
	fp = fp_;

	return self;
}

/*
 * Releases the receiver. The underlying stdio stream is closed with
 * fclose() only when the close flag is set and a stream is present.
 */
- (void)dealloc
{
	/*
	 * NOTE(review): the flag is compared against YES, so only the exact
	 * value YES triggers the fclose(); any other non-zero value does not.
	 */
	if (close == YES && fp != NULL)
		fclose(fp);

	[super dealloc];
}

- (BOOL)atEndOfStream
{







|







159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
	fp = fp_;

	return self;
}

/*
 * Releases the receiver. When a stream is present and the close flag is
 * set, the stream is closed with fclose() before the superclass is asked to
 * deallocate.
 */
- (void)dealloc
{
	if (fp != NULL && close) {
		fclose(fp);
	}

	[super dealloc];
}

- (BOOL)atEndOfStream
{

Modified src/OFMutableString.m from [693bddf4d9] to [faf789896c].

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31



32
33
34
35
36
37
38
#include "config.h"

#define _GNU_SOURCE
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#ifdef HAVE_MADVISE
#include <sys/mman.h>
#else
#define madvise(addr, len, advise)
#endif

#import "OFMutableString.h"
#import "OFExceptions.h"
#import "OFMacros.h"

#import "asprintf.h"




@implementation OFMutableString
- setToCString: (const char*)str
{
	size_t len;

	if (string != NULL)







|












>
>
>







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#include "config.h"

#define _GNU_SOURCE
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#ifdef HAVE_MADVISE
#include <sys/mman.h>
#else
#define madvise(addr, len, advise)
#endif

#import "OFMutableString.h"
#import "OFExceptions.h"
#import "OFMacros.h"

#import "asprintf.h"

extern const of_unichar_t* const of_unicode_upper_table[0x1100];
extern const of_unichar_t* const of_unicode_lower_table[0x1100];

@implementation OFMutableString
- setToCString: (const char*)str
{
	size_t len;

	if (string != NULL)
272
273
274
275
276
277
278




279
280
281
























282
283
284




285



















































286
287
288
289
290
291




292
293
294
























295
296
297




298



















































299
300
301
302
303
304
305
	madvise(string, len, MADV_NORMAL);

	return self;
}

/*
 * Upper-cases the receiver in place, one byte at a time using toupper(),
 * and returns self. Throws OFInvalidEncodingException when is_utf8 is set,
 * so only strings not flagged as UTF-8 are converted.
 */
- upper
{




	/* Start one past the last byte; the loop below walks backwards. */
	char *p = string + length;

	/* Byte-wise case mapping is refused for strings flagged as UTF-8. */
	if (is_utf8)
























		@throw [OFInvalidEncodingException newWithClass: isa];

	/* Pre-decrement, then convert, until p moves before the first byte. */
	while (--p >= string)




		*p = toupper((int)*p);




















































	return self;
}

/*
 * Lower-cases the receiver in place, one byte at a time using tolower(),
 * and returns self. Throws OFInvalidEncodingException when is_utf8 is set,
 * so only strings not flagged as UTF-8 are converted.
 */
- lower
{




	/* Start one past the last byte; the loop below walks backwards. */
	char *p = string + length;

	/* Byte-wise case mapping is refused for strings flagged as UTF-8. */
	if (is_utf8)
























		@throw [OFInvalidEncodingException newWithClass: isa];

	/* Pre-decrement, then convert, until p moves before the first byte. */
	while (--p >= string)




		*p = tolower((int)*p);




















































	return self;
}

- removeCharactersFromIndex: (size_t)start
		    toIndex: (size_t)end
{







>
>
>
>
|

|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>






>
>
>
>
|

|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
	madvise(string, len, MADV_NORMAL);

	return self;
}

/*
 * Converts the receiver to upper case in place and returns self.
 *
 * Strings not flagged as UTF-8 are mapped byte by byte through page 0 of
 * of_unicode_upper_table. Strings flagged as UTF-8 are decoded into a
 * temporary array of code points, each code point is mapped through
 * of_unicode_upper_table (an entry of 0 means "no mapping, keep the
 * character"), and the result is re-encoded into a newly allocated buffer
 * that replaces the old one.
 *
 * Throws OFInvalidEncodingException if a sequence cannot be decoded, if a
 * code point lies outside 0..0x10FFFF, if more characters are decoded than
 * -[length] reported, or if a mapped character cannot be encoded. Exceptions
 * raised by the memory allocation methods are passed through after the
 * temporary array has been freed.
 */
- upper
{
	of_unichar_t c, uc;
	of_unichar_t *ustr;
	size_t ulen, nlen;
	size_t i, j, d;
	char *nstr;

	if (!is_utf8) {
		uint8_t *bytes = (uint8_t*)string;
		uint8_t t;

		/*
		 * Every byte is mapped independently, so the direction does
		 * not matter. Indexing forwards avoids forming a pointer in
		 * front of the buffer, which the former
		 * "while (--p >= string)" loop did.
		 */
		for (i = 0; i < length; i++) {
			t = of_unicode_upper_table[0][bytes[i]];
			if (t != 0)
				bytes[i] = t;
		}

		return self;
	}

	/*
	 * One of_unichar_t per character. The item size has to be the size
	 * of a code point; passing the character count as the item size
	 * allocated too little for very short strings.
	 * NOTE(review): assumes the second argument of
	 * -[allocMemoryForNItems:withSize:] is the per-item size in bytes,
	 * as its name suggests -- confirm against OFObject.
	 */
	ulen = [self length];
	ustr = [self allocMemoryForNItems: ulen
				 withSize: sizeof(of_unichar_t)];

	j = 0;
	nlen = 0;

	for (i = 0; i < length; i++) {
		c = of_string_utf8_to_unicode(string + i, length - i);

		if (c == OF_INVALID_UNICHAR || c > 0x10FFFF) {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}

		/* Never write past the end of the code point array. */
		if (j >= ulen) {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}

		/* c <= 0x10FFFF, so c >> 8 is within the 0x1100 pages. */
		uc = of_unicode_upper_table[c >> 8][c & 0xFF];
		if (uc == 0)
			uc = c;
		ustr[j++] = uc;

		/* Number of UTF-8 bytes the mapped character will need. */
		if (uc < 0x80)
			nlen++;
		else if (uc < 0x800)
			nlen += 2;
		else if (uc < 0x10000)
			nlen += 3;
		else if (uc < 0x110000)
			nlen += 4;
		else {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}

		/*
		 * Skip the remaining bytes of the sequence just decoded; the
		 * loop increment accounts for the first one. c is known to
		 * be <= 0x10FFFF here, so no further range check is needed.
		 */
		if (c >= 0x10000)
			i += 3;
		else if (c >= 0x800)
			i += 2;
		else if (c >= 0x80)
			i++;
	}

	/* Only re-encode the code points that were actually decoded. */
	ulen = j;

	@try {
		nstr = [self allocMemoryWithSize: nlen + 1];
	} @catch (OFException *e) {
		[self freeMemory: ustr];
		@throw e;
	}

	j = 0;

	for (i = 0; i < ulen; i++) {
		if ((d = of_string_unicode_to_utf8(ustr[i], nstr + j)) == 0) {
			[self freeMemory: ustr];
			[self freeMemory: nstr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}
		j += d;
	}

	/* The encoder must have produced exactly the predicted length. */
	assert(j == nlen);
	nstr[j] = 0;
	[self freeMemory: ustr];

	[self freeMemory: string];
	string = nstr;
	length = nlen;

	return self;
}

/*
 * Converts the receiver to lower case in place and returns self.
 *
 * Strings not flagged as UTF-8 are mapped byte by byte through page 0 of
 * of_unicode_lower_table. Strings flagged as UTF-8 are decoded into a
 * temporary array of code points, each code point is mapped through
 * of_unicode_lower_table (an entry of 0 means "no mapping, keep the
 * character"), and the result is re-encoded into a newly allocated buffer
 * that replaces the old one.
 *
 * Throws OFInvalidEncodingException if a sequence cannot be decoded, if a
 * code point lies outside 0..0x10FFFF, if more characters are decoded than
 * -[length] reported, or if a mapped character cannot be encoded. Exceptions
 * raised by the memory allocation methods are passed through after the
 * temporary array has been freed.
 */
- lower
{
	of_unichar_t c, lc;
	of_unichar_t *ustr;
	size_t ulen, nlen;
	size_t i, j, d;
	char *nstr;

	if (!is_utf8) {
		uint8_t *bytes = (uint8_t*)string;
		uint8_t t;

		/*
		 * Every byte is mapped independently, so the direction does
		 * not matter. Indexing forwards avoids forming a pointer in
		 * front of the buffer, which the former
		 * "while (--p >= string)" loop did.
		 */
		for (i = 0; i < length; i++) {
			t = of_unicode_lower_table[0][bytes[i]];
			if (t != 0)
				bytes[i] = t;
		}

		return self;
	}

	/*
	 * One of_unichar_t per character. The item size has to be the size
	 * of a code point; passing the character count as the item size
	 * allocated too little for very short strings.
	 * NOTE(review): assumes the second argument of
	 * -[allocMemoryForNItems:withSize:] is the per-item size in bytes,
	 * as its name suggests -- confirm against OFObject.
	 */
	ulen = [self length];
	ustr = [self allocMemoryForNItems: ulen
				 withSize: sizeof(of_unichar_t)];

	j = 0;
	nlen = 0;

	for (i = 0; i < length; i++) {
		c = of_string_utf8_to_unicode(string + i, length - i);

		if (c == OF_INVALID_UNICHAR || c > 0x10FFFF) {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}

		/* Never write past the end of the code point array. */
		if (j >= ulen) {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}

		/* c <= 0x10FFFF, so c >> 8 is within the 0x1100 pages. */
		lc = of_unicode_lower_table[c >> 8][c & 0xFF];
		if (lc == 0)
			lc = c;
		ustr[j++] = lc;

		/* Number of UTF-8 bytes the mapped character will need. */
		if (lc < 0x80)
			nlen++;
		else if (lc < 0x800)
			nlen += 2;
		else if (lc < 0x10000)
			nlen += 3;
		else if (lc < 0x110000)
			nlen += 4;
		else {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}

		/*
		 * Skip the remaining bytes of the sequence just decoded; the
		 * loop increment accounts for the first one. c is known to
		 * be <= 0x10FFFF here, so no further range check is needed.
		 */
		if (c >= 0x10000)
			i += 3;
		else if (c >= 0x800)
			i += 2;
		else if (c >= 0x80)
			i++;
	}

	/* Only re-encode the code points that were actually decoded. */
	ulen = j;

	@try {
		nstr = [self allocMemoryWithSize: nlen + 1];
	} @catch (OFException *e) {
		[self freeMemory: ustr];
		@throw e;
	}

	j = 0;

	for (i = 0; i < ulen; i++) {
		if ((d = of_string_unicode_to_utf8(ustr[i], nstr + j)) == 0) {
			[self freeMemory: ustr];
			[self freeMemory: nstr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}
		j += d;
	}

	/* The encoder must have produced exactly the predicted length. */
	assert(j == nlen);
	nstr[j] = 0;
	[self freeMemory: ustr];

	[self freeMemory: string];
	string = nstr;
	length = nlen;

	return self;
}

- removeCharactersFromIndex: (size_t)start
		    toIndex: (size_t)end
{

Modified src/OFString.m from [2ba568e7fc] to [0b43a03a38].

141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157

158

159
160
161
162
163

164

165
166
167
168
169
170
171
}

of_unichar_t
of_string_utf8_to_unicode(const char *buf_, size_t len)
{
	const uint8_t *buf = (const uint8_t*)buf_;

	if (*buf < 0x80)
		return buf[0];

	switch (*buf & 0xF0) {
	case 0xC0:
	case 0xD0:
		if (OF_UNLIKELY(len < 2))
			return OF_INVALID_UNICHAR;

		return ((buf[0] & 0x1F) << 6) | (buf[1] & 0x3F);

	case 0xE0:

		if (OF_UNLIKELY(len < 3))
			return OF_INVALID_UNICHAR;

		return ((buf[0] & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) |
		    (buf[2] & 0x3F);

	case 0xF0:

		if (OF_UNLIKELY(len < 4))
			return OF_INVALID_UNICHAR;

		return ((buf[0] & 0x07) << 18) | ((buf[1] & 0x3F) << 12) |
		    ((buf[2] & 0x3F) << 6) | (buf[3] & 0x3F);
	}








|


|
<
<




>
|
>





>
|
>







141
142
143
144
145
146
147
148
149
150
151


152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
}

of_unichar_t
of_string_utf8_to_unicode(const char *buf_, size_t len)
{
	const uint8_t *buf = (const uint8_t*)buf_;

	if (!(*buf & 0x80))
		return buf[0];

	if ((*buf & 0xE0) == 0xC0) {


		if (OF_UNLIKELY(len < 2))
			return OF_INVALID_UNICHAR;

		return ((buf[0] & 0x1F) << 6) | (buf[1] & 0x3F);
	}

	if ((*buf & 0xF0) == 0xE0) {
		if (OF_UNLIKELY(len < 3))
			return OF_INVALID_UNICHAR;

		return ((buf[0] & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) |
		    (buf[2] & 0x3F);
	}

	if ((*buf & 0xF8) == 0xF0) {
		if (OF_UNLIKELY(len < 4))
			return OF_INVALID_UNICHAR;

		return ((buf[0] & 0x07) << 18) | ((buf[1] & 0x3F) << 12) |
		    ((buf[2] & 0x3F) << 6) | (buf[3] & 0x3F);
	}

323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
		string[length] = 0;

		break;
	case OF_STRING_ENCODING_ISO_8859_1:
	case OF_STRING_ENCODING_ISO_8859_15:
	case OF_STRING_ENCODING_WINDOWS_1252:
		for (i = j = 0; i < len; i++) {
			if ((uint8_t)str[i] < 0x80)
				string[j++] = str[i];
			else {
				char buf[4];
				of_unichar_t chr;
				size_t chr_bytes;

				switch (encoding) {







|







325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
		string[length] = 0;

		break;
	case OF_STRING_ENCODING_ISO_8859_1:
	case OF_STRING_ENCODING_ISO_8859_15:
	case OF_STRING_ENCODING_WINDOWS_1252:
		for (i = j = 0; i < len; i++) {
			if (!(str[i] & 0x80))
				string[j++] = str[i];
			else {
				char buf[4];
				of_unichar_t chr;
				size_t chr_bytes;

				switch (encoding) {

Modified tests/string.m from [9be34bcdeb] to [093f48c183].

73
74
75
76
77
78
79
80

81


82

83
84
85
86
87
88
89
	    [s[0] characterAtIndex: 5] == 0x1D11E)

	EXPECT_EXCEPTION(@"Detect out of range in -[characterAtIndex:]",
	    OFOutOfRangeException, [s[0] characterAtIndex: 7])

	TEST(@"-[reverse]", [[s[0] reverse] isEqual: @"3𝄞1€sät"])

	s[0] = [OFMutableString stringWithString: @"321tset"];

	TEST(@"-[upper]", [[s[0] upper] isEqual: @"321TSET"])


	TEST(@"-[lower]", [[s[0] lower] isEqual: @"321tset"])


	TEST(@"+[stringWithCString:length:]",
	    (s[0] = [OFMutableString stringWithCString: "foobar"
					      length: 3]) &&
	    [s[0] isEqual: @"foo"])

	TEST(@"-[appendCStringWithLength:]",







|
>
|
>
>
|
>







73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
	    [s[0] characterAtIndex: 5] == 0x1D11E)

	EXPECT_EXCEPTION(@"Detect out of range in -[characterAtIndex:]",
	    OFOutOfRangeException, [s[0] characterAtIndex: 7])

	TEST(@"-[reverse]", [[s[0] reverse] isEqual: @"3𝄞1€sät"])

	s[1] = [OFMutableString stringWithString: @"abc"];

	TEST(@"-[upper]", [[s[0] upper] isEqual: @"3𝄞1€SÄT"] &&
	    [[s[1] upper] isEqual: @"ABC"])

	TEST(@"-[lower]", [[s[0] lower] isEqual: @"3𝄞1€sät"] &&
	    [[s[1] lower] isEqual: @"abc"])

	TEST(@"+[stringWithCString:length:]",
	    (s[0] = [OFMutableString stringWithCString: "foobar"
					      length: 3]) &&
	    [s[0] isEqual: @"foo"])

	TEST(@"-[appendCStringWithLength:]",