ObjFW  Check-in [3d007c8393]

Overview
Comment:Improve of_string_utf8_to_unicode.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 3d007c839338fafc351f5f4948217dce3ef33ef8cd771cdd03f8dec1bfea533b
User & Date: js on 2009-11-10 15:34:39
Other Links: manifest | tags
Context
2009-11-10
20:13
Fix a typo in TableGenerator.m and the resulting unicode.h. check-in: c628317621 user: js tags: trunk
15:34
Improve of_string_utf8_to_unicode. check-in: 3d007c8393 user: js tags: trunk
15:32
Improve -[compare:]. check-in: 6772512e3e user: js tags: trunk
Changes

Modified src/OFMutableString.m from [be87eedb20] to [ae31141f77].

33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60

61
62
63
64
65
66
67
68
69
70
71
72
73
74

static void
apply_table(id self, Class isa, char **string, unsigned int *length,
    BOOL is_utf8, const of_unichar_t* const table[], const size_t table_size)
{
	of_unichar_t c, tc;
	of_unichar_t *ustr;
	size_t ulen, nlen;
	size_t i, j, d;
	char *nstr;

	if (!is_utf8) {
		assert(table_size >= 1);

		uint8_t *p = (uint8_t*)*string + *length;
		uint8_t t;

		while (--p >= (uint8_t*)*string)
			if ((t = table[0][*p]) != 0)
				*p = t;

		return;
	}

	ulen = [self length];
	ustr = [self allocMemoryForNItems: [self length]
				 withSize: ulen];


	j = 0;
	nlen = 0;

	for (i = 0; i < *length; i++) {
		c = of_string_utf8_to_unicode(*string + i, *length - i);

		if (c == OF_INVALID_UNICHAR || c > 0x10FFFF) {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}

		if (c >> 8 < table_size) {
			if ((tc = table[c >> 8][c & 0xFF]) == 0)
				tc = c;







|




















>



|
|

|







33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75

static void
apply_table(id self, Class isa, char **string, unsigned int *length,
    BOOL is_utf8, const of_unichar_t* const table[], const size_t table_size)
{
	of_unichar_t c, tc;
	of_unichar_t *ustr;
	size_t ulen, nlen, clen;
	size_t i, j, d;
	char *nstr;

	if (!is_utf8) {
		assert(table_size >= 1);

		uint8_t *p = (uint8_t*)*string + *length;
		uint8_t t;

		while (--p >= (uint8_t*)*string)
			if ((t = table[0][*p]) != 0)
				*p = t;

		return;
	}

	ulen = [self length];
	ustr = [self allocMemoryForNItems: [self length]
				 withSize: ulen];

	i = 0;
	j = 0;
	nlen = 0;

	while (i < *length) {
		clen = of_string_utf8_to_unicode(*string + i, *length - i, &c);

		if (clen == 0 || c > 0x10FFFF) {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}

		if (c >> 8 < table_size) {
			if ((tc = table[c >> 8][c & 0xFF]) == 0)
				tc = c;
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
		else if (tc < 0x110000)
			nlen += 4;
		else {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}

		if (c < 0x80);
		else if (c < 0x800)
			i++;
		else if (c < 0x10000)
			i += 2;
		else if (c < 0x110000)
			i += 3;
		else {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}
	}

	@try {
		nstr = [self allocMemoryWithSize: nlen + 1];
	} @catch (OFException *e) {
		[self freeMemory: ustr];
		@throw e;







<
<
<
<
|
<
<
<
<
<
<







86
87
88
89
90
91
92




93






94
95
96
97
98
99
100
		else if (tc < 0x110000)
			nlen += 4;
		else {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}





		i += clen;






	}

	@try {
		nstr = [self allocMemoryWithSize: nlen + 1];
	} @catch (OFException *e) {
		[self freeMemory: ustr];
		@throw e;

Modified src/OFString.h from [1b35f65444] to [a8cd4bc958].

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

#include <stdio.h>
#include <stdarg.h>

#import "OFObject.h"
#import "OFArray.h"

#define OF_INVALID_UNICHAR UINT32_MAX

typedef uint32_t of_unichar_t;

enum of_string_encoding {
	OF_STRING_ENCODING_UTF_8,
	OF_STRING_ENCODING_ISO_8859_1,
	OF_STRING_ENCODING_ISO_8859_15,
	OF_STRING_ENCODING_WINDOWS_1252
};

extern int of_string_check_utf8(const char*, size_t);
extern size_t of_string_unicode_to_utf8(of_unichar_t, char*);
/*
 * Decodes the single UTF-8 sequence starting at the given buffer, of which
 * the given number of bytes are available. Returns the decoded code point, or
 * OF_INVALID_UNICHAR if no code point can be decoded.
 */
extern of_unichar_t of_string_utf8_to_unicode(const char*, size_t);
extern size_t of_string_position_to_index(const char*, size_t);
extern size_t of_string_index_to_position(const char*, size_t, size_t);

/**
 * A class for managing strings.
 */
@interface OFString: OFObject <OFCopying, OFMutableCopying>







<
<











|







11
12
13
14
15
16
17


18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

#include <stdio.h>
#include <stdarg.h>

#import "OFObject.h"
#import "OFArray.h"



typedef uint32_t of_unichar_t;

enum of_string_encoding {
	OF_STRING_ENCODING_UTF_8,
	OF_STRING_ENCODING_ISO_8859_1,
	OF_STRING_ENCODING_ISO_8859_15,
	OF_STRING_ENCODING_WINDOWS_1252
};

extern int of_string_check_utf8(const char*, size_t);
extern size_t of_string_unicode_to_utf8(of_unichar_t, char*);
/*
 * Decodes the single UTF-8 sequence starting at the given buffer, of which
 * the given number of bytes are available. On success, the decoded code point
 * is stored through the of_unichar_t pointer and the number of bytes the
 * sequence occupies (1 to 4) is returned. Returns 0 if no code point can be
 * decoded; the pointed-to value is not written in that case.
 */
extern size_t of_string_utf8_to_unicode(const char*, size_t, of_unichar_t*);
extern size_t of_string_position_to_index(const char*, size_t);
extern size_t of_string_index_to_position(const char*, size_t, size_t);

/**
 * A class for managing strings.
 */
@interface OFString: OFObject <OFCopying, OFMutableCopying>

Modified src/OFString.m from [c3f91002a4] to [b097e022e5].

136
137
138
139
140
141
142
143
144
145
146
147
148

149

150
151
152
153
154
155

156
157
158
159
160
161
162
163

164
165
166
167
168
169
170
171

172
173
174
175
176
177
178
179
180
181
		buf[i] = 0x80 | (c & 0x3F);
		return 4;
	}

	return 0;
}

/*
 * Decodes the single UTF-8 sequence that starts at buf_.
 *
 * buf_ points to the first byte of the sequence; len is the number of bytes
 * that may be read starting at buf_.
 *
 * Returns the decoded code point, or OF_INVALID_UNICHAR if len is 0, the lead
 * byte is not a valid UTF-8 lead byte, the sequence is truncated or one of the
 * trailing bytes is not a continuation byte (10xxxxxx).
 *
 * Overlong encodings and surrogate code points are not rejected here.
 */
of_unichar_t
of_string_utf8_to_unicode(const char *buf_, size_t len)
{
	const uint8_t *buf = (const uint8_t*)buf_;

	/* Nothing to decode; reading *buf would be out of bounds. */
	if (OF_UNLIKELY(len == 0))
		return OF_INVALID_UNICHAR;

	/* 0xxxxxxx: single byte (ASCII) */
	if (!(*buf & 0x80))
		return buf[0];

	/* 110xxxxx 10xxxxxx */
	if ((*buf & 0xE0) == 0xC0) {
		if (OF_UNLIKELY(len < 2))
			return OF_INVALID_UNICHAR;

		if (OF_UNLIKELY((buf[1] & 0xC0) != 0x80))
			return OF_INVALID_UNICHAR;

		return ((buf[0] & 0x1F) << 6) | (buf[1] & 0x3F);
	}

	/* 1110xxxx 10xxxxxx 10xxxxxx */
	if ((*buf & 0xF0) == 0xE0) {
		if (OF_UNLIKELY(len < 3))
			return OF_INVALID_UNICHAR;

		if (OF_UNLIKELY((buf[1] & 0xC0) != 0x80 ||
		    (buf[2] & 0xC0) != 0x80))
			return OF_INVALID_UNICHAR;

		return ((buf[0] & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) |
		    (buf[2] & 0x3F);
	}

	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	if ((*buf & 0xF8) == 0xF0) {
		if (OF_UNLIKELY(len < 4))
			return OF_INVALID_UNICHAR;

		if (OF_UNLIKELY((buf[1] & 0xC0) != 0x80 ||
		    (buf[2] & 0xC0) != 0x80 || (buf[3] & 0xC0) != 0x80))
			return OF_INVALID_UNICHAR;

		return ((buf[0] & 0x07) << 18) | ((buf[1] & 0x3F) << 12) |
		    ((buf[2] & 0x3F) << 6) | (buf[3] & 0x3F);
	}

	/* Stray continuation byte or invalid lead byte (0xF8..0xFF). */
	return OF_INVALID_UNICHAR;
}

size_t
of_string_position_to_index(const char *str, size_t pos)
{
	size_t i, idx = pos;








|
|



|
>
|
>



|

|
>




|

|

>




|

|

>


|







136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
		buf[i] = 0x80 | (c & 0x3F);
		return 4;
	}

	return 0;
}

/*
 * Decodes the single UTF-8 sequence that starts at buf_.
 *
 * buf_ points to the first byte of the sequence; len is the number of bytes
 * that may be read starting at buf_. On success, the decoded code point is
 * stored in *ret.
 *
 * Returns the number of bytes the sequence occupies (1 to 4), or 0 if len is
 * 0, the lead byte is not a valid UTF-8 lead byte, the sequence is truncated
 * or one of the trailing bytes is not a continuation byte (10xxxxxx). *ret is
 * left untouched when 0 is returned.
 *
 * Overlong encodings and surrogate code points are not rejected here.
 */
size_t
of_string_utf8_to_unicode(const char *buf_, size_t len, of_unichar_t *ret)
{
	const uint8_t *buf = (const uint8_t*)buf_;

	/* Nothing to decode; reading *buf would be out of bounds. */
	if (OF_UNLIKELY(len == 0))
		return 0;

	/* 0xxxxxxx: single byte (ASCII) */
	if (!(*buf & 0x80)) {
		*ret = buf[0];
		return 1;
	}

	/* 110xxxxx 10xxxxxx */
	if ((*buf & 0xE0) == 0xC0) {
		if (OF_UNLIKELY(len < 2))
			return 0;

		if (OF_UNLIKELY((buf[1] & 0xC0) != 0x80))
			return 0;

		*ret = ((buf[0] & 0x1F) << 6) | (buf[1] & 0x3F);
		return 2;
	}

	/* 1110xxxx 10xxxxxx 10xxxxxx */
	if ((*buf & 0xF0) == 0xE0) {
		if (OF_UNLIKELY(len < 3))
			return 0;

		if (OF_UNLIKELY((buf[1] & 0xC0) != 0x80 ||
		    (buf[2] & 0xC0) != 0x80))
			return 0;

		*ret = ((buf[0] & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) |
		    (buf[2] & 0x3F);
		return 3;
	}

	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	if ((*buf & 0xF8) == 0xF0) {
		if (OF_UNLIKELY(len < 4))
			return 0;

		if (OF_UNLIKELY((buf[1] & 0xC0) != 0x80 ||
		    (buf[2] & 0xC0) != 0x80 || (buf[3] & 0xC0) != 0x80))
			return 0;

		*ret = ((buf[0] & 0x07) << 18) | ((buf[1] & 0x3F) << 12) |
		    ((buf[2] & 0x3F) << 6) | (buf[3] & 0x3F);
		return 4;
	}

	/* Stray continuation byte or invalid lead byte (0xF8..0xFF). */
	return 0;
}

size_t
of_string_position_to_index(const char *str, size_t pos)
{
	size_t i, idx = pos;

611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
	of_unichar_t c;

	index = of_string_index_to_position(string, index, length);

	if (index >= length)
		@throw [OFOutOfRangeException newWithClass: isa];

	if ((c = of_string_utf8_to_unicode(string + index, length - index)) ==
	    OF_INVALID_UNICHAR)
		@throw [OFInvalidEncodingException newWithClass: isa];

	return c;
}

- (size_t)indexOfFirstOccurrenceOfString: (OFString*)str
{







|
<







616
617
618
619
620
621
622
623

624
625
626
627
628
629
630
	of_unichar_t c;

	index = of_string_index_to_position(string, index, length);

	if (index >= length)
		@throw [OFOutOfRangeException newWithClass: isa];

	if (!of_string_utf8_to_unicode(string + index, length - index, &c))

		@throw [OFInvalidEncodingException newWithClass: isa];

	return c;
}

- (size_t)indexOfFirstOccurrenceOfString: (OFString*)str
{