ObjFW  Check-in [3d007c8393]

Overview
Comment: Improve of_string_utf8_to_unicode.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 3d007c839338fafc351f5f4948217dce3ef33ef8cd771cdd03f8dec1bfea533b
User & Date: js on 2009-11-10 15:34:39
Other Links: manifest | tags
Context
2009-11-10
20:13
Fix a typo in TableGenerator.m and the resulting unicode.h. check-in: c628317621 user: js tags: trunk
15:34
Improve of_string_utf8_to_unicode. check-in: 3d007c8393 user: js tags: trunk
15:32
Improve -[compare:]. check-in: 6772512e3e user: js tags: trunk
Changes

Modified src/OFMutableString.m from [be87eedb20] to [ae31141f77].

33
34
35
36
37
38
39
40

41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60

61
62
63
64
65


66
67

68
69
70
71
72
73
74
33
34
35
36
37
38
39

40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64


65
66
67

68
69
70
71
72
73
74
75







-
+




















+



-
-
+
+

-
+








static void
apply_table(id self, Class isa, char **string, unsigned int *length,
    BOOL is_utf8, const of_unichar_t* const table[], const size_t table_size)
{
	of_unichar_t c, tc;
	of_unichar_t *ustr;
	size_t ulen, nlen;
	size_t ulen, nlen, clen;
	size_t i, j, d;
	char *nstr;

	if (!is_utf8) {
		assert(table_size >= 1);

		uint8_t *p = (uint8_t*)*string + *length;
		uint8_t t;

		while (--p >= (uint8_t*)*string)
			if ((t = table[0][*p]) != 0)
				*p = t;

		return;
	}

	ulen = [self length];
	ustr = [self allocMemoryForNItems: [self length]
				 withSize: ulen];

	i = 0;
	j = 0;
	nlen = 0;

	for (i = 0; i < *length; i++) {
		c = of_string_utf8_to_unicode(*string + i, *length - i);
	while (i < *length) {
		clen = of_string_utf8_to_unicode(*string + i, *length - i, &c);

		if (c == OF_INVALID_UNICHAR || c > 0x10FFFF) {
		if (clen == 0 || c > 0x10FFFF) {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}

		if (c >> 8 < table_size) {
			if ((tc = table[c >> 8][c & 0xFF]) == 0)
				tc = c;
85
86
87
88
89
90
91
92
93
94
95
96

97
98
99
100
101
102
103
104
105
106
107
108
109
86
87
88
89
90
91
92





93






94
95
96
97
98
99
100







-
-
-
-
-
+
-
-
-
-
-
-







		else if (tc < 0x110000)
			nlen += 4;
		else {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}

		if (c < 0x80);
		else if (c < 0x800)
			i++;
		else if (c < 0x10000)
			i += 2;
		i += clen;
		else if (c < 0x110000)
			i += 3;
		else {
			[self freeMemory: ustr];
			@throw [OFInvalidEncodingException newWithClass: isa];
		}
	}

	@try {
		nstr = [self allocMemoryWithSize: nlen + 1];
	} @catch (OFException *e) {
		[self freeMemory: ustr];
		@throw e;

Modified src/OFString.h from [1b35f65444] to [a8cd4bc958].

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

32
33
34
35
36
37
38
11
12
13
14
15
16
17


18
19
20
21
22
23
24
25
26
27
28

29
30
31
32
33
34
35
36







-
-











-
+








#include <stdio.h>
#include <stdarg.h>

#import "OFObject.h"
#import "OFArray.h"

#define OF_INVALID_UNICHAR UINT32_MAX

typedef uint32_t of_unichar_t;

enum of_string_encoding {
	OF_STRING_ENCODING_UTF_8,
	OF_STRING_ENCODING_ISO_8859_1,
	OF_STRING_ENCODING_ISO_8859_15,
	OF_STRING_ENCODING_WINDOWS_1252
};

/* Helper functions for UTF-8 handling that are declared extern here. */
extern int of_string_check_utf8(const char*, size_t);
extern size_t of_string_unicode_to_utf8(of_unichar_t, char*);
/*
 * NOTE: this is a rendered diff. The next two lines are the removed and the
 * added prototype of of_string_utf8_to_unicode, in that order.
 *
 * Removed form: returns the decoded code point, or OF_INVALID_UNICHAR on
 * failure.
 * Added form: stores the decoded code point through the third argument and
 * returns the length of the sequence in bytes, or 0 on failure.
 */
extern of_unichar_t of_string_utf8_to_unicode(const char*, size_t);
extern size_t of_string_utf8_to_unicode(const char*, size_t, of_unichar_t*);
extern size_t of_string_position_to_index(const char*, size_t);
extern size_t of_string_index_to_position(const char*, size_t, size_t);

/**
 * A class for managing strings.
 */
@interface OFString: OFObject <OFCopying, OFMutableCopying>

Modified src/OFString.m from [c3f91002a4] to [b097e022e5].

136
137
138
139
140
141
142
143
144


145
146
147
148
149




150
151
152
153

154
155


156
157
158
159
160

161
162

163

164
165
166
167
168

169
170

171

172
173
174

175
176
177
178
179
180
181
136
137
138
139
140
141
142


143
144
145
146
147


148
149
150
151
152
153
154

155
156

157
158
159
160
161
162

163
164

165
166
167
168
169
170
171

172
173

174
175
176
177
178

179
180
181
182
183
184
185
186







-
-
+
+



-
-
+
+
+
+



-
+

-
+
+




-
+

-
+

+




-
+

-
+

+


-
+







		buf[i] = 0x80 | (c & 0x3F);
		return 4;
	}

	return 0;
}

/*
 * Decodes the single UTF-8 sequence that starts at buf_.
 *
 * NOTE: this span is a rendered diff. Every removed line is directly followed
 * by the line that replaces it, so the text is not compilable as shown.
 *
 * Removed form: returns the decoded code point, or OF_INVALID_UNICHAR on
 * failure.
 * Added form: stores the decoded code point in *ret and returns the number of
 * bytes the sequence occupies (1 to 4). It returns 0, leaving *ret untouched,
 * when the lead byte matches none of the four patterns below or when len is
 * smaller than the sequence length announced by the lead byte.
 *
 * buf_ - pointer to the first byte of the sequence
 * len  - number of bytes available at buf_
 * ret  - (added form only) receives the decoded code point on success
 *
 * Caveats visible in this code:
 *  - The lead byte is read before len is consulted, so len is assumed to be
 *    at least 1.
 *  - Continuation bytes are masked with 0x3F but not checked for the 10xxxxxx
 *    pattern, and overlong forms, surrogates and values above 0x10FFFF are not
 *    rejected here.
 */
of_unichar_t
of_string_utf8_to_unicode(const char *buf_, size_t len)
size_t
of_string_utf8_to_unicode(const char *buf_, size_t len, of_unichar_t *ret)
{
	const uint8_t *buf = (const uint8_t*)buf_;

	/* Lead byte 0xxxxxxx: one-byte sequence, the byte is the code point. */
	if (!(*buf & 0x80))
		return buf[0];
	if (!(*buf & 0x80)) {
		*ret = buf[0];
		return 1;
	}

	/* Lead byte 110xxxxx: two-byte sequence, 5 + 6 payload bits. */
	if ((*buf & 0xE0) == 0xC0) {
		if (OF_UNLIKELY(len < 2))
			return OF_INVALID_UNICHAR;
			return 0;

		return ((buf[0] & 0x1F) << 6) | (buf[1] & 0x3F);
		*ret = ((buf[0] & 0x1F) << 6) | (buf[1] & 0x3F);
		return 2;
	}

	/* Lead byte 1110xxxx: three-byte sequence, 4 + 6 + 6 payload bits. */
	if ((*buf & 0xF0) == 0xE0) {
		if (OF_UNLIKELY(len < 3))
			return OF_INVALID_UNICHAR;
			return 0;

		return ((buf[0] & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) |
		*ret = ((buf[0] & 0x0F) << 12) | ((buf[1] & 0x3F) << 6) |
		    (buf[2] & 0x3F);
		return 3;
	}

	/* Lead byte 11110xxx: four-byte sequence, 3 + 6 + 6 + 6 payload bits. */
	if ((*buf & 0xF8) == 0xF0) {
		if (OF_UNLIKELY(len < 4))
			return OF_INVALID_UNICHAR;
			return 0;

		return ((buf[0] & 0x07) << 18) | ((buf[1] & 0x3F) << 12) |
		*ret = ((buf[0] & 0x07) << 18) | ((buf[1] & 0x3F) << 12) |
		    ((buf[2] & 0x3F) << 6) | (buf[3] & 0x3F);
		return 4;
	}

	/* Any other lead byte (a stray continuation byte or 0xF8..0xFF). */
	return OF_INVALID_UNICHAR;
	return 0;
}

size_t
of_string_position_to_index(const char *str, size_t pos)
{
	size_t i, idx = pos;

611
612
613
614
615
616
617
618

619
620
621
622
623
624
625
626
616
617
618
619
620
621
622

623

624
625
626
627
628
629
630







-
+
-







	of_unichar_t c;

	index = of_string_index_to_position(string, index, length);

	if (index >= length)
		@throw [OFOutOfRangeException newWithClass: isa];

	if ((c = of_string_utf8_to_unicode(string + index, length - index)) ==
	if (!of_string_utf8_to_unicode(string + index, length - index, &c))
	    OF_INVALID_UNICHAR)
		@throw [OFInvalidEncodingException newWithClass: isa];

	return c;
}

- (size_t)indexOfFirstOccurrenceOfString: (OFString*)str
{