ObjFW  Check-in [566d4df603]

Overview
Comment:OFStdIOStream_Win32Console: Improve reading

On reads, a UTF-16 surrogate pair that has been split across two reads is now handled properly.

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 566d4df60303cdfcb2f21ada37ffd196efd48c28a368b4bb90cf1e3d6b29d5ea
User & Date: js on 2016-03-13 10:24:22
Other Links: manifest | tags
Context
2016-03-13
11:19
Change of_string_utf8_decode() API check-in: 5e2ef97c35 user: js tags: trunk
10:24
OFStdIOStream_Win32Console: Improve reading check-in: 566d4df603 user: js tags: trunk
2016-03-12
21:19
Work around Doxygen shortcomings check-in: 2425966b66 user: js tags: trunk
Changes

Modified src/OFStdIOStream_Win32Console.h from [5329f7273d] to [34f51c6008].

19
20
21
22
23
24
25

26
27
28
29
#import "OFStdIOStream.h"

OF_ASSUME_NONNULL_BEGIN

/*
 * OFStdIOStream subclass used for the Win32 console. It keeps the Win32 HANDLE
 * that console reads are issued on.
 */
@interface OFStdIOStream_Win32Console: OFStdIOStream
{
	/* Handle that is passed to ReadConsoleW() when reading. */
	HANDLE _handle;

}
@end

OF_ASSUME_NONNULL_END







>




19
20
21
22
23
24
25
26
27
28
29
30
#import "OFStdIOStream.h"

OF_ASSUME_NONNULL_BEGIN

/*
 * OFStdIOStream subclass used for the Win32 console. It keeps the Win32 HANDLE
 * that console reads are issued on, plus the state needed to reassemble a
 * UTF-16 surrogate pair that was split across two reads.
 */
@interface OFStdIOStream_Win32Console: OFStdIOStream
{
	/* Handle that is passed to ReadConsoleW() when reading. */
	HANDLE _handle;
	/*
	 * High surrogate that was the last UTF-16 unit returned by a read. It
	 * is kept so that the next read can combine it with its first UTF-16
	 * unit, and is reset to 0 once it has been consumed. 0 means that no
	 * surrogate is pending.
	 */
	of_char16_t _incompleteSurrogate;
}
@end

OF_ASSUME_NONNULL_END

Modified src/OFStdIOStream_Win32Console.m from [7d8639b924] to [88dea4ced5].

14
15
16
17
18
19
20
21
22
23
24
25
26

27
28
29
30
31
32

33
34

35
36
37
38
39
40
41
42
43
44
 * file.
 */

/*
 * This file tries to make writing UTF-8 strings to the console "just work" on
 * Windows.
 *
 * Windows does provide a way to change the codepage of the console to UTF-8,
 * but unfortunately, different Windows versions handle that differently. For
 * example on Windows XP when using Windows XP's console, changing the codepage
 * to UTF-8 mostly breaks write() and completely breaks read(): write()
 * suddenly returns the number of characters - instead of bytes - written and
 * read() just returns 0 as soon as a Unicode character is being read.

 *
 * So instead of just using the UTF-8 codepage, this captures all reads and
 * writes to of_std{in,err,out} on the lowlevel, interprets the buffer as UTF-8
 * and converts to / from UTF-16 to use ReadConsoleW() and WriteConsoleW(), as
 * reading or writing binary from / to the console would not make any sense
 * anyway and thus it's safe to assume it's text.

 *
 * In order to not do this when redirecting input / output to a file, it checks

 * that the handle is indeed a console.
 *
 * TODO: Properly handle surrogates being cut in the middle
 */

#define OF_STDIO_STREAM_WIN32_CONSOLE_M

#include "config.h"

#import "OFStdIOStream_Win32Console.h"







|
|
|
|
|
|
>

|
|
|
<
|
>

|
>
|
<
<







14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

32
33
34
35
36
37


38
39
40
41
42
43
44
 * file.
 */

/*
 * This file tries to make reading and writing UTF-8 strings from / to the
 * console "just work" on Windows.
 *
 * While Windows does provide a way to change the codepage of the console to
 * UTF-8, unfortunately, different Windows versions handle that differently.
 * For example, on Windows XP, when using Windows XP's console, changing the
 * codepage to UTF-8 mostly breaks write() and completely breaks read():
 * write() suddenly returns the number of characters - instead of bytes -
 * written and read() just returns 0 as soon as a Unicode character is being
 * read.
 *
 * Therefore, instead of just using the UTF-8 codepage, this captures all reads
 * and writes to of_std{in,out,err} on the lowlevel, interprets the buffer as
 * UTF-8 and converts to / from UTF-16 to use ReadConsoleW() / WriteConsoleW().

 * Doing so is safe, as the console only supports text anyway and thus it does
 * not matter if binary gets garbled by the conversion.
 *
 * In order to not do this when redirecting input / output to a file (as the
 * file would then be read / written in the wrong encoding and break reading /
 * writing binary), it checks that the handle is indeed a console.


 */

#define OF_STDIO_STREAM_WIN32_CONSOLE_M

#include "config.h"

#import "OFStdIOStream_Win32Console.h"
99
100
101
102
103
104
105
106
107
108
109
110
111

112
113
114
115
116
























117
118
119
120
121
122
123
124
125
126
127
128
129
130

131



132








133
134
135
136
137
138
139
- (size_t)lowlevelReadIntoBuffer: (void*)buffer_
			  length: (size_t)length
{
	void *pool = objc_autoreleasePoolPush();
	char *buffer = buffer_;
	of_char16_t *UTF16;
	size_t j = 0;
	OFDataArray *rest = nil;

	UTF16 = [self allocMemoryWithSize: sizeof(of_char16_t)
				    count: length];
	@try {
		DWORD UTF16Len;


		if (!ReadConsoleW(_handle, UTF16, length, &UTF16Len, NULL))
			@throw [OFReadFailedException
			    exceptionWithObject: self
				requestedLength: length];

























		for (size_t i = 0; i < UTF16Len; i++) {
			of_unichar_t c = UTF16[i];
			char UTF8[4];
			size_t UTF8Len;

			/* Missing high surrogate */
			if ((c & 0xFC00) == 0xDC00)
				@throw [OFInvalidEncodingException exception];

			if ((c & 0xFC00) == 0xD800) {
				of_char16_t next;

				if (UTF16Len <= i + 1)

					@throw [OFInvalidEncodingException



					    exception];









				next = UTF16[i + 1];

				if ((next & 0xFC00) != 0xDC00)
					@throw [OFInvalidEncodingException
					    exception];








<





>





>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>













|
>
|
>
>
>
|
>
>
>
>
>
>
>
>







99
100
101
102
103
104
105

106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
- (size_t)lowlevelReadIntoBuffer: (void*)buffer_
			  length: (size_t)length
{
	void *pool = objc_autoreleasePoolPush();
	char *buffer = buffer_;
	of_char16_t *UTF16;
	size_t j = 0;


	UTF16 = [self allocMemoryWithSize: sizeof(of_char16_t)
				    count: length];
	@try {
		DWORD UTF16Len;
		OFDataArray *rest = nil;

		if (!ReadConsoleW(_handle, UTF16, length, &UTF16Len, NULL))
			@throw [OFReadFailedException
			    exceptionWithObject: self
				requestedLength: length];

		if (UTF16Len > 0 && _incompleteSurrogate != 0) {
			of_unichar_t c =
			    (((_incompleteSurrogate & 0x3FF) << 10) |
			    (UTF16[0] & 0x3FF)) + 0x10000;
			char UTF8[4];
			size_t UTF8Len;

			if ((UTF8Len = of_string_utf8_encode(c, UTF8)) == 0)
				@throw [OFInvalidEncodingException exception];

			if (UTF8Len <= length) {
				memcpy(buffer, UTF8, UTF8Len);
				j += UTF8Len;
			} else {
				if (rest == nil)
					rest = [OFDataArray dataArray];

				[rest addItems: UTF8
					 count: UTF8Len];
			}

			_incompleteSurrogate = 0;
		}

		for (size_t i = 0; i < UTF16Len; i++) {
			of_unichar_t c = UTF16[i];
			char UTF8[4];
			size_t UTF8Len;

			/* Missing high surrogate */
			if ((c & 0xFC00) == 0xDC00)
				@throw [OFInvalidEncodingException exception];

			if ((c & 0xFC00) == 0xD800) {
				of_char16_t next;

				if (UTF16Len <= i + 1) {
					_incompleteSurrogate = c;

					if (rest != nil) {
						char *items = [rest items];
						size_t count = [rest count];

						[self unreadFromBuffer: items
								length: count];
					}

					objc_autoreleasePoolPop(pool);

					return j;
				}

				next = UTF16[i + 1];

				if ((next & 0xFC00) != 0xDC00)
					@throw [OFInvalidEncodingException
					    exception];