ObjFW  Diff

Differences From Artifact [7d8639b924]:

  • File src/OFStdIOStream_Win32Console.m — part of check-in [3e1b6bccbc] at 2016-03-12 20:28:31 on branch trunk — Properly handle UTF-8 in Win32 console

    The previous approach was to set the codepage to UTF-8; however, this
    does not work properly on some versions of Windows.

    Instead, this intercepts reads / writes on of_std* at the low level,
    interprets the data as UTF-8, converts it to / from UTF-16 and then
    uses ReadConsoleW() / WriteConsoleW().

    Surrogate pairs being cut in the middle are not properly handled yet;
    this will be implemented in a follow-up commit. (user: js, size: 5464) [annotate] [blame] [check-ins using]

To Artifact [88dea4ced5]:


14
15
16
17
18
19
20
21
22
23
24
25
26

27
28
29
30
31
32

33
34

35
36
37
38
39
40
41
42
43
44
 * file.
 */

/*
 * This file tries to make writing UTF-8 strings to the console "just work" on
 * Windows.
 *
 * Windows does provide a way to change the codepage of the console to UTF-8,
 * but unfortunately, different Windows versions handle that differently. For
 * example on Windows XP when using Windows XP's console, changing the codepage
 * to UTF-8 mostly breaks write() and completely breaks read(): write()
 * suddenly returns the number of characters - instead of bytes - written and
 * read() just returns 0 as soon as a Unicode character is being read.

 *
 * So instead of just using the UTF-8 codepage, this captures all reads and
 * writes to of_std{in,err,out} on the lowlevel, interprets the buffer as UTF-8
 * and converts to / from UTF-16 to use ReadConsoleW() and WriteConsoleW(), as
 * reading or writing binary from / to the console would not make any sense
 * anyway and thus it's safe to assume it's text.

 *
 * In order to not do this when redirecting input / output to a file, it checks

 * that the handle is indeed a console.
 *
 * TODO: Properly handle surrogates being cut in the middle
 */

#define OF_STDIO_STREAM_WIN32_CONSOLE_M

#include "config.h"

#import "OFStdIOStream_Win32Console.h"







|
|
|
|
|
|
>

|
|
|
<
|
>

|
>
|
<
<







14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

32
33
34
35
36
37


38
39
40
41
42
43
44
 * file.
 */

/*
 * This file tries to make writing UTF-8 strings to the console "just work" on
 * Windows.
 *
 * While Windows does provide a way to change the codepage of the console to
 * UTF-8, unfortunately, different Windows versions handle that differently.
 * For example, on Windows XP, when using Windows XP's console, changing the
 * codepage to UTF-8 mostly breaks write() and completely breaks read():
 * write() suddenly returns the number of characters - instead of bytes -
 * written and read() just returns 0 as soon as a Unicode character is being
 * read.
 *
 * Therefore, instead of just using the UTF-8 codepage, this captures all reads
 * and writes to of_std{in,out,err} at the low level, interprets the buffer as
 * UTF-8 and converts to / from UTF-16 to use ReadConsoleW() / WriteConsoleW().

 * Doing so is safe, as the console only supports text anyway and thus it does
 * not matter if binary gets garbled by the conversion.
 *
 * In order to not do this when redirecting input / output to a file (as the
 * file would then be read / written in the wrong encoding and break reading /
 * writing binary), it checks that the handle is indeed a console.


 */

#define OF_STDIO_STREAM_WIN32_CONSOLE_M

#include "config.h"

#import "OFStdIOStream_Win32Console.h"
99
100
101
102
103
104
105
106
107
108
109
110
111

112
113
114
115
116
























117
118
119
120
121
122
123
124
125
126
127
128
129
130

131



132








133
134
135
136
137
138
139
- (size_t)lowlevelReadIntoBuffer: (void*)buffer_
			  length: (size_t)length
{
	void *pool = objc_autoreleasePoolPush();
	char *buffer = buffer_;
	of_char16_t *UTF16;
	size_t j = 0;
	OFDataArray *rest = nil;

	UTF16 = [self allocMemoryWithSize: sizeof(of_char16_t)
				    count: length];
	@try {
		DWORD UTF16Len;


		if (!ReadConsoleW(_handle, UTF16, length, &UTF16Len, NULL))
			@throw [OFReadFailedException
			    exceptionWithObject: self
				requestedLength: length];

























		for (size_t i = 0; i < UTF16Len; i++) {
			of_unichar_t c = UTF16[i];
			char UTF8[4];
			size_t UTF8Len;

			/* Missing high surrogate */
			if ((c & 0xFC00) == 0xDC00)
				@throw [OFInvalidEncodingException exception];

			if ((c & 0xFC00) == 0xD800) {
				of_char16_t next;

				if (UTF16Len <= i + 1)

					@throw [OFInvalidEncodingException



					    exception];









				next = UTF16[i + 1];

				if ((next & 0xFC00) != 0xDC00)
					@throw [OFInvalidEncodingException
					    exception];








<





>





>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>













|
>
|
>
>
>
|
>
>
>
>
>
>
>
>







99
100
101
102
103
104
105

106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
- (size_t)lowlevelReadIntoBuffer: (void*)buffer_
			  length: (size_t)length
{
	void *pool = objc_autoreleasePoolPush();
	char *buffer = buffer_;
	of_char16_t *UTF16;
	size_t j = 0;


	UTF16 = [self allocMemoryWithSize: sizeof(of_char16_t)
				    count: length];
	@try {
		DWORD UTF16Len;
		OFDataArray *rest = nil;

		if (!ReadConsoleW(_handle, UTF16, length, &UTF16Len, NULL))
			@throw [OFReadFailedException
			    exceptionWithObject: self
				requestedLength: length];

		if (UTF16Len > 0 && _incompleteSurrogate != 0) {
			of_unichar_t c =
			    (((_incompleteSurrogate & 0x3FF) << 10) |
			    (UTF16[0] & 0x3FF)) + 0x10000;
			char UTF8[4];
			size_t UTF8Len;

			if ((UTF8Len = of_string_utf8_encode(c, UTF8)) == 0)
				@throw [OFInvalidEncodingException exception];

			if (UTF8Len <= length) {
				memcpy(buffer, UTF8, UTF8Len);
				j += UTF8Len;
			} else {
				if (rest == nil)
					rest = [OFDataArray dataArray];

				[rest addItems: UTF8
					 count: UTF8Len];
			}

			_incompleteSurrogate = 0;
		}

		for (size_t i = 0; i < UTF16Len; i++) {
			of_unichar_t c = UTF16[i];
			char UTF8[4];
			size_t UTF8Len;

			/* Missing high surrogate */
			if ((c & 0xFC00) == 0xDC00)
				@throw [OFInvalidEncodingException exception];

			if ((c & 0xFC00) == 0xD800) {
				of_char16_t next;

				if (UTF16Len <= i + 1) {
					_incompleteSurrogate = c;

					if (rest != nil) {
						char *items = [rest items];
						size_t count = [rest count];

						[self unreadFromBuffer: items
								length: count];
					}

					objc_autoreleasePoolPop(pool);

					return j;
				}

				next = UTF16[i + 1];

				if ((next & 0xFC00) != 0xDC00)
					@throw [OFInvalidEncodingException
					    exception];