From 4e155ec47e73ed95e15b9e875b3c7fdfcbf13496 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sat, 16 Jul 2016 23:21:39 +0200 Subject: [PATCH] apps/openssl.c: UTF-y Windows argv. Windows never composes UTF-8 strings as result of user interaction such as passing command-line argument. The only way to compose one is programmatic conversion from WCHAR string, which in turn can be picked up on command line. [For reference, why not wmain, it's not an option on MinGW.] Reviewed-by: Richard Levitte --- Configurations/10-main.conf | 3 + apps/apps.h | 5 + apps/openssl.c | 5 + apps/win32_init.c | 304 ++++++++++++++++++++++++++++++++++++ 4 files changed, 317 insertions(+) create mode 100644 apps/win32_init.c diff --git a/Configurations/10-main.conf b/Configurations/10-main.conf index 62e659eb3c..ec7a7d9300 100644 --- a/Configurations/10-main.conf +++ b/Configurations/10-main.conf @@ -1254,6 +1254,7 @@ sub vms_info { shared_target => "win-shared", # meaningless except it gives Configure a hint thread_scheme => "winthreads", dso_scheme => "win32", + apps_aux_src => add("win32_init.c"), }, "VC-noCE-common" => { inherit_from => [ "VC-common" ], @@ -1414,6 +1415,7 @@ sub vms_info { shared_rcflag => "--target=pe-i386", shared_extension => ".dll", multilib => "", + apps_aux_src => add("win32_init.c"), }, "mingw64" => { # As for OPENSSL_USE_APPLINK. Applink makes it possible to use @@ -1442,6 +1444,7 @@ sub vms_info { shared_rcflag => "--target=pe-x86-64", shared_extension => ".dll", multilib => "64", + apps_aux_src => add("win32_init.c"), }, #### UEFI diff --git a/apps/apps.h b/apps/apps.h index 5faf440200..22eead3a18 100644 --- a/apps/apps.h +++ b/apps/apps.h @@ -367,6 +367,11 @@ typedef struct args_st { * can be re-used. */ char **copy_argv(int *argc, char *argv[]); +/* + * Win32-specific argv initialization that splits OS-supplied UNICODE + * command line string to array of UTF8-encoded strings. + */ +void win32_utf8argv(int *argc, char **argv[]); # define PW_MIN_LENGTH 4 diff --git a/apps/openssl.c b/apps/openssl.c index 78ed023fd4..0f7176fd72 100644 --- a/apps/openssl.c +++ b/apps/openssl.c @@ -131,6 +131,11 @@ int main(int argc, char *argv[]) #if defined(OPENSSL_SYS_VMS) && defined(__DECC) copied_argv = argv = copy_argv(&argc, argv); +#elif defined(_WIN32) + /* + * Replace argv[] with UTF-8 encoded strings. + */ + win32_utf8argv(&argc, &argv); #endif p = getenv("OPENSSL_DEBUG_MEMORY"); diff --git a/apps/win32_init.c b/apps/win32_init.c new file mode 100644 index 0000000000..259e3f35ba --- /dev/null +++ b/apps/win32_init.c @@ -0,0 +1,304 @@ +/* + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include +#include +#include + +#if defined(CP_UTF8) + +static UINT saved_cp; +static int newargc; +static char **newargv; + +static void cleanup(void) +{ + int i; + + SetConsoleOutputCP(saved_cp); + + for (i = 0; i < newargc; i++) + free(newargv[i]); + + free(newargv); +} + +/* + * Incrementally [re]allocate newargv and keep it NULL-terminated. + */ +static int validate_argv(int argc) +{ + static int size = 0; + + if (argc >= size) { + char **ptr; + + while (argc >= size) + size += 64; + + ptr = realloc(newargv, size * sizeof(newargv[0])); + if (ptr == NULL) + return 0; + + (newargv = ptr)[argc] = NULL; + } else { + newargv[argc] = NULL; + } + + return 1; +} + +static int process_glob(WCHAR *wstr, int wlen) +{ + int i, slash, udlen; + WCHAR saved_char; + WIN32_FIND_DATAW data; + HANDLE h; + + /* + * Note that we support wildcard characters only in filename part + * of the path, and not in directories. Windows users are used to + * this, that's why recursive glob processing is not implemented. + */ + /* + * Start by looking for last slash or backslash, ... + */ + for (slash = 0, i = 0; i < wlen; i++) + if (wstr[i] == L'/' || wstr[i] == L'\\') + slash = i + 1; + /* + * ... then look for asterisk or question mark in the file name. + */ + for (i = slash; i < wlen; i++) + if (wstr[i] == L'*' || wstr[i] == L'?') + break; + + if (i == wlen) + return 0; /* definitely not a glob */ + + saved_char = wstr[wlen]; + wstr[wlen] = L'\0'; + h = FindFirstFileW(wstr, &data); + wstr[wlen] = saved_char; + if (h == INVALID_HANDLE_VALUE) + return 0; /* not a valid glob, just pass... */ + + if (slash) + udlen = WideCharToMultiByte(CP_UTF8, 0, wstr, slash, + NULL, 0, NULL, NULL); + else + udlen = 0; + + do { + int uflen; + char *arg; + + /* + * skip over . and .. + */ + if (data.cFileName[0] == L'.') { + if ((data.cFileName[1] == L'\0') || + (data.cFileName[1] == L'.' && data.cFileName[2] == L'\0')) + continue; + } + + if (!validate_argv(newargc + 1)) + break; + + /* + * -1 below means "scan for trailing '\0' *and* count it", + * so that |uflen| covers even trailing '\0'. + */ + uflen = WideCharToMultiByte(CP_UTF8, 0, data.cFileName, -1, + NULL, 0, NULL, NULL); + + arg = malloc(udlen + uflen); + if (arg == NULL) + break; + + if (udlen) + WideCharToMultiByte(CP_UTF8, 0, wstr, slash, + arg, udlen, NULL, NULL); + + WideCharToMultiByte(CP_UTF8, 0, data.cFileName, -1, + arg + udlen, uflen, NULL, NULL); + + newargv[newargc++] = arg; + } while (FindNextFileW(h, &data)); + + CloseHandle(h); + + return 1; +} + +void win32_utf8argv(int *argc, char **argv[]) +{ + const WCHAR *wcmdline; + WCHAR *warg, *wend, *p; + int wlen, ulen, valid = 1; + char *arg; + + newargc = 0; + newargv = NULL; + if (!validate_argv(newargc)) + return; + + wcmdline = GetCommandLineW(); + if (wcmdline == NULL) return; + + /* + * make a copy of the command line, since we might have to modify it... + */ + wlen = wcslen(wcmdline); + p = _alloca((wlen + 1) * sizeof(WCHAR)); + wcscpy(p, wcmdline); + + while (*p != L'\0') { + int in_quote = 0; + + if (*p == L' ' || *p == L'\t') { + p++; /* skip over white spaces */ + continue; + } + + /* + * Note: because we may need to fiddle with the number of backslashes, + * the argument string is copied into itself. This is safe because + * the number of characters will never expand. + */ + warg = wend = p; + while (*p != L'\0' + && (in_quote || (*p != L' ' && *p != L'\t'))) { + switch (*p) { + case L'\\': + /* + * Microsoft documentation on how backslashes are treated + * is: + * + * + Backslashes are interpreted literally, unless they + * immediately precede a double quotation mark. + * + If an even number of backslashes is followed by a double + * quotation mark, one backslash is placed in the argv array + * for every pair of backslashes, and the double quotation + * mark is interpreted as a string delimiter. + * + If an odd number of backslashes is followed by a double + * quotation mark, one backslash is placed in the argv array + * for every pair of backslashes, and the double quotation + * mark is "escaped" by the remaining backslash, causing a + * literal double quotation mark (") to be placed in argv. + * + * Ref: https://msdn.microsoft.com/en-us/library/17w5ykft.aspx + * + * Though referred page doesn't mention it, multiple qouble + * quotes are also special. Pair of double quotes in quoted + * string is counted as single double quote. + */ + { + const WCHAR *q = p; + int i; + + while (*p == L'\\') + p++; + + if (*p == L'"') { + int i; + + for (i = (p - q) / 2; i > 0; i--) + *wend++ = L'\\'; + + /* + * if odd amount of backslashes before the quote, + * said quote is part of the argument, not a delimiter + */ + if ((p - q) % 2 == 1) + *wend++ = *p++; + } else { + for (i = p - q; i > 0; i--) + *wend++ = L'\\'; + } + } + break; + case L'"': + /* + * Without the preceding backslash (or when preceded with an + * even number of backslashes), the double quote is a simple + * string delimiter and just slightly change the parsing state + */ + if (in_quote && p[1] == L'"') + *wend++ = *p++; + else + in_quote = !in_quote; + p++; + break; + default: + /* + * Any other non-delimiter character is just taken verbatim + */ + *wend++ = *p++; + } + } + + wlen = wend - warg; + + if (wlen == 0 || !process_glob(warg, wlen)) { + if (!validate_argv(newargc + 1)) { + valid = 0; + break; + } + + ulen = 0; + if (wlen > 0) { + ulen = WideCharToMultiByte(CP_UTF8, 0, warg, wlen, + NULL, 0, NULL, NULL); + if (ulen <= 0) + continue; + } + + arg = malloc(ulen + 1); + if (arg == NULL) { + valid = 0; + break; + } + + if (wlen > 0) + WideCharToMultiByte(CP_UTF8, 0, warg, wlen, + arg, ulen, NULL, NULL); + arg[ulen] = '\0'; + + newargv[newargc++] = arg; + } + } + + if (valid) { + saved_cp = GetConsoleOutputCP(); + SetConsoleOutputCP(CP_UTF8); + + *argc = newargc; + *argv = newargv; + + atexit(cleanup); + } else if (newargv != NULL) { + int i; + + for (i = 0; i < newargc; i++) + free(newargv[i]); + + free(newargv); + + newargc = 0; + newargv = NULL; + } + + return; +} +#else +void win32_utf8argv(int &argc, char **argv[]) +{ return; } +#endif -- 2.34.1