-rw-r--r--  doc/tshark.pod          | 14
-rw-r--r--  epan/print_stream.c     | 49
-rw-r--r--  wsutil/utf8_entities.h  |  7
3 files changed, 69 insertions(+), 1 deletion(-)
diff --git a/doc/tshark.pod b/doc/tshark.pod
index fb88d53d9a..77082c7a1e 100644
--- a/doc/tshark.pod
+++ b/doc/tshark.pod
@@ -1741,6 +1741,20 @@ personal preferences file.
 
 =back
 
+=head1 OUTPUT
+
+B<TShark> uses UTF-8 to represent strings internally. In some cases the
+output might not be valid. For example, a dissector might generate
+invalid UTF-8 character sequences. Programs reading B<TShark> output
+should expect UTF-8 and be prepared for invalid output.
+
+If B<TShark> detects that it is writing to a TTY on UNIX or Linux and
+the locale does not support UTF-8, output will be re-encoded to match the
+current locale.
+
+If B<TShark> detects that it is writing to a TTY on Windows, output will be
+encoded as UTF-16LE.
+
 =head1 ENVIRONMENT VARIABLES
 
 =over 4
diff --git a/epan/print_stream.c b/epan/print_stream.c
index f53965c75b..740773af2e 100644
--- a/epan/print_stream.c
+++ b/epan/print_stream.c
@@ -26,6 +26,12 @@
 
 #include <stdio.h>
 
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <string.h>
+#endif
+
 #include <glib.h>
 
 #include <epan/print_stream.h>
@@ -104,6 +110,13 @@ typedef struct {
 
 #define MAX_INDENT    160
 
+#ifdef _WIN32
+static char *to_codeset = "UTF-16LE";
+#else
+static char *tty_codeset = NULL;
+static char *to_codeset = NULL;
+#endif
+
 static gboolean
 print_line_text(print_stream_t *self, int indent, const char *line)
 {
@@ -128,7 +141,41 @@ print_line_text(print_stream_t *self, int indent, const char *line)
     ret = fwrite(spaces, 1, num_spaces, output->fh);
     if (ret == num_spaces) {
-        fputs(line, output->fh);
+        gchar *tty_out = NULL;
+
+#ifndef _WIN32
+        /* Is there a more reliable way to do this? */
+        if (!tty_codeset) {
+            gchar *upper_codeset;
+
+            tty_codeset = g_get_codeset();
+            upper_codeset = g_ascii_strup(tty_codeset, -1);
+            if (!strstr(upper_codeset, "UTF-8") && !strstr(upper_codeset, "UTF8")) {
+                to_codeset = tty_codeset;
+            }
+            g_free(upper_codeset);
+        }
+#endif
+
+        if (ws_isatty(ws_fileno(output->fh)) && to_codeset) {
+            /* XXX Allocating a fresh buffer every line probably isn't the
+             * most efficient way to do this. However, this has the side
+             * effect of scrubbing invalid output.
+             */
+            tty_out = g_convert_with_fallback(line, -1, to_codeset, "UTF-8", "?", NULL, NULL, NULL);
+        }
+
+        if (tty_out) {
+#ifdef _WIN32
+            DWORD out_len = (DWORD) wcslen((wchar_t *) tty_out);
+            WriteConsoleW((HANDLE)_get_osfhandle(_fileno(output->fh)), tty_out, out_len, &out_len, NULL);
+#else
+            fputs(tty_out, output->fh);
+#endif
+            g_free(tty_out);
+        } else {
+            fputs(line, output->fh);
+        }
         putc('\n', output->fh);
     }
     return !ferror(output->fh);
diff --git a/wsutil/utf8_entities.h b/wsutil/utf8_entities.h
index dc5deba554..13dba64270 100644
--- a/wsutil/utf8_entities.h
+++ b/wsutil/utf8_entities.h
@@ -29,6 +29,12 @@
  * http://www.fileformat.info/info/unicode/
  * http://www.utf8-chartable.de/
  * and other places
+ *
+ * While many modern systems default to UTF-8 and handle it well, some do
+ * not. The Windows console is a notable example. When adding a glyph below
+ * you probably shouldn't stray too far from code page 437 or WGL4:
+ * https://en.wikipedia.org/wiki/Code_page_437
+ * https://en.wikipedia.org/wiki/Windows_Glyph_List_4
  */
 
 #define UTF8_DEGREE_SIGN            "\xc2\xb0"          /*  176 /   0xb0 */
@@ -43,6 +49,7 @@
 #define UTF8_RIGHTWARDS_ARROW       "\xe2\x86\x92"      /* 8594 / 0x2192 */
 #define UTF8_LEFT_RIGHT_ARROW       "\xe2\x86\x94"      /* 8596 / 0x2194 */
 
+/* OS X command key */
 #define UTF8_PLACE_OF_INTEREST_SIGN "\xe2\x8c\x98"      /* 8984 / 0x2318 */
 
 #define UTF8_SYMBOL_FOR_NULL        "\xe2\x90\x80"      /* 9216 / 0x2400 */
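
For reference, the non-Windows branch above boils down to: when writing to a TTY whose locale codeset is not UTF-8, re-encode each UTF-8 line with g_convert_with_fallback() before printing, so unmappable or invalid sequences degrade to "?". Below is a minimal standalone sketch of that idea using only public GLib calls. It is not Wireshark's API: it uses plain isatty()/fileno() instead of the ws_isatty()/ws_fileno() wrappers, omits the Windows WriteConsoleW() path, and the file name and build line are hypothetical.

/* utf8_tty_demo.c (hypothetical name): re-encode UTF-8 lines for a
 * non-UTF-8 TTY, mirroring the non-Windows branch of the patch above.
 *
 * Build (assumed): gcc utf8_tty_demo.c $(pkg-config --cflags --libs glib-2.0)
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>     /* isatty() */

#include <glib.h>

static void
print_utf8_line(const char *line)
{
    static char     *to_codeset;   /* non-NULL only if the locale is not UTF-8 */
    static gboolean  checked;

    if (!checked) {
        char *codeset = g_get_codeset();               /* e.g. "ISO-8859-1" */
        char *upper   = g_ascii_strup(codeset, -1);

        /* Only re-encode when the locale is not some spelling of UTF-8. */
        if (!strstr(upper, "UTF-8") && !strstr(upper, "UTF8"))
            to_codeset = codeset;                      /* kept for later use */
        else
            g_free(codeset);
        g_free(upper);
        checked = TRUE;
    }

    if (isatty(fileno(stdout)) && to_codeset) {
        /* Invalid UTF-8 and unmappable characters are replaced with "?". */
        char *out = g_convert_with_fallback(line, -1, to_codeset, "UTF-8",
                                            "?", NULL, NULL, NULL);
        if (out) {
            fputs(out, stdout);
            g_free(out);
            putc('\n', stdout);
            return;
        }
    }

    /* UTF-8 TTY, redirected output, or failed conversion: pass through. */
    fputs(line, stdout);
    putc('\n', stdout);
}

int
main(void)
{
    print_utf8_line("temperature: 20\xc2\xb0C");   /* UTF8_DEGREE_SIGN */
    return 0;
}

In a UTF-8 locale, or when output is redirected to a file or pipe, the line passes through untouched, which matches the behavior documented in the tshark.pod hunk.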