-rw-r--r--  doc/tshark.pod         | 14
-rw-r--r--  epan/print_stream.c    | 49
-rw-r--r--  wsutil/utf8_entities.h |  7
3 files changed, 69 insertions(+), 1 deletion(-)
diff --git a/doc/tshark.pod b/doc/tshark.pod
index fb88d53d9a..77082c7a1e 100644
--- a/doc/tshark.pod
+++ b/doc/tshark.pod
@@ -1741,6 +1741,20 @@ personal preferences file.
=back
+=head1 OUTPUT
+
+B<TShark> uses UTF-8 to represent strings internally. In some cases the
+output might not be valid UTF-8; for example, a dissector might generate
+an invalid UTF-8 byte sequence. Programs reading B<TShark> output should
+expect UTF-8 and be prepared for invalid output.
+
+If B<TShark> detects that it is writing to a TTY on UNIX or Linux and
+the locale does not support UTF-8, output will be re-encoded to match the
+current locale.
+
+If B<TShark> detects that it is writing to a TTY on Windows, output will be
+encoded as UTF-16LE.
+
=head1 ENVIRONMENT VARIABLES
=over 4
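
The caveat in the new OUTPUT section matters for downstream tools. Below is a
minimal sketch of a tolerant reader, assuming GLib is available
(g_utf8_make_valid requires GLib 2.52 or later); the program is illustrative
and not part of this change:

/* Hypothetical TShark consumer (assumes GLib). Scrubs invalid UTF-8
 * before further processing, e.g. when fed by:
 *   tshark -r capture.pcapng | ./reader
 */
#include <stdio.h>
#include <glib.h>

int
main(void)
{
    char line[4096];

    while (fgets(line, sizeof line, stdin)) {
        if (g_utf8_validate(line, -1, NULL)) {
            fputs(line, stdout);
        } else {
            /* Replace invalid sequences with U+FFFD (GLib >= 2.52). */
            gchar *clean = g_utf8_make_valid(line, -1);
            fputs(clean, stdout);
            g_free(clean);
        }
    }
    return 0;
}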
diff --git a/epan/print_stream.c b/epan/print_stream.c
index f53965c75b..740773af2e 100644
--- a/epan/print_stream.c
+++ b/epan/print_stream.c
@@ -26,6 +26,12 @@
#include <stdio.h>
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <string.h>
+#endif
+
#include <glib.h>
#include <epan/print_stream.h>
@@ -104,6 +110,13 @@ typedef struct {
#define MAX_INDENT 160
+#ifdef _WIN32
+static char *to_codeset = "UTF-16LE";
+#else
+static char *tty_codeset = NULL;
+static char *to_codeset = NULL;
+#endif
+
static gboolean
print_line_text(print_stream_t *self, int indent, const char *line)
{
@@ -128,7 +141,41 @@ print_line_text(print_stream_t *self, int indent, const char *line)
ret = fwrite(spaces, 1, num_spaces, output->fh);
if (ret == num_spaces) {
- fputs(line, output->fh);
+ gchar *tty_out = NULL;
+
+#ifndef _WIN32
+ /* Is there a more reliable way to do this? */
+ if (!tty_codeset) {
+ gchar *upper_codeset;
+
+ tty_codeset = g_get_codeset();
+ upper_codeset = g_ascii_strup(tty_codeset, -1);
+ if (!strstr(upper_codeset, "UTF-8") && !strstr(upper_codeset, "UTF8")) {
+ to_codeset = tty_codeset;
+ }
+ g_free(upper_codeset);
+ }
+#endif
+
+ if (ws_isatty(ws_fileno(output->fh)) && to_codeset) {
+ /* XXX Allocating a fresh buffer every line probably isn't the
+ * most efficient way to do this. However, this has the side
+ * effect of scrubbing invalid output.
+ */
+ tty_out = g_convert_with_fallback(line, -1, to_codeset, "UTF-8", "?", NULL, NULL, NULL);
+ }
+
+ if (tty_out) {
+#ifdef _WIN32
+ DWORD out_len = (DWORD) wcslen((wchar_t *) tty_out);
+ WriteConsoleW((HANDLE)_get_osfhandle(_fileno(output->fh)), tty_out, out_len, &out_len, NULL);
+#else
+ fputs(tty_out, output->fh);
+#endif
+ g_free(tty_out);
+ } else {
+ fputs(line, output->fh);
+ }
putc('\n', output->fh);
}
return !ferror(output->fh);
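
For reference, the GLib calls used in the hunk above can be exercised in
isolation. A standalone sketch, assuming GLib; the sample bytes are
illustrative and the logic mirrors the patch (convert only when the locale is
not UTF-8):

/* Standalone sketch of the conversion path above (assumes GLib). */
#include <stdio.h>
#include <string.h>
#include <glib.h>

int
main(void)
{
    gchar *codeset = g_get_codeset();  /* e.g. "UTF-8" or "ISO-8859-1" */
    gchar *upper = g_ascii_strup(codeset, -1);
    gboolean is_utf8 = strstr(upper, "UTF-8") || strstr(upper, "UTF8");
    g_free(upper);

    /* "\xc3\x28" is an invalid UTF-8 sequence; "?" is the fallback. */
    const gchar *line = "caf\xc3\xa9 \xc3\x28";

    if (!is_utf8) {
        gchar *out = g_convert_with_fallback(line, -1, codeset, "UTF-8",
                                             "?", NULL, NULL, NULL);
        if (out) {
            fputs(out, stdout);
            g_free(out);
        }
    } else {
        fputs(line, stdout);
    }
    putc('\n', stdout);
    g_free(codeset);
    return 0;
}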
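
On Windows, the converted UTF-16LE buffer bypasses the stdio byte stream
entirely. A minimal sketch of that write path (Windows-only; the helper name
is hypothetical):

/* Windows-only sketch of the console write used above. UTF-16 text
 * must go through WriteConsoleW; fputs would emit raw bytes. */
#ifdef _WIN32
#include <windows.h>
#include <io.h>
#include <stdio.h>
#include <wchar.h>

static void
write_utf16_to_console(FILE *fh, const wchar_t *text)
{
    HANDLE h = (HANDLE) _get_osfhandle(_fileno(fh));
    DWORD n = (DWORD) wcslen(text);

    /* WriteConsoleW expects a character count, not a byte count. */
    WriteConsoleW(h, text, n, &n, NULL);
}
#endif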
diff --git a/wsutil/utf8_entities.h b/wsutil/utf8_entities.h
index dc5deba554..13dba64270 100644
--- a/wsutil/utf8_entities.h
+++ b/wsutil/utf8_entities.h
@@ -29,6 +29,12 @@
* http://www.fileformat.info/info/unicode/
* http://www.utf8-chartable.de/
* and other places
+ *
+ * While many modern systems default to UTF-8 and handle it well, some do
+ * not. The Windows console is a notable example. When adding a glyph below
+ * you probably shouldn't stray too far from code page 437 or WGL4:
+ * https://en.wikipedia.org/wiki/Code_page_437
+ * https://en.wikipedia.org/wiki/Windows_Glyph_List_4
*/
#define UTF8_DEGREE_SIGN "\xc2\xb0" /* 176 / 0xb0 */
@@ -43,6 +49,7 @@
#define UTF8_RIGHTWARDS_ARROW "\xe2\x86\x92" /* 8594 / 0x2192 */
#define UTF8_LEFT_RIGHT_ARROW "\xe2\x86\x94" /* 8596 / 0x2194 */
+/* OS X command key */
#define UTF8_PLACE_OF_INTEREST_SIGN "\xe2\x8c\x98" /* 8984 / 0x2318 */
#define UTF8_SYMBOL_FOR_NULL "\xe2\x90\x80" /* 9216 / 0x2400 */
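
As a usage note (illustrative, not part of this change), the entity macros
are ordinary string literals, so they concatenate with adjacent literals and
drop straight into format strings:

/* Illustrative only: the entity macros are plain string literals. */
#include <glib.h>
#include <wsutil/utf8_entities.h>

static gchar *
direction_label(gboolean bidirectional)
{
    return g_strdup_printf("A %s B",
                           bidirectional ? UTF8_LEFT_RIGHT_ARROW
                                         : UTF8_RIGHTWARDS_ARROW);
}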