aboutsummaryrefslogtreecommitdiffstats
path: root/epan/strutil.c
diff options
context:
space:
mode:
authorJoão Valverde <j@v6e.pt>2021-11-30 00:22:34 +0000
committerWireshark GitLab Utility <gerald+gitlab-utility@wireshark.org>2021-11-30 21:34:57 +0000
commit1e0cc18ae8acf470044378ce395198de48b31a83 (patch)
tree0fd2ca7a2feb2f6a0e2380efba0b25ee8f5f8ff8 /epan/strutil.c
parentc18e44f5638aeb770cd5d7f50ddbb8f9d000099b (diff)
epan: Remove duplication in format_text_wsp()
This function and format_text() are very similar so use a common implementation for both.
Diffstat (limited to 'epan/strutil.c')
-rw-r--r--epan/strutil.c382
1 files changed, 61 insertions, 321 deletions
diff --git a/epan/strutil.c b/epan/strutil.c
index 6ed66a0ceb..bd1117b913 100644
--- a/epan/strutil.c
+++ b/epan/strutil.c
@@ -192,35 +192,10 @@ get_token_len(const guchar *linep, const guchar *lineend,
#define UNPOOP 0x1F4A9
-/*
- * Given a wmem scope, a not-necessarily-null-terminated string,
- * expected to be in UTF-8 but possibly containing invalid sequences
- * (as it may have come from packet data), and the length of the string,
- * generate a valid UTF-8 string from it, allocated in the specified
- * wmem scope, that:
- *
- * shows printable Unicode characters as themselves;
- *
- * shows non-printable ASCII characters as C-style escapes (octal
- * if not one of the standard ones such as LF -> '\n');
- *
- * shows non-printable Unicode-but-not-ASCII characters as
- * their universal character names;
- *
- * shows illegal UTF-8 sequences as a sequence of bytes represented
- * as C-style hex escapes (XXX: Does not actually do this. Some illegal
- * sequences, such as overlong encodings, the sequences reserved for
- * UTF-16 surrogate halves (paired or unpaired), and values outside
- * Unicode (i.e., the old sequences for code points above U+10FFFF)
- * will be decoded in a permissive way. Other illegal sequences,
- * such 0xFE and 0xFF and the presence of a continuation byte where
- * not expected (or vice versa its absence), are replaced with
- * REPLACEMENT CHARACTER.)
- *
- * and return a pointer to it.
- */
-gchar *
-format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
+static gchar *
+format_text_internal(wmem_allocator_t *allocator,
+ const guchar *string, size_t len,
+ gboolean replace_space)
{
FMTBUF_VARS;
const guchar *stringend = string + len;
@@ -239,6 +214,17 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
*/
FMTBUF_EXPAND(1);
FMTBUF_PUTCHAR(c);
+ } else if (replace_space && g_ascii_isspace(c)) {
+ /*
+ * ASCII, so not part of a multi-byte UTF-8 sequence; it is
+ * not printable, but it is a space character, so show it
+ * as a blank.
+ *
+ * Make sure there's enough room for one more byte, and add
+ * the blank.
+ */
+ FMTBUF_EXPAND(1);
+ FMTBUF_PUTCHAR(' ');
} else if (c < 128) {
/*
* ASCII, so not part of a multi-byte UTF-8 sequence, but not
@@ -408,6 +394,16 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
}
fmtbuf[column] = uc | first;
column += utf8_len;
+ } else if (replace_space && g_unichar_isspace(uc)) {
+ /*
+ * Not printable, but is a space character; show it
+ * as a blank.
+ *
+ * Make sure there's enough room for one more byte,
+ * and add the blank.
+ */
+ FMTBUF_EXPAND(1);
+ FMTBUF_PUTCHAR(' ');
} else if (c < 128) {
/*
* ASCII, but not printable.
@@ -494,6 +490,40 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
return fmtbuf;
}
+/*
+ * Given a wmem scope, a not-necessarily-null-terminated string,
+ * expected to be in UTF-8 but possibly containing invalid sequences
+ * (as it may have come from packet data), and the length of the string,
+ * generate a valid UTF-8 string from it, allocated in the specified
+ * wmem scope, that:
+ *
+ * shows printable Unicode characters as themselves;
+ *
+ * shows non-printable ASCII characters as C-style escapes (octal
+ * if not one of the standard ones such as LF -> '\n');
+ *
+ * shows non-printable Unicode-but-not-ASCII characters as
+ * their universal character names;
+ *
+ * shows illegal UTF-8 sequences as a sequence of bytes represented
+ * as C-style hex escapes (XXX: Does not actually do this. Some illegal
+ * sequences, such as overlong encodings, the sequences reserved for
+ * UTF-16 surrogate halves (paired or unpaired), and values outside
+ * Unicode (i.e., the old sequences for code points above U+10FFFF)
+ * will be decoded in a permissive way. Other illegal sequences,
+ * such as 0xFE and 0xFF and the presence of a continuation byte where
+ * not expected (or vice versa its absence), are replaced with
+ * REPLACEMENT CHARACTER.)
+ *
+ * and return a pointer to it.
+ */
+gchar *
+format_text(wmem_allocator_t *allocator,
+ const guchar *string, size_t len)
+{
+ return format_text_internal(allocator, string, len, FALSE);
+}
+
/** Given a wmem scope and a null-terminated string, expected to be in
* UTF-8 but possibly containing invalid sequences (as it may have come
* from packet data), generate a valid
@@ -515,7 +545,7 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
gchar *
format_text_string(wmem_allocator_t* allocator, const guchar *string)
{
- return format_text(allocator, string, strlen(string));
+ return format_text_internal(allocator, string, strlen(string), FALSE);
}
/*
@@ -527,297 +557,7 @@ format_text_string(wmem_allocator_t* allocator, const guchar *string)
gchar *
format_text_wsp(wmem_allocator_t* allocator, const guchar *string, size_t len)
{
- FMTBUF_VARS;
- const guchar *stringend = string + len;
- guchar c;
-
- while (string < stringend) {
- /*
- * Get the first byte of this character.
- */
- c = *string++;
- if (g_ascii_isprint(c)) {
- /*
- * Printable ASCII, so not part of a multi-byte UTF-8 sequence.
- * Make sure there's enough room for one more byte, and add
- * the character.
- */
- FMTBUF_EXPAND(1);
- FMTBUF_PUTCHAR(c);
- } else if (g_ascii_isspace(c)) {
- /*
- * ASCII, so not part of a multi-byte UTF-8 sequence, but
- * not printable, but is a space character; show it as a
- * blank.
- *
- * Make sure there's enough room for one more byte, and add
- * the blank.
- */
- FMTBUF_EXPAND(1);
- FMTBUF_PUTCHAR(' ');
- } else if (c < 128) {
- /*
- * ASCII, so not part of a multi-byte UTF-8 sequence, but not
- * printable.
- *
- * That requires a minimum of 2 bytes, one for the backslash
- * and one for a letter, so make sure we have enough room
- * for that, plus a trailing '\0'.
- */
- FMTBUF_EXPAND(2);
- FMTBUF_PUTCHAR('\\');
- switch (c) {
-
- case '\a':
- FMTBUF_PUTCHAR('a');
- break;
-
- case '\b':
- FMTBUF_PUTCHAR('b'); /* BS */
- break;
-
- case '\f':
- FMTBUF_PUTCHAR('f'); /* FF */
- break;
-
- case '\n':
- FMTBUF_PUTCHAR('n'); /* NL */
- break;
-
- case '\r':
- FMTBUF_PUTCHAR('r'); /* CR */
- break;
-
- case '\t':
- FMTBUF_PUTCHAR('t'); /* tab */
- break;
-
- case '\v':
- FMTBUF_PUTCHAR('v');
- break;
-
- default:
- /*
- * We've already put the backslash, but this
- * will put 3 more characters for the octal
- * number; make sure we have enough room for
- * that, plus the trailing '\0'.
- */
- FMTBUF_EXPAND(3);
- FMTBUF_PUTBYTE_OCTAL(c);
- break;
- }
- } else {
- /*
- * We've fetched the first byte of a multi-byte UTF-8
- * sequence into c.
- */
- int utf8_len;
- guchar mask;
- gunichar uc;
- guchar first;
-
- if ((c & 0xe0) == 0xc0) {
- /* Starts a 2-byte UTF-8 sequence; 1 byte left */
- utf8_len = 1;
- mask = 0x1f;
- } else if ((c & 0xf0) == 0xe0) {
- /* Starts a 3-byte UTF-8 sequence; 2 bytes left */
- utf8_len = 2;
- mask = 0x0f;
- } else if ((c & 0xf8) == 0xf0) {
- /* Starts a 4-byte UTF-8 sequence; 3 bytes left */
- utf8_len = 3;
- mask = 0x07;
- } else if ((c & 0xfc) == 0xf8) {
- /* Starts an old-style 5-byte UTF-8 sequence; 4 bytes left */
- utf8_len = 4;
- mask = 0x03;
- } else if ((c & 0xfe) == 0xfc) {
- /* Starts an old-style 6-byte UTF-8 sequence; 5 bytes left */
- utf8_len = 5;
- mask = 0x01;
- } else {
- /* 0xfe or 0xff - not valid */
- utf8_len = -1;
- }
- if (utf8_len > 0) {
- /* Try to construct the Unicode character */
- uc = c & mask;
- for (int i = 0; i < utf8_len; i++) {
- if (string >= stringend) {
- /*
- * Ran out of octets, so the character is
- * incomplete. Put in a REPLACEMENT CHARACTER
- * instead, and then continue the loop, which
- * will terminate.
- */
- uc = UNREPL;
- break;
- }
- c = *string;
- if ((c & 0xc0) != 0x80) {
- /*
- * Not valid UTF-8 continuation character; put in
- * a replacement character, and then re-process
- * this octet as the beginning of a new character.
- */
- uc = UNREPL;
- break;
- }
- string++;
- uc = (uc << 6) | (c & 0x3f);
- }
-
- /*
- * If this isn't a valid Unicode character, put in
- * a REPLACEMENT CHARACTER.
- */
- if (!g_unichar_validate(uc))
- uc = UNREPL;
- } else {
- /* 0xfe or 0xff; put it a REPLACEMENT CHARACTER */
- uc = UNREPL;
- }
-
- /*
- * OK, is it a printable Unicode character?
- */
- if (g_unichar_isprint(uc)) {
- /*
- * Yes - put it into the string as UTF-8.
- * This means that if it was an overlong
- * encoding, this will put out the right
- * sized encoding.
- */
- if (uc < 0x80) {
- first = 0;
- utf8_len = 1;
- } else if (uc < 0x800) {
- first = 0xc0;
- utf8_len = 2;
- } else if (uc < 0x10000) {
- first = 0xe0;
- utf8_len = 3;
- } else if (uc < 0x200000) {
- first = 0xf0;
- utf8_len = 4;
- } else if (uc < 0x4000000) {
- /*
- * This should never happen, as Unicode doesn't
- * go that high.
- */
- first = 0xf8;
- utf8_len = 5;
- } else {
- /*
- * This should never happen, as Unicode doesn't
- * go that high.
- */
- first = 0xfc;
- utf8_len = 6;
- }
- FMTBUF_EXPAND(utf8_len);
- for (int i = utf8_len - 1; i > 0; i--) {
- fmtbuf[column + i] = (uc & 0x3f) | 0x80;
- uc >>= 6;
- }
- fmtbuf[column] = uc | first;
- column += utf8_len;
- } else if (g_unichar_isspace(uc)) {
- /*
- * Not printable, but is a space character; show it
- * as a blank.
- *
- * Make sure there's enough room for one more byte,
- * and add the blank.
- */
- FMTBUF_EXPAND(1);
- FMTBUF_PUTCHAR(' ');
- } else if (c < 128) {
- /*
- * ASCII, but not printable.
- * Yes, this could happen with an overlong encoding.
- *
- * That requires a minimum of 2 bytes, one for the
- * backslash and one for a letter, so make sure we
- * have enough room for that, plus a trailing '\0'.
- */
- FMTBUF_EXPAND(2);
- FMTBUF_PUTCHAR('\\');
- switch (c) {
-
- case '\a':
- FMTBUF_PUTCHAR('a');
- break;
-
- case '\b':
- FMTBUF_PUTCHAR('b'); /* BS */
- break;
-
- case '\f':
- FMTBUF_PUTCHAR('f'); /* FF */
- break;
-
- case '\n':
- FMTBUF_PUTCHAR('n'); /* NL */
- break;
-
- case '\r':
- FMTBUF_PUTCHAR('r'); /* CR */
- break;
-
- case '\t':
- FMTBUF_PUTCHAR('t'); /* tab */
- break;
-
- case '\v':
- FMTBUF_PUTCHAR('v');
- break;
-
- default:
- /*
- * We've already put the backslash, but this
- * will put 3 more characters for the octal
- * number; make sure we have enough room for
- * that, plus the trailing '\0'.
- */
- FMTBUF_EXPAND(3);
- FMTBUF_PUTBYTE_OCTAL(c);
- break;
- }
- } else {
- /*
- * Unicode, but not printable, and not ASCII;
- * put it out as \uxxxx or \Uxxxxxxxx.
- */
- if (uc <= 0xFFFF) {
- FMTBUF_EXPAND(6);
- FMTBUF_PUTCHAR('\\');
- FMTBUF_PUTCHAR('u');
- FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
- FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
- FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
- FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
- } else {
- FMTBUF_EXPAND(10);
- FMTBUF_PUTCHAR('\\');
- FMTBUF_PUTCHAR('U');
- FMTBUF_PUTCHAR(hex[(uc >> 28) & 0xF]);
- FMTBUF_PUTCHAR(hex[(uc >> 24) & 0xF]);
- FMTBUF_PUTCHAR(hex[(uc >> 20) & 0xF]);
- FMTBUF_PUTCHAR(hex[(uc >> 16) & 0xF]);
- FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
- FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
- FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
- FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
- }
- }
- }
- }
-
- FMTBUF_ENDSTR;
- return fmtbuf;
+ return format_text_internal(allocator, string, len, TRUE);
}
/*