diff options
author | João Valverde <j@v6e.pt> | 2021-11-30 00:22:34 +0000 |
---|---|---|
committer | Wireshark GitLab Utility <gerald+gitlab-utility@wireshark.org> | 2021-11-30 21:34:57 +0000 |
commit | 1e0cc18ae8acf470044378ce395198de48b31a83 (patch) | |
tree | 0fd2ca7a2feb2f6a0e2380efba0b25ee8f5f8ff8 /epan/strutil.c | |
parent | c18e44f5638aeb770cd5d7f50ddbb8f9d000099b (diff) |
epan: Remove duplication in format_text_wsp()
format_text_wsp() and format_text() are very similar, so use a common
implementation for both.
Diffstat (limited to 'epan/strutil.c')
-rw-r--r-- | epan/strutil.c | 382 |
1 files changed, 61 insertions, 321 deletions
diff --git a/epan/strutil.c b/epan/strutil.c index 6ed66a0ceb..bd1117b913 100644 --- a/epan/strutil.c +++ b/epan/strutil.c @@ -192,35 +192,10 @@ get_token_len(const guchar *linep, const guchar *lineend, #define UNPOOP 0x1F4A9 -/* - * Given a wmem scope, a not-necessarily-null-terminated string, - * expected to be in UTF-8 but possibly containing invalid sequences - * (as it may have come from packet data), and the length of the string, - * generate a valid UTF-8 string from it, allocated in the specified - * wmem scope, that: - * - * shows printable Unicode characters as themselves; - * - * shows non-printable ASCII characters as C-style escapes (octal - * if not one of the standard ones such as LF -> '\n'); - * - * shows non-printable Unicode-but-not-ASCII characters as - * their universal character names; - * - * shows illegal UTF-8 sequences as a sequence of bytes represented - * as C-style hex escapes (XXX: Does not actually do this. Some illegal - * sequences, such as overlong encodings, the sequences reserved for - * UTF-16 surrogate halves (paired or unpaired), and values outside - * Unicode (i.e., the old sequences for code points above U+10FFFF) - * will be decoded in a permissive way. Other illegal sequences, - * such 0xFE and 0xFF and the presence of a continuation byte where - * not expected (or vice versa its absence), are replaced with - * REPLACEMENT CHARACTER.) - * - * and return a pointer to it. 
- */ -gchar * -format_text(wmem_allocator_t* allocator, const guchar *string, size_t len) +static gchar * +format_text_internal(wmem_allocator_t *allocator, + const guchar *string, size_t len, + gboolean replace_space) { FMTBUF_VARS; const guchar *stringend = string + len; @@ -239,6 +214,17 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len) */ FMTBUF_EXPAND(1); FMTBUF_PUTCHAR(c); + } else if (replace_space && g_ascii_isspace(c)) { + /* + * ASCII, so not part of a multi-byte UTF-8 sequence, but + * not printable, but is a space character; show it as a + * blank. + * + * Make sure there's enough room for one more byte, and add + * the blank. + */ + FMTBUF_EXPAND(1); + FMTBUF_PUTCHAR(' '); } else if (c < 128) { /* * ASCII, so not part of a multi-byte UTF-8 sequence, but not @@ -408,6 +394,16 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len) } fmtbuf[column] = uc | first; column += utf8_len; + } else if (replace_space && g_unichar_isspace(uc)) { + /* + * Not printable, but is a space character; show it + * as a blank. + * + * Make sure there's enough room for one more byte, + * and add the blank. + */ + FMTBUF_EXPAND(1); + FMTBUF_PUTCHAR(' '); } else if (c < 128) { /* * ASCII, but not printable. 
@@ -494,6 +490,40 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len) return fmtbuf; } +/* + * Given a wmem scope, a not-necessarily-null-terminated string, + * expected to be in UTF-8 but possibly containing invalid sequences + * (as it may have come from packet data), and the length of the string, + * generate a valid UTF-8 string from it, allocated in the specified + * wmem scope, that: + * + * shows printable Unicode characters as themselves; + * + * shows non-printable ASCII characters as C-style escapes (octal + * if not one of the standard ones such as LF -> '\n'); + * + * shows non-printable Unicode-but-not-ASCII characters as + * their universal character names; + * + * shows illegal UTF-8 sequences as a sequence of bytes represented + * as C-style hex escapes (XXX: Does not actually do this. Some illegal + * sequences, such as overlong encodings, the sequences reserved for + * UTF-16 surrogate halves (paired or unpaired), and values outside + * Unicode (i.e., the old sequences for code points above U+10FFFF) + * will be decoded in a permissive way. Other illegal sequences, + * such 0xFE and 0xFF and the presence of a continuation byte where + * not expected (or vice versa its absence), are replaced with + * REPLACEMENT CHARACTER.) + * + * and return a pointer to it. 
+ */ +gchar * +format_text(wmem_allocator_t *allocator, + const guchar *string, size_t len) +{ + return format_text_internal(allocator, string, len, FALSE); +} + /** Given a wmem scope and a null-terminated string, expected to be in * UTF-8 but possibly containing invalid sequences (as it may have come * from packet data), and the length of the string, generate a valid @@ -515,7 +545,7 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len) gchar * format_text_string(wmem_allocator_t* allocator, const guchar *string) { - return format_text(allocator, string, strlen(string)); + return format_text_internal(allocator, string, strlen(string), FALSE); } /* @@ -527,297 +557,7 @@ format_text_string(wmem_allocator_t* allocator, const guchar *string) gchar * format_text_wsp(wmem_allocator_t* allocator, const guchar *string, size_t len) { - FMTBUF_VARS; - const guchar *stringend = string + len; - guchar c; - - while (string < stringend) { - /* - * Get the first byte of this character. - */ - c = *string++; - if (g_ascii_isprint(c)) { - /* - * Printable ASCII, so not part of a multi-byte UTF-8 sequence. - * Make sure there's enough room for one more byte, and add - * the character. - */ - FMTBUF_EXPAND(1); - FMTBUF_PUTCHAR(c); - } else if (g_ascii_isspace(c)) { - /* - * ASCII, so not part of a multi-byte UTF-8 sequence, but - * not printable, but is a space character; show it as a - * blank. - * - * Make sure there's enough room for one more byte, and add - * the blank. - */ - FMTBUF_EXPAND(1); - FMTBUF_PUTCHAR(' '); - } else if (c < 128) { - /* - * ASCII, so not part of a multi-byte UTF-8 sequence, but not - * printable. - * - * That requires a minimum of 2 bytes, one for the backslash - * and one for a letter, so make sure we have enough room - * for that, plus a trailing '\0'. 
- */ - FMTBUF_EXPAND(2); - FMTBUF_PUTCHAR('\\'); - switch (c) { - - case '\a': - FMTBUF_PUTCHAR('a'); - break; - - case '\b': - FMTBUF_PUTCHAR('b'); /* BS */ - break; - - case '\f': - FMTBUF_PUTCHAR('f'); /* FF */ - break; - - case '\n': - FMTBUF_PUTCHAR('n'); /* NL */ - break; - - case '\r': - FMTBUF_PUTCHAR('r'); /* CR */ - break; - - case '\t': - FMTBUF_PUTCHAR('t'); /* tab */ - break; - - case '\v': - FMTBUF_PUTCHAR('v'); - break; - - default: - /* - * We've already put the backslash, but this - * will put 3 more characters for the octal - * number; make sure we have enough room for - * that, plus the trailing '\0'. - */ - FMTBUF_EXPAND(3); - FMTBUF_PUTBYTE_OCTAL(c); - break; - } - } else { - /* - * We've fetched the first byte of a multi-byte UTF-8 - * sequence into c. - */ - int utf8_len; - guchar mask; - gunichar uc; - guchar first; - - if ((c & 0xe0) == 0xc0) { - /* Starts a 2-byte UTF-8 sequence; 1 byte left */ - utf8_len = 1; - mask = 0x1f; - } else if ((c & 0xf0) == 0xe0) { - /* Starts a 3-byte UTF-8 sequence; 2 bytes left */ - utf8_len = 2; - mask = 0x0f; - } else if ((c & 0xf8) == 0xf0) { - /* Starts a 4-byte UTF-8 sequence; 3 bytes left */ - utf8_len = 3; - mask = 0x07; - } else if ((c & 0xfc) == 0xf8) { - /* Starts an old-style 5-byte UTF-8 sequence; 4 bytes left */ - utf8_len = 4; - mask = 0x03; - } else if ((c & 0xfe) == 0xfc) { - /* Starts an old-style 6-byte UTF-8 sequence; 5 bytes left */ - utf8_len = 5; - mask = 0x01; - } else { - /* 0xfe or 0xff - not valid */ - utf8_len = -1; - } - if (utf8_len > 0) { - /* Try to construct the Unicode character */ - uc = c & mask; - for (int i = 0; i < utf8_len; i++) { - if (string >= stringend) { - /* - * Ran out of octets, so the character is - * incomplete. Put in a REPLACEMENT CHARACTER - * instead, and then continue the loop, which - * will terminate. 
- */ - uc = UNREPL; - break; - } - c = *string; - if ((c & 0xc0) != 0x80) { - /* - * Not valid UTF-8 continuation character; put in - * a replacement character, and then re-process - * this octet as the beginning of a new character. - */ - uc = UNREPL; - break; - } - string++; - uc = (uc << 6) | (c & 0x3f); - } - - /* - * If this isn't a valid Unicode character, put in - * a REPLACEMENT CHARACTER. - */ - if (!g_unichar_validate(uc)) - uc = UNREPL; - } else { - /* 0xfe or 0xff; put it a REPLACEMENT CHARACTER */ - uc = UNREPL; - } - - /* - * OK, is it a printable Unicode character? - */ - if (g_unichar_isprint(uc)) { - /* - * Yes - put it into the string as UTF-8. - * This means that if it was an overlong - * encoding, this will put out the right - * sized encoding. - */ - if (uc < 0x80) { - first = 0; - utf8_len = 1; - } else if (uc < 0x800) { - first = 0xc0; - utf8_len = 2; - } else if (uc < 0x10000) { - first = 0xe0; - utf8_len = 3; - } else if (uc < 0x200000) { - first = 0xf0; - utf8_len = 4; - } else if (uc < 0x4000000) { - /* - * This should never happen, as Unicode doesn't - * go that high. - */ - first = 0xf8; - utf8_len = 5; - } else { - /* - * This should never happen, as Unicode doesn't - * go that high. - */ - first = 0xfc; - utf8_len = 6; - } - FMTBUF_EXPAND(utf8_len); - for (int i = utf8_len - 1; i > 0; i--) { - fmtbuf[column + i] = (uc & 0x3f) | 0x80; - uc >>= 6; - } - fmtbuf[column] = uc | first; - column += utf8_len; - } else if (g_unichar_isspace(uc)) { - /* - * Not printable, but is a space character; show it - * as a blank. - * - * Make sure there's enough room for one more byte, - * and add the blank. - */ - FMTBUF_EXPAND(1); - FMTBUF_PUTCHAR(' '); - } else if (c < 128) { - /* - * ASCII, but not printable. - * Yes, this could happen with an overlong encoding. - * - * That requires a minimum of 2 bytes, one for the - * backslash and one for a letter, so make sure we - * have enough room for that, plus a trailing '\0'. 
- */ - FMTBUF_EXPAND(2); - FMTBUF_PUTCHAR('\\'); - switch (c) { - - case '\a': - FMTBUF_PUTCHAR('a'); - break; - - case '\b': - FMTBUF_PUTCHAR('b'); /* BS */ - break; - - case '\f': - FMTBUF_PUTCHAR('f'); /* FF */ - break; - - case '\n': - FMTBUF_PUTCHAR('n'); /* NL */ - break; - - case '\r': - FMTBUF_PUTCHAR('r'); /* CR */ - break; - - case '\t': - FMTBUF_PUTCHAR('t'); /* tab */ - break; - - case '\v': - FMTBUF_PUTCHAR('v'); - break; - - default: - /* - * We've already put the backslash, but this - * will put 3 more characters for the octal - * number; make sure we have enough room for - * that, plus the trailing '\0'. - */ - FMTBUF_EXPAND(3); - FMTBUF_PUTBYTE_OCTAL(c); - break; - } - } else { - /* - * Unicode, but not printable, and not ASCII; - * put it out as \uxxxx or \Uxxxxxxxx. - */ - if (uc <= 0xFFFF) { - FMTBUF_EXPAND(6); - FMTBUF_PUTCHAR('\\'); - FMTBUF_PUTCHAR('u'); - FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]); - FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]); - FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]); - FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]); - } else { - FMTBUF_EXPAND(10); - FMTBUF_PUTCHAR('\\'); - FMTBUF_PUTCHAR('U'); - FMTBUF_PUTCHAR(hex[(uc >> 28) & 0xF]); - FMTBUF_PUTCHAR(hex[(uc >> 24) & 0xF]); - FMTBUF_PUTCHAR(hex[(uc >> 20) & 0xF]); - FMTBUF_PUTCHAR(hex[(uc >> 16) & 0xF]); - FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]); - FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]); - FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]); - FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]); - } - } - } - } - - FMTBUF_ENDSTR; - return fmtbuf; + return format_text_internal(allocator, string, len, TRUE); } /* |