aboutsummaryrefslogtreecommitdiffstats
path: root/epan
diff options
context:
space:
mode:
author Guy Harris <guy@alum.mit.edu> 2019-07-25 05:23:36 -0700
committer Guy Harris <guy@alum.mit.edu> 2019-07-25 14:50:40 +0000
commit edd5eaa57e4eb9157d7593dff6a58e030bf8987e (patch)
tree d557457893f9ee30c9fe3583c85bd87e1094536e /epan
parent dd5f2bd054710910788656e068ad8853089c84c5 (diff)
Don't format printable non-ASCII Unicode characters as escape sequences.
Note that even strings fetched with ENC_ASCII may contain them - bytes with the 8th bit set get mapped to REPLACEMENT CHARACTER.

This means we can format STR_UNICODE fields with format_text(); do so.

Bug: 1372
Change-Id: Ia32c3a92d220ac5174ecd25f33e2d1f85cfb8cb8
Reviewed-on: https://code.wireshark.org/review/34080
Reviewed-by: Guy Harris <guy@alum.mit.edu>
Diffstat (limited to 'epan')
-rw-r--r-- epan/proto.c 3
-rw-r--r-- epan/strutil.c 742
2 files changed, 590 insertions, 155 deletions
diff --git a/epan/proto.c b/epan/proto.c
index 9d8a0a1a39..95259e2bd2 100644
--- a/epan/proto.c
+++ b/epan/proto.c
@@ -1016,8 +1016,7 @@ hfinfo_format_text(wmem_allocator_t *scope, const header_field_info *hfinfo,
return format_text_wsp(string, strlen(string));
*/
case STR_UNICODE:
- /* XXX, format_unicode_text() */
- return wmem_strdup(scope, string);
+ return format_text(scope, string, strlen(string));
}
return format_text(scope, string, strlen(string));
diff --git a/epan/strutil.c b/epan/strutil.c
index d8187674d6..f9fb655082 100644
--- a/epan/strutil.c
+++ b/epan/strutil.c
@@ -128,100 +128,364 @@ get_token_len(const guchar *linep, const guchar *lineend,
#define INITIAL_FMTBUF_SIZE 128
/*
- * Given a string, generate a string from it that shows non-printable
- * characters as C-style escapes, and return a pointer to it.
+ * Declare, and initialize, the variables used for an output buffer.
+ */
+#define FMTBUF_VARS \
+ gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE); \
+ guint fmtbuf_len = INITIAL_FMTBUF_SIZE; \
+ guint column = 0
+
+/*
+ * Expand the buffer to be large enough to add nbytes bytes, plus a
+ * terminating '\0'.
+ */
+#define FMTBUF_EXPAND(nbytes) \
+ /* \
+ * Is there enough room for those bytes and also enough room for \
+ * a terminating '\0'? \
+ */ \
+ if (column+(nbytes+1) >= fmtbuf_len) { \
+ /* \
+ * Double the buffer's size if it's not big enough. \
+ * The size of the buffer starts at 128, so doubling its size \
+ * adds at least another 128 bytes, which is more than enough \
+ * for one more character plus a terminating '\0'. \
+ */ \
+ fmtbuf_len *= 2; \
+ fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len); \
+ }
+
+/*
+ * Put a byte into the buffer; space must have been ensured for it.
+ */
+#define FMTBUF_PUTCHAR(b) \
+ fmtbuf[column] = (b); \
+ column++
+
+/*
+ * Add the three octal digits of the one-byte argument to the end of the
+ * buffer; the caller must already have put the leading backslash.
+ */
+#define FMTBUF_PUTBYTE_OCTAL(b) \
+ FMTBUF_PUTCHAR((((b)>>6)&03) + '0'); \
+ FMTBUF_PUTCHAR((((b)>>3)&07) + '0'); \
+ FMTBUF_PUTCHAR((((b)>>0)&07) + '0')
+
+/*
+ * Add the one-byte argument, as a hex escape sequence, to the end
+ * of the buffer.
+ */
+#define FMTBUF_PUTBYTE_HEX(b) \
+ FMTBUF_PUTCHAR('\\'); \
+ FMTBUF_PUTCHAR('x'); \
+ FMTBUF_PUTCHAR(hex[((b) >> 4) & 0xF]); \
+ FMTBUF_PUTCHAR(hex[((b) >> 0) & 0xF])
+
+/*
+ * Put the trailing '\0' at the end of the buffer.
+ */
+#define FMTBUF_ENDSTR \
+ fmtbuf[column] = '\0'
+
+/* REPLACEMENT CHARACTER */
+#define UNREPL 0xFFFD
+
+#define UNPOOP 0x1F4A9
+
+/*
+ * Given a string, expected to be in UTF-8 but possibly containing
+ * invalid sequences (as it may have come from packet data), generate
+ * a valid UTF-8 string from it, allocated with the specified wmem
+ * allocator, that:
+ *
+ * shows printable Unicode characters as themselves;
+ *
+ * shows non-printable ASCII characters as C-style escapes (octal
+ * if not one of the standard ones such as LF -> '\n');
+ *
+ * shows non-printable Unicode-but-not-ASCII characters as
+ * their universal character names;
+ *
+ * replaces illegal or incomplete UTF-8 sequences with a REPLACEMENT
+ * CHARACTER (U+FFFD);
+ *
+ * and return a pointer to it.
*/
gchar *
format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
{
- gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE);
- int fmtbuf_len = INITIAL_FMTBUF_SIZE;
- int column;
+ FMTBUF_VARS;
const guchar *stringend = string + len;
guchar c;
- int i;
- column = 0;
while (string < stringend) {
/*
- * Is there enough room for this character, if it expands to
- * a backslash plus 3 octal digits (which is the most it can
- * expand to), and also enough room for a terminating '\0'?
+ * Get the first byte of this character.
*/
- if (column+3+1 >= fmtbuf_len) {
- /*
- * Double the buffer's size if it's not big enough.
- * The size of the buffer starts at 128, so doubling its size
- * adds at least another 128 bytes, which is more than enough
- * for one more character plus a terminating '\0'.
- */
- fmtbuf_len *= 2;
- fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len);
- }
c = *string++;
-
if (g_ascii_isprint(c)) {
- fmtbuf[column] = c;
- column++;
- } else {
- fmtbuf[column] = '\\';
- column++;
+ /*
+ * Printable ASCII, so not part of a multi-byte UTF-8 sequence.
+ * Make sure there's enough room for one more byte, and add
+ * the character.
+ */
+ FMTBUF_EXPAND(1);
+ FMTBUF_PUTCHAR(c);
+ } else if (c < 128) {
+ /*
+ * ASCII, so not part of a multi-byte UTF-8 sequence, but not
+ * printable.
+ *
+ * That requires a minimum of 2 bytes, one for the backslash
+ * and one for a letter, so make sure we have enough room
+ * for that, plus a trailing '\0'.
+ */
+ FMTBUF_EXPAND(2);
+ FMTBUF_PUTCHAR('\\');
switch (c) {
case '\a':
- fmtbuf[column] = 'a';
- column++;
+ FMTBUF_PUTCHAR('a');
break;
case '\b':
- fmtbuf[column] = 'b'; /* BS */
- column++;
+ FMTBUF_PUTCHAR('b'); /* BS */
break;
case '\f':
- fmtbuf[column] = 'f'; /* FF */
- column++;
+ FMTBUF_PUTCHAR('f'); /* FF */
break;
case '\n':
- fmtbuf[column] = 'n'; /* NL */
- column++;
+ FMTBUF_PUTCHAR('n'); /* NL */
break;
case '\r':
- fmtbuf[column] = 'r'; /* CR */
- column++;
+ FMTBUF_PUTCHAR('r'); /* CR */
break;
case '\t':
- fmtbuf[column] = 't'; /* tab */
- column++;
+ FMTBUF_PUTCHAR('t'); /* tab */
break;
case '\v':
- fmtbuf[column] = 'v';
- column++;
+ FMTBUF_PUTCHAR('v');
break;
default:
- i = (c>>6)&03;
- fmtbuf[column] = i + '0';
- column++;
- i = (c>>3)&07;
- fmtbuf[column] = i + '0';
- column++;
- i = (c>>0)&07;
- fmtbuf[column] = i + '0';
- column++;
+ /*
+ * We've already put the backslash, but this
+ * will put 3 more characters for the octal
+ * number; make sure we have enough room for
+ * that, plus the trailing '\0'.
+ */
+ FMTBUF_EXPAND(3);
+ FMTBUF_PUTBYTE_OCTAL(c);
break;
}
+ } else {
+ /*
+ * We've fetched the first byte of a multi-byte UTF-8
+ * sequence into c.
+ */
+ int utf8_len;
+ guchar mask;
+ gunichar uc;
+ guchar first;
+
+ if ((c & 0xe8) == 0xc0) {
+ /* Starts a 2-byte UTF-8 sequence; 1 byte left */
+ utf8_len = 1;
+ mask = 0x1f;
+ } else if ((c & 0xf0) == 0xe0) {
+ /* Starts a 3-byte UTF-8 sequence; 2 bytes left */
+ utf8_len = 2;
+ mask = 0x0f;
+ } else if ((c & 0xf8) == 0xf0) {
+ /* Starts a 4-byte UTF-8 sequence; 3 bytes left */
+ utf8_len = 3;
+ mask = 0x07;
+ } else if ((c & 0xfc) == 0xf8) {
+ /* Starts an old-style 5-byte UTF-8 sequence; 4 bytes left */
+ utf8_len = 4;
+ mask = 0x03;
+ } else if ((c & 0xfe) == 0xfc) {
+ /* Starts an old-style 6-byte UTF-8 sequence; 5 bytes left */
+ utf8_len = 5;
+ mask = 0x01;
+ } else {
+ /* 0xfe or 0xff - not valid */
+ utf8_len = -1;
+ }
+ if (utf8_len > 0) {
+ /* Try to construct the Unicode character */
+ uc = c & mask;
+ for (int i = 0; i < utf8_len; i++) {
+ if (string >= stringend) {
+ /*
+ * Ran out of octets, so the character is
+ * incomplete. Put in a REPLACEMENT CHARACTER
+ * instead, and then continue the loop, which
+ * will terminate.
+ */
+ uc = UNREPL;
+ break;
+ }
+ c = *string;
+ if ((c & 0xc0) != 0x80) {
+ /*
+ * Not valid UTF-8 continuation character; put in
+ * a replacement character, and then re-process
+ * this octet as the beginning of a new character.
+ */
+ uc = UNREPL;
+ break;
+ }
+ string++;
+ uc = (uc << 6) | (c & 0x3f);
+ }
+
+ /*
+ * If this isn't a valid Unicode character, put in
+ * a REPLACEMENT CHARACTER.
+ */
+ if (!g_unichar_validate(uc))
+ uc = UNREPL;
+ } else {
+ /* Not a recognized lead byte; put in a REPLACEMENT CHARACTER */
+ uc = UNREPL;
+ }
+
+ /*
+ * OK, is it a printable Unicode character?
+ */
+ if (g_unichar_isprint(uc)) {
+ /*
+ * Yes - put it into the string as UTF-8.
+ * This means that if it was an overlong
+ * encoding, this will put out the right
+ * sized encoding.
+ */
+ if (uc < 0x80) {
+ first = 0;
+ utf8_len = 1;
+ } else if (uc < 0x800) {
+ first = 0xc0;
+ utf8_len = 2;
+ } else if (uc < 0x10000) {
+ first = 0xe0;
+ utf8_len = 3;
+ } else if (uc < 0x200000) {
+ first = 0xf0;
+ utf8_len = 4;
+ } else if (uc < 0x4000000) {
+ /*
+ * This should never happen, as Unicode doesn't
+ * go that high.
+ */
+ first = 0xf8;
+ utf8_len = 5;
+ } else {
+ /*
+ * This should never happen, as Unicode doesn't
+ * go that high.
+ */
+ first = 0xfc;
+ utf8_len = 6;
+ }
+ FMTBUF_EXPAND(utf8_len);
+ for (int i = utf8_len - 1; i > 0; i--) {
+ fmtbuf[column + i] = (uc & 0x3f) | 0x80;
+ uc >>= 6;
+ }
+ fmtbuf[column] = uc | first;
+ column += utf8_len;
+ } else if (c < 128) {
+ /*
+ * ASCII, but not printable.
+ * Yes, this could happen with an overlong encoding.
+ *
+ * That requires a minimum of 2 bytes, one for the
+ * backslash and one for a letter, so make sure we
+ * have enough room for that, plus a trailing '\0'.
+ */
+ FMTBUF_EXPAND(2);
+ FMTBUF_PUTCHAR('\\');
+ switch (c) {
+
+ case '\a':
+ FMTBUF_PUTCHAR('a');
+ break;
+
+ case '\b':
+ FMTBUF_PUTCHAR('b'); /* BS */
+ break;
+
+ case '\f':
+ FMTBUF_PUTCHAR('f'); /* FF */
+ break;
+
+ case '\n':
+ FMTBUF_PUTCHAR('n'); /* NL */
+ break;
+
+ case '\r':
+ FMTBUF_PUTCHAR('r'); /* CR */
+ break;
+
+ case '\t':
+ FMTBUF_PUTCHAR('t'); /* tab */
+ break;
+
+ case '\v':
+ FMTBUF_PUTCHAR('v');
+ break;
+
+ default:
+ /*
+ * We've already put the backslash, but this
+ * will put 3 more characters for the octal
+ * number; make sure we have enough room for
+ * that, plus the trailing '\0'.
+ */
+ FMTBUF_EXPAND(3);
+ FMTBUF_PUTBYTE_OCTAL(c);
+ break;
+ }
+ } else {
+ /*
+ * Unicode, but not printable, and not ASCII;
+ * put it out as \uxxxx or \Uxxxxxxxx.
+ */
+ if (uc <= 0xFFFF) {
+ FMTBUF_EXPAND(6);
+ FMTBUF_PUTCHAR('\\');
+ FMTBUF_PUTCHAR('u');
+ FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
+ } else {
+ FMTBUF_EXPAND(10);
+ FMTBUF_PUTCHAR('\\');
+ FMTBUF_PUTCHAR('U');
+ FMTBUF_PUTCHAR(hex[(uc >> 28) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 24) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 20) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 16) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
+ }
+ }
}
}
- fmtbuf[column] = '\0';
+
+ FMTBUF_ENDSTR;
return fmtbuf;
}
-
/*
* Given a string, generate a string from it that shows non-printable
* characters as C-style escapes except a whitespace character
@@ -231,93 +495,296 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
gchar *
format_text_wsp(wmem_allocator_t* allocator, const guchar *string, size_t len)
{
- gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE);
- int fmtbuf_len = INITIAL_FMTBUF_SIZE;
- int column;
+ FMTBUF_VARS;
const guchar *stringend = string + len;
guchar c;
- int i;
- column = 0;
while (string < stringend) {
/*
- * Is there enough room for this character, if it expands to
- * a backslash plus 3 octal digits (which is the most it can
- * expand to), and also enough room for a terminating '\0'?
+ * Get the first byte of this character.
*/
- if (column+3+1 >= fmtbuf_len) {
- /*
- * Double the buffer's size if it's not big enough.
- * The size of the buffer starts at 128, so doubling its size
- * adds at least another 128 bytes, which is more than enough
- * for one more character plus a terminating '\0'.
- */
- fmtbuf_len *= 2;
- fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len);
- }
c = *string++;
-
if (g_ascii_isprint(c)) {
- fmtbuf[column] = c;
- column++;
+ /*
+ * Printable ASCII, so not part of a multi-byte UTF-8 sequence.
+ * Make sure there's enough room for one more byte, and add
+ * the character.
+ */
+ FMTBUF_EXPAND(1);
+ FMTBUF_PUTCHAR(c);
} else if (g_ascii_isspace(c)) {
- fmtbuf[column] = ' ';
- column++;
- } else {
- fmtbuf[column] = '\\';
- column++;
+ /*
+ * ASCII, so not part of a multi-byte UTF-8 sequence, but
+ * not printable, but is a space character; show it as a
+ * blank.
+ *
+ * Make sure there's enough room for one more byte, and add
+ * the blank.
+ */
+ FMTBUF_EXPAND(1);
+ FMTBUF_PUTCHAR(' ');
+ } else if (c < 128) {
+ /*
+ * ASCII, so not part of a multi-byte UTF-8 sequence, but not
+ * printable.
+ *
+ * That requires a minimum of 2 bytes, one for the backslash
+ * and one for a letter, so make sure we have enough room
+ * for that, plus a trailing '\0'.
+ */
+ FMTBUF_EXPAND(2);
+ FMTBUF_PUTCHAR('\\');
switch (c) {
case '\a':
- fmtbuf[column] = 'a';
- column++;
+ FMTBUF_PUTCHAR('a');
break;
case '\b':
- fmtbuf[column] = 'b'; /* BS */
- column++;
+ FMTBUF_PUTCHAR('b'); /* BS */
break;
case '\f':
- fmtbuf[column] = 'f'; /* FF */
- column++;
+ FMTBUF_PUTCHAR('f'); /* FF */
break;
case '\n':
- fmtbuf[column] = 'n'; /* NL */
- column++;
+ FMTBUF_PUTCHAR('n'); /* NL */
break;
case '\r':
- fmtbuf[column] = 'r'; /* CR */
- column++;
+ FMTBUF_PUTCHAR('r'); /* CR */
break;
case '\t':
- fmtbuf[column] = 't'; /* tab */
- column++;
+ FMTBUF_PUTCHAR('t'); /* tab */
break;
case '\v':
- fmtbuf[column] = 'v';
- column++;
+ FMTBUF_PUTCHAR('v');
break;
default:
- i = (c>>6)&03;
- fmtbuf[column] = i + '0';
- column++;
- i = (c>>3)&07;
- fmtbuf[column] = i + '0';
- column++;
- i = (c>>0)&07;
- fmtbuf[column] = i + '0';
- column++;
+ /*
+ * We've already put the backslash, but this
+ * will put 3 more characters for the octal
+ * number; make sure we have enough room for
+ * that, plus the trailing '\0'.
+ */
+ FMTBUF_EXPAND(3);
+ FMTBUF_PUTBYTE_OCTAL(c);
break;
}
+ } else {
+ /*
+ * We've fetched the first byte of a multi-byte UTF-8
+ * sequence into c.
+ */
+ int utf8_len;
+ guchar mask;
+ gunichar uc;
+ guchar first;
+
+ if ((c & 0xe8) == 0xc0) {
+ /* Starts a 2-byte UTF-8 sequence; 1 byte left */
+ utf8_len = 1;
+ mask = 0x1f;
+ } else if ((c & 0xf0) == 0xe0) {
+ /* Starts a 3-byte UTF-8 sequence; 2 bytes left */
+ utf8_len = 2;
+ mask = 0x0f;
+ } else if ((c & 0xf8) == 0xf0) {
+ /* Starts a 4-byte UTF-8 sequence; 3 bytes left */
+ utf8_len = 3;
+ mask = 0x07;
+ } else if ((c & 0xfc) == 0xf8) {
+ /* Starts an old-style 5-byte UTF-8 sequence; 4 bytes left */
+ utf8_len = 4;
+ mask = 0x03;
+ } else if ((c & 0xfe) == 0xfc) {
+ /* Starts an old-style 6-byte UTF-8 sequence; 5 bytes left */
+ utf8_len = 5;
+ mask = 0x01;
+ } else {
+ /* 0xfe or 0xff - not valid */
+ utf8_len = -1;
+ }
+ if (utf8_len > 0) {
+ /* Try to construct the Unicode character */
+ uc = c & mask;
+ for (int i = 0; i < utf8_len; i++) {
+ if (string >= stringend) {
+ /*
+ * Ran out of octets, so the character is
+ * incomplete. Put in a REPLACEMENT CHARACTER
+ * instead, and then continue the loop, which
+ * will terminate.
+ */
+ uc = UNREPL;
+ break;
+ }
+ c = *string;
+ if ((c & 0xc0) != 0x80) {
+ /*
+ * Not valid UTF-8 continuation character; put in
+ * a replacement character, and then re-process
+ * this octet as the beginning of a new character.
+ */
+ uc = UNREPL;
+ break;
+ }
+ string++;
+ uc = (uc << 6) | (c & 0x3f);
+ }
+
+ /*
+ * If this isn't a valid Unicode character, put in
+ * a REPLACEMENT CHARACTER.
+ */
+ if (!g_unichar_validate(uc))
+ uc = UNREPL;
+ } else {
+ /* Not a recognized lead byte; put in a REPLACEMENT CHARACTER */
+ uc = UNREPL;
+ }
+
+ /*
+ * OK, is it a printable Unicode character?
+ */
+ if (g_unichar_isprint(uc)) {
+ /*
+ * Yes - put it into the string as UTF-8.
+ * This means that if it was an overlong
+ * encoding, this will put out the right
+ * sized encoding.
+ */
+ if (uc < 0x80) {
+ first = 0;
+ utf8_len = 1;
+ } else if (uc < 0x800) {
+ first = 0xc0;
+ utf8_len = 2;
+ } else if (uc < 0x10000) {
+ first = 0xe0;
+ utf8_len = 3;
+ } else if (uc < 0x200000) {
+ first = 0xf0;
+ utf8_len = 4;
+ } else if (uc < 0x4000000) {
+ /*
+ * This should never happen, as Unicode doesn't
+ * go that high.
+ */
+ first = 0xf8;
+ utf8_len = 5;
+ } else {
+ /*
+ * This should never happen, as Unicode doesn't
+ * go that high.
+ */
+ first = 0xfc;
+ utf8_len = 6;
+ }
+ FMTBUF_EXPAND(utf8_len);
+ for (int i = utf8_len - 1; i > 0; i--) {
+ fmtbuf[column + i] = (uc & 0x3f) | 0x80;
+ uc >>= 6;
+ }
+ fmtbuf[column] = uc | first;
+ column += utf8_len;
+ } else if (g_unichar_isspace(uc)) {
+ /*
+ * Not printable, but is a space character; show it
+ * as a blank.
+ *
+ * Make sure there's enough room for one more byte,
+ * and add the blank.
+ */
+ FMTBUF_EXPAND(1);
+ FMTBUF_PUTCHAR(' ');
+ } else if (c < 128) {
+ /*
+ * ASCII, but not printable.
+ * Yes, this could happen with an overlong encoding.
+ *
+ * That requires a minimum of 2 bytes, one for the
+ * backslash and one for a letter, so make sure we
+ * have enough room for that, plus a trailing '\0'.
+ */
+ FMTBUF_EXPAND(2);
+ FMTBUF_PUTCHAR('\\');
+ switch (c) {
+
+ case '\a':
+ FMTBUF_PUTCHAR('a');
+ break;
+
+ case '\b':
+ FMTBUF_PUTCHAR('b'); /* BS */
+ break;
+
+ case '\f':
+ FMTBUF_PUTCHAR('f'); /* FF */
+ break;
+
+ case '\n':
+ FMTBUF_PUTCHAR('n'); /* NL */
+ break;
+
+ case '\r':
+ FMTBUF_PUTCHAR('r'); /* CR */
+ break;
+
+ case '\t':
+ FMTBUF_PUTCHAR('t'); /* tab */
+ break;
+
+ case '\v':
+ FMTBUF_PUTCHAR('v');
+ break;
+
+ default:
+ /*
+ * We've already put the backslash, but this
+ * will put 3 more characters for the octal
+ * number; make sure we have enough room for
+ * that, plus the trailing '\0'.
+ */
+ FMTBUF_EXPAND(3);
+ FMTBUF_PUTBYTE_OCTAL(c);
+ break;
+ }
+ } else {
+ /*
+ * Unicode, but not printable, and not ASCII;
+ * put it out as \uxxxx or \Uxxxxxxxx.
+ */
+ if (uc <= 0xFFFF) {
+ FMTBUF_EXPAND(6);
+ FMTBUF_PUTCHAR('\\');
+ FMTBUF_PUTCHAR('u');
+ FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
+ } else {
+ FMTBUF_EXPAND(10);
+ FMTBUF_PUTCHAR('\\');
+ FMTBUF_PUTCHAR('U');
+ FMTBUF_PUTCHAR(hex[(uc >> 28) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 24) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 20) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 16) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
+ FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
+ }
+ }
}
}
- fmtbuf[column] = '\0';
+
+ FMTBUF_ENDSTR;
return fmtbuf;
}
@@ -326,53 +793,37 @@ format_text_wsp(wmem_allocator_t* allocator, const guchar *string, size_t len)
* characters as the chr parameter passed, except a whitespace character
* (space, tab, carriage return, new line, vertical tab, or formfeed)
* which will be replaced by a space, and return a pointer to it.
+ *
+ * This does *not* treat the input string as UTF-8.
+ *
+ * XXX - is there any reason to use this?
*/
gchar *
format_text_chr(wmem_allocator_t* allocator, const guchar *string, const size_t len, const guchar chr)
{
- gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE);
- int fmtbuf_len = INITIAL_FMTBUF_SIZE;
- int column;
+ FMTBUF_VARS;
const guchar *stringend = string + len;
guchar c;
- column = 0;
while (string < stringend)
{
- /*
- * Is there enough room for this character,
- * and also enough room for a terminating '\0'?
- */
- if (column+1 >= fmtbuf_len)
- {
- /*
- * Double the buffer's size if it's not big enough.
- * The size of the buffer starts at 128, so doubling its size
- * adds at least another 128 bytes, which is more than enough
- * for one more character plus a terminating '\0'.
- */
- fmtbuf_len *= 2;
- fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len);
- }
+ FMTBUF_EXPAND(1);
c = *string++;
if (g_ascii_isprint(c))
{
- fmtbuf[column] = c;
- column++;
+ FMTBUF_PUTCHAR(c);
}
else if (g_ascii_isspace(c))
{
- fmtbuf[column] = ' ';
- column++;
+ FMTBUF_PUTCHAR(' ');
}
else
{
- fmtbuf[column] = chr;
- column++;
+ FMTBUF_PUTCHAR(chr);
}
}
- fmtbuf[column] = '\0';
+ FMTBUF_ENDSTR;
return fmtbuf;
}
@@ -680,12 +1131,11 @@ uri_str_to_bytes(const char *uri_str, GByteArray *bytes)
gchar *
format_uri(wmem_allocator_t* allocator, const GByteArray *bytes, const gchar *reserved_chars)
{
- gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE);
- guint fmtbuf_len = INITIAL_FMTBUF_SIZE;
- static const guchar *reserved_def = ":/?#[]@!$&'()*+,;= ";
+ FMTBUF_VARS;
+ static const guchar reserved_def[] = ":/?#[]@!$&'()*+,;= ";
const guchar *reserved = reserved_def;
guint8 c;
- guint byte_index, column, i;
+ guint byte_index, i;
gboolean is_reserved = FALSE;
if (! bytes)
@@ -694,23 +1144,13 @@ format_uri(wmem_allocator_t* allocator, const GByteArray *bytes, const gchar *re
if (reserved_chars)
reserved = reserved_chars;
- column = 0;
for (byte_index = 0; byte_index < bytes->len; byte_index++) {
/*
- * Is there enough room for this character, if it expands to
- * a percent plus 2 hex digits (which is the most it can
- * expand to), and also enough room for a terminating '\0'?
+ * Make sure there is enough room for this character, if it
+ * expands to a percent plus 2 hex digits (which is the most
+ * it can expand to), and also enough room for a terminating '\0'.
*/
- if (column+2+1 >= fmtbuf_len) {
- /*
- * Double the buffer's size if it's not big enough.
- * The size of the buffer starts at 128, so doubling its size
- * adds at least another 128 bytes, which is more than enough
- * for one more character plus a terminating '\0'.
- */
- fmtbuf_len *= 2;
- fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len);
- }
+ FMTBUF_EXPAND(2);
c = bytes->data[byte_index];
is_reserved = FALSE;
@@ -724,15 +1164,11 @@ format_uri(wmem_allocator_t* allocator, const GByteArray *bytes, const gchar *re
}
if (!is_reserved) {
- fmtbuf[column] = c;
- column++;
+ FMTBUF_PUTCHAR(c);
} else {
- fmtbuf[column] = '%';
- column++;
- fmtbuf[column] = hex[c >> 4];
- column++;
- fmtbuf[column] = hex[c & 0xF];
- column++;
+ FMTBUF_PUTCHAR('%');
+ FMTBUF_PUTCHAR(hex[c >> 4]);
+ FMTBUF_PUTCHAR(hex[c & 0xF]);
}
}
fmtbuf[column] = '\0';