diff options
-rw-r--r-- | epan/tvbuff.c | 222 | ||||
-rw-r--r-- | epan/tvbuff.h | 17 |
2 files changed, 137 insertions, 102 deletions
diff --git a/epan/tvbuff.c b/epan/tvbuff.c index dc8e721b67..e9d23a5302 100644 --- a/epan/tvbuff.c +++ b/epan/tvbuff.c @@ -1850,23 +1850,29 @@ tvb_format_stringzpad_wsp(tvbuff_t *tvb, const gint offset, const gint size) #define UNREPL 0x00FFFD /* - * Given a tvbuff, an offset, and a length, allocate a buffer big enough - * to hold a string of length characters plus a trailing '\0'. Copy length - * characters, starting at offset, from the tvbuff into the buffer and return - * a pointer to the buffer. - * Characters with the highest bit set will be converted to the Unicode - * Replacement Character. The resulting buffer contains a valid UTF-8 - * string of length+1 characters (not necessarily length+1 bytes since - * the replacement char is two bytes long). + * All string functions below take a scope as an argument. + * * * If scope is NULL, memory is allocated with g_malloc() and user must * explicitly free it with g_free(). * If scope is not NULL, memory is allocated with the corresponding pool * lifetime. - * Throws an exception if the tvbuff ends before the string does. + * + * All functions throw an exception if the tvbuff ends before the string + * does. */ -guint8 * -tvb_get_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length) + +/* + * Given a tvbuff, an offset, and a length, treat the string of bytes + * referred to by them as an ASCII string, with all bytes with the + * high-order bit set being invalid, and return a pointer to a + * UTF-8 string. + * + * Octets with the highest bit set will be converted to the Unicode + * REPLACEMENT CHARACTER. + */ +static guint8 * +tvb_get_ascii_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length) { wmem_strbuf_t *str; @@ -1879,9 +1885,8 @@ tvb_get_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length) if (ch < 0x80) wmem_strbuf_append_c(str, ch); - else { + else wmem_strbuf_append_unichar(str, UNREPL); - } offset++; length--; } @@ -1892,6 +1897,31 @@ tvb_get_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length) return (guint8 *) wmem_strbuf_get_str(str); } +/* + * Given a tvbuff, an offset, and a length, treat the string of bytes + * referred to by them as a UTF-8 string, and return a pointer to that + * string. + * + * XXX - should map invalid UTF-8 sequences to UNREPL. + */ +static guint8 * +tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, const gint length) +{ + guint8 *strbuf; + + tvb_ensure_bytes_exist(tvb, offset, length); /* make sure length = -1 fails */ + strbuf = (guint8 *)wmem_alloc(scope, length + 1); + tvb_memcpy(tvb, strbuf, offset, length); + strbuf[length] = '\0'; + return strbuf; +} + +/* + * Given a tvbuff, an offset, and a length, treat the string of bytes + * referred to by them as an ISO 8859/1 string, with all bytes with the + * high-order bit set being invalid, and return a pointer to a UTF-8 + * string. + */ static guint8 * tvb_get_string_8859_1(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length) { @@ -1922,11 +1952,13 @@ tvb_get_string_8859_1(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint } /* - * Given a string encoded using octet per character, with octets with - * the high-order bit clear being ASCII, and a translation table that - * maps values for other octets to 2-byte Unicode Basic Multilingual - * Plane characters (including REPLACEMENT CHARACTER), return a UTF-8 - * string with the same characters. + * Given a tvbuff, an offset, and a length, and a translation table, + * treat the string of bytes referred to by them as a string encoded + * using one octet per character, with octets with the high-order bit + * clear being ASCII and octets with the high-order bit set being + * mapped by the translation table to 2-byte Unicode Basic Multilingual + * Plane characters (including REPLACEMENT CHARACTER), and return a + * pointer to a UTF-8 string. */ static guint8 * tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length, const gunichar2 table[0x80]) @@ -1951,18 +1983,14 @@ tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gin } /* - * Given a UCS-2 encoded string containing characters from the - * Basic Multilingual Plane (plane 0) of Unicode, return a UTF-8 - * string with the same characters. + * Given a tvbuff, and offset, and a length, treat the string of bytes + * referred to by them as a UCS-2 encoded string containing characters + * from the Basic Multilingual Plane (plane 0) of Unicode, return a + * pointer to a UTF-8 string. * - * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN + * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN. * - * Specify length in bytes - * - * If scope is NULL, memory is allocated with g_malloc() and user must - * explicitly free it with g_free(). - * If scope is not NULL, memory is allocated with the corresponding pool - * lifetime. + * Specify length in bytes. * * XXX - should map lead and trail surrogate values to REPLACEMENT * CHARACTERs (0xFFFD)? @@ -2006,24 +2034,19 @@ tvb_get_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, } /* - * Given a UTF-16 encoded Unicode string, return a UTF-8 string with the - * same characters. + * Given a tvbuff, and offset, and a length, treat the string of bytes + * referred to by them as a UTF-16 encoded string, return a pointer to + * a UTF-8 string. * - * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN - * - * Specify length in bytes + * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN. * - * If scope is NULL, memory is allocated with g_malloc() and user must - * explicitly free it with g_free(). - * If scope is not NULL, memory is allocated with the corresponding pool - * lifetime. + * Specify length in bytes. * * XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD). * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs. * XXX - if there are an odd number of bytes, should put a * REPLACEMENT CHARACTER at the end. */ - static wmem_strbuf_t * tvb_extract_utf_16_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint size, const guint encoding) { @@ -2113,18 +2136,14 @@ tvb_get_utf_16_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, } /* - * Given a UCS-4-encoded Unicode string, return a UTF-8 string with the - * same characters. + * Given a tvbuff, and offset, and a length, treat the string of bytes + * referred to by them as a UCS-4 encoded string, return a pointer to + * a UTF-8 string. * * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN * * Specify length in bytes * - * If scope is NULL, memory is allocated with g_malloc() and user must - * explicitly free it with g_free(). - * If scope is not NULL, memory is allocated with the corresponding pool - * lifetime. - * * XXX - should map lead and trail surrogate values to a "substitute" * UTF-8 character? * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs. @@ -2347,13 +2366,6 @@ tvb_get_ts_23_038_7bits_string(wmem_allocator_t *scope, tvbuff_t *tvb, * at that offset, plus a trailing '\0', copy into the buffer the * string as converted from the appropriate encoding to UTF-8, and * return a pointer to the string. - * - * Throws an exception if the tvbuff ends before the string does. - * - * If scope is NULL, memory is allocated with g_malloc() and user must - * explicitly free it with g_free(). - * If scope is not NULL, memory is allocated with the corresponding pool - * lifetime. */ guint8 * tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, @@ -2375,7 +2387,7 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, * encoding value, and passed non-zero values * other than TRUE to mean "little-endian". */ - strbuf = tvb_get_string(scope, tvb, offset, length); + strbuf = tvb_get_ascii_string(scope, tvb, offset, length); break; case ENC_UTF_8: @@ -2385,7 +2397,7 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, * XXX - should map code points > 10FFFF to REPLACEMENT * CHARACTERs. */ - strbuf = tvb_get_string(scope, tvb, offset, length); + strbuf = tvb_get_utf_8_string(scope, tvb, offset, length); break; case ENC_UTF_16: @@ -2500,20 +2512,54 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, } /* - * Given a tvbuff and an offset, with the offset assumed to refer to - * a null-terminated string, find the length of that string (and throw - * an exception if the tvbuff ends before we find the null), allocate - * a buffer big enough to hold the string, copy the string into it, - * and return a pointer to the string. Also return the length of the - * string (including the terminating null) through a pointer. - * - * If scope is NULL, memory is allocated with g_malloc() and user must - * explicitly free it with g_free(). - * If scope is not NULL, memory is allocated with the corresponding pool - * lifetime. + * Get an ASCII string; this should not be used in new code. */ guint8 * -tvb_get_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp) +tvb_get_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, + const gint length) +{ + return tvb_get_ascii_string(scope, tvb, offset, length); +} + +/* + * These routines are like the above routines, except that they handle + * null-terminated strings. They find the length of that string (and + * throw an exception if the tvbuff ends before we find the null), and + * also return through a pointer the length of the string, in bytes, + * including the terminating null (the terminating null being 2 bytes + * for UCS-2 and UTF-16, 4 bytes for UCS-4, and 1 byte for other + * encodings). + */ +static guint8 * +tvb_get_ascii_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint *lengthp) +{ + guint size, i; + wmem_strbuf_t *str; + + str = wmem_strbuf_new(scope, ""); + + size = tvb_strsize(tvb, offset); + for (i = 0; i < size; i++) { + guint8 ch = tvb_get_guint8(tvb, offset); + + if (ch < 0x80) + wmem_strbuf_append_c(str, ch); + else + wmem_strbuf_append_unichar(str, UNREPL); + offset++; + } + /* No need to append '\0' - we processed the NUL in the loop above. */ + + if (lengthp) + *lengthp = size; + + /* XXX, discarding constiness, should we have some function which "take-over" strbuf->str + (like when strbuf is no longer needed) */ + return (guint8 *) wmem_strbuf_get_str(str); +} + +static guint8 * +tvb_get_utf_8_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp) { guint size; guint8 *strptr; @@ -2574,15 +2620,6 @@ tvb_get_const_stringz(tvbuff_t *tvb, const gint offset, gint *lengthp) return strptr; } -/* - * Version of tvb_get_stringz() that handles the Basic Multilingual Plane - * (plane 0) of Unicode, with each code point encoded in 16 bits. - * - * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN. - * - * Returns an allocated UTF-8 string and updates lengthp pointer with - * length of string (in bytes), including the terminating (2-byte) NUL. - */ static gchar * tvb_get_ucs_2_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding) { @@ -2600,14 +2637,6 @@ tvb_get_ucs_2_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, return (gchar*)wmem_strbuf_get_str(strbuf); } -/* - * Version of tvb_get_stringz() that handles UTF-16. - * - * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN. - * - * Returns an allocated UTF-8 string and updates lengthp pointer with - * length of string (in bytes), including the terminating (2-byte) NUL. - */ static gchar * tvb_get_utf_16_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding) { @@ -2625,14 +2654,6 @@ tvb_get_utf_16_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset return (gchar*)wmem_strbuf_get_str(strbuf); } -/* - * Version of tvb_get_stringz() that handles UCS-4. - * - * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN. - * - * Returns an allocated UTF-8 string and updates lengthp pointer with - * length of string (in bytes), including the terminating (4-byte) NUL. - */ static gchar * tvb_get_ucs_4_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding) { @@ -2676,19 +2697,18 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g * was a gboolean for the byte order, not an * encoding value, and passed non-zero values * other than TRUE to mean "little-endian". - * - * XXX - should map all octets with the 8th bit - * not set to a "substitute" UTF-8 character. */ - strptr = tvb_get_stringz(scope, tvb, offset, lengthp); + strptr = tvb_get_ascii_stringz(scope, tvb, offset, lengthp); break; case ENC_UTF_8: /* * XXX - should map all invalid UTF-8 sequences * to a "substitute" UTF-8 character. + * XXX - should map code points > 10FFFF to REPLACEMENT + * CHARACTERs. */ - strptr = tvb_get_stringz(scope, tvb, offset, lengthp); + strptr = tvb_get_utf_8_stringz(scope, tvb, offset, lengthp); break; case ENC_UTF_16: @@ -2797,6 +2817,16 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g return strptr; } +/* + * Get an ASCII string; this should not be used in new code. + */ +guint8 * +tvb_get_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, + gint *lengthp) +{ + return tvb_get_ascii_stringz(scope, tvb, offset, lengthp); +} + /* Looks for a stringz (NUL-terminated string) in tvbuff and copies * no more than bufsize number of bytes, including terminating NUL, to buffer. * Returns length of string (not including terminating NUL), or -1 if the string was diff --git a/epan/tvbuff.h b/epan/tvbuff.h index 2ea913a3f3..e8b9049695 100644 --- a/epan/tvbuff.h +++ b/epan/tvbuff.h @@ -485,11 +485,13 @@ extern gchar *tvb_format_stringzpad_wsp(tvbuff_t *tvb, const gint offset, * * Throws an exception if the tvbuff ends before the string does. * - * tvb_get_string() handles 7bit ASCII strings, 8bit characters are - * converted into the Unicode Replacement Character. + * tvb_get_string() handles 7-bit ASCII strings, with characters + * with the 8th bit set are converted to the + * Unicode REPLACEMENT CHARACTER. * * tvb_get_string_enc() takes a string encoding as well, and converts to UTF-8 - * from the encoding. + * from the encoding, possibly mapping some characters + * to the REPLACEMENT CHARACTER. * * If scope is set to NULL it is the user's responsibility to g_free() * the memory allocated by tvb_memdup(). Otherwise memory is @@ -522,10 +524,13 @@ WS_DLL_PUBLIC gchar *tvb_get_ts_23_038_7bits_string(wmem_allocator_t *scope, * and return a pointer to the string. Also return the length of the * string (including the terminating null) through a pointer. * - * tvb_get_stringz() returns a string + * tvb_get_stringz() handles 7-bit ASCII strings, with characters + * with the 8th bit set are converted to the + * Unicode REPLACEMENT CHARACTER. * - * tvb_get_stringz_enc() takes a string encoding as well, and converts to - * UTF-8 from the encoding. + * tvb_get_stringz_enc() takes a string encoding as well, and converts to UTF-8 + * from the encoding, possibly mapping some characters + * to the REPLACEMENT CHARACTER. * * tvb_get_const_stringz() returns a constant (unmodifiable) string that does * not need to be freed, instead it will automatically be |