diff options
-rw-r--r-- | epan/dissectors/packet-bacapp.c | 84 | ||||
-rw-r--r-- | epan/proto.h | 42 | ||||
-rw-r--r-- | epan/tvbuff.c | 176 |
3 files changed, 159 insertions, 143 deletions
diff --git a/epan/dissectors/packet-bacapp.c b/epan/dissectors/packet-bacapp.c index d8ddddd3a8..2382b9fd7c 100644 --- a/epan/dissectors/packet-bacapp.c +++ b/epan/dissectors/packet-bacapp.c @@ -2310,22 +2310,6 @@ proto_register_bacapp(void); void proto_reg_handoff_bacapp(void); -/** - * converts XXX coded strings to UTF-8 - * else 'in' is copied to 'out' - * @param in -- pointer to string - * @param inbytesleft size of int bytes - * @param out -- pointer to string - * @param outbytesleft size of out bytes - * @param fromcoding coding type - * @return count of modified characters of returned string, -1 for errors - */ -static guint32 -fConvertXXXtoUTF8(gchar *in, gsize *inbytesleft, gchar *out, gsize *outbytesleft, const gchar *fromcoding); - -static void -uni_to_string(char * data, gsize str_length, char *dest_buf); - /* <<<< formerly bacapp.h */ /* reassembly table for segmented messages */ @@ -6245,9 +6229,8 @@ fCharacterString(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint offs { guint8 tag_no, tag_info, character_set; guint32 lvt, l; - gsize inbytesleft, outbytesleft = 512; + gsize inbytesleft; guint offs, extra = 1; - guint8 *str_val; const char *coding; guint8 bf_arr[512], *out = &bf_arr[0]; proto_item *ti; @@ -6299,8 +6282,7 @@ fCharacterString(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint offs coding = "JIS C 6226"; break; case ISO_10646_UCS4: - str_val = tvb_get_string(wmem_packet_scope(), tvb, offset, l); - fConvertXXXtoUTF8(str_val, &inbytesleft, out, &outbytesleft, "UCS-4BE"); + out = tvb_get_string_enc(wmem_packet_scope(), tvb, offset, l, ENC_UCS_4|ENC_BIG_ENDIAN); coding = "ISO 10646 UCS-4"; break; case ISO_10646_UCS2: @@ -11040,68 +11022,6 @@ bacapp_init_routine(void) &addresses_reassembly_table_functions); } -static guint32 -fConvertXXXtoUTF8(gchar *in, gsize *inbytesleft, gchar *out, gsize *outbytesleft, const gchar *fromcoding) -{ - guint32 i; - GIConv icd; - - if ((icd = g_iconv_open("UTF-8", fromcoding)) != (GIConv) -1) 
{ - i = (guint32) g_iconv(icd, &in, inbytesleft, &out, outbytesleft); - /* g_iconv incremented 'out'; now ensure it's NULL terminated */ - out[0] = '\0'; - - g_iconv_close(icd); - return i; - } - - uni_to_string(in, *inbytesleft, out); - out[*inbytesleft] = '\0'; - *outbytesleft -= *inbytesleft; - *inbytesleft = 0; - - return 0; -} - -static void -uni_to_string(char * data, gsize str_length, char *dest_buf) -{ - gint i; - guint16 c_char; - gsize length_remaining; - - length_remaining = str_length; - dest_buf[0] = '\0'; - if (str_length == 0) { - return; - } - for ( i = 0; i < (gint) str_length; i++ ) { - c_char = data[i]; - if ((c_char < 0x20) || (c_char > 0x7e)) { - if (c_char != 0x00) { - c_char = '.'; - dest_buf[i] = c_char & 0xff; - } else { - i--; - str_length--; - } - } else { - dest_buf[i] = c_char & 0xff; - } - length_remaining--; - - if (length_remaining == 0) { - dest_buf[i+1] = '\0'; - return; - } - } - if (i < 0) { - i = 0; - } - dest_buf[i] = '\0'; - return; -} - void proto_register_bacapp(void) { diff --git a/epan/proto.h b/epan/proto.h index 8a299e397b..2c29514a73 100644 --- a/epan/proto.h +++ b/epan/proto.h @@ -270,10 +270,6 @@ WS_DLL_PUBLIC WS_MSVC_NORETURN void proto_report_dissector_bug(const char *messa * For UTF-8, invalid UTF-8 sequences should be mapped to the same * code point. * - * We also don't process UTF-16 or UCS-2 differently - we don't - * handle surrogate pairs, and don't handle 2-byte values that - * aren't valid in UTF-16 or UCS-2 strings. - * * For display, perhaps we should also map control characters to the * Unicode glyphs showing the name of the control character in small * caps, diagonally. (Unfortunately, those only exist for C0, not C1.) 
@@ -283,31 +279,31 @@ WS_DLL_PUBLIC WS_MSVC_NORETURN void proto_report_dissector_bug(const char *messa #define ENC_UTF_8 0x00000002 #define ENC_UTF_16 0x00000004 #define ENC_UCS_2 0x00000006 -#define ENC_EBCDIC 0x00000008 -#define ENC_WINDOWS_1250 0x0000000A -#define ENC_ISO_8859_1 0x0000000C -#define ENC_ISO_8859_2 0x0000000E -#define ENC_ISO_8859_3 0x00000010 -#define ENC_ISO_8859_4 0x00000012 -#define ENC_ISO_8859_5 0x00000014 -#define ENC_ISO_8859_6 0x00000016 -#define ENC_ISO_8859_7 0x00000018 -#define ENC_ISO_8859_8 0x0000001A -#define ENC_ISO_8859_9 0x0000001C -#define ENC_ISO_8859_10 0x0000001E -#define ENC_ISO_8859_11 0x00000020 -/* #define ENC_ISO_8859_12 0x00000022 ISO 8859-12 was abandoned */ -#define ENC_ISO_8859_13 0x00000024 -#define ENC_ISO_8859_14 0x00000026 -#define ENC_ISO_8859_15 0x00000028 -#define ENC_ISO_8859_16 0x0000002A +#define ENC_UCS_4 0x00000008 +#define ENC_ISO_8859_1 0x0000000A +#define ENC_ISO_8859_2 0x0000000C +#define ENC_ISO_8859_3 0x0000000E +#define ENC_ISO_8859_4 0x00000010 +#define ENC_ISO_8859_5 0x00000012 +#define ENC_ISO_8859_6 0x00000014 +#define ENC_ISO_8859_7 0x00000016 +#define ENC_ISO_8859_8 0x00000018 +#define ENC_ISO_8859_9 0x0000001A +#define ENC_ISO_8859_10 0x0000001C +#define ENC_ISO_8859_11 0x0000001E +/* #define ENC_ISO_8859_12 0x00000020 ISO 8859-12 was abandoned */ +#define ENC_ISO_8859_13 0x00000022 +#define ENC_ISO_8859_14 0x00000024 +#define ENC_ISO_8859_15 0x00000026 +#define ENC_ISO_8859_16 0x00000028 +#define ENC_WINDOWS_1250 0x0000002A +#define ENC_EBCDIC 0x0000002C /* * TODO: * * These could probably be used by existing code: * - * ENC_UCS_4 - UCS-4 * - "IBM MS DBCS" * - JIS C 6226 * 7-bit encodings such as ETSI 03.38 (GSM SMS character set diff --git a/epan/tvbuff.c b/epan/tvbuff.c index c7846f05ce..442c0e6ed2 100644 --- a/epan/tvbuff.c +++ b/epan/tvbuff.c @@ -1924,7 +1924,7 @@ tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gin * Basic Multilingual Plane (plane 0) of 
Unicode, return a UTF-8 * string with the same characters. * - * Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN + * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN * * Specify length in bytes * @@ -1967,7 +1967,7 @@ tvb_get_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, * Given a UTF-16 encoded Unicode string, return a UTF-8 string with the * same characters. * - * Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN + * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN * * Specify length in bytes * @@ -2074,6 +2074,50 @@ tvb_get_utf_16_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, } /* + * Given a UCS-4-encoded Unicode string, return a UTF-8 string with the + * same characters. + * + * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN + * + * Specify length in bytes + * + * If scope is NULL, memory is allocated with g_malloc() and user must + * explicitly free it with g_free(). + * If scope is not NULL, memory is allocated with the corresponding pool + * lifetime. + * + * XXX - should map lead and trail surrogate values, and code points beyond + * the maximum Unicode character, to a "substitute" UTF-8 character? + */ +static gchar * +tvb_get_ucs_4_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding) +{ + gunichar uchar; + gint i; /* Byte offset into the string; advances 4 bytes (one UCS-4 code unit) per step */ + wmem_strbuf_t *strbuf; + + tvb_ensure_bytes_exist(tvb, offset, length); + + strbuf = wmem_strbuf_new(scope, NULL); + + for(i = 0; i + 3 < length; i += 4) { + if (encoding == ENC_BIG_ENDIAN) + uchar = tvb_get_ntohl(tvb, offset + i); + else + uchar = tvb_get_letohl(tvb, offset + i); + + wmem_strbuf_append_unichar(strbuf, uchar); + } + + /* + * XXX - if i < length, this means we were handed a number + * of bytes that's not a multiple of 4, so we're not a valid + * UCS-4 string. 
+ */ + return (gchar*)wmem_strbuf_get_str(strbuf); +} + +/* * Given a tvbuff, an offset, a length, and an encoding, allocate a * buffer big enough to hold a non-null-terminated string of that length * at that offset, plus a trailing '\0', copy into the buffer the @@ -2131,24 +2175,9 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, encoding & ENC_LITTLE_ENDIAN); break; - case ENC_EBCDIC: - /* - * XXX - do the copy and conversion in one pass. - * - * XXX - multiple "dialects" of EBCDIC? - */ - tvb_ensure_bytes_exist(tvb, offset, length); /* make sure length = -1 fails */ - strbuf = (guint8 *)wmem_alloc(scope, length + 1); - if (length != 0) { - ptr = ensure_contiguous(tvb, offset, length); - memcpy(strbuf, ptr, length); - EBCDIC_to_ASCII(strbuf, length); - } - strbuf[length] = '\0'; - break; - - case ENC_WINDOWS_1250: - strbuf = tvb_get_string_unichar2(scope, tvb, offset, length, charset_table_cp1250); + case ENC_UCS_4: + strbuf = tvb_get_ucs_4_string(scope, tvb, offset, length, + encoding & ENC_LITTLE_ENDIAN); break; case ENC_ISO_8859_1: @@ -2215,6 +2244,26 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, case ENC_ISO_8859_16: strbuf = tvb_get_string_unichar2(scope, tvb, offset, length, charset_table_iso_8859_16); break; + + case ENC_WINDOWS_1250: + strbuf = tvb_get_string_unichar2(scope, tvb, offset, length, charset_table_cp1250); + break; + + case ENC_EBCDIC: + /* + * XXX - do the copy and conversion in one pass. + * + * XXX - multiple "dialects" of EBCDIC? 
+ */ + tvb_ensure_bytes_exist(tvb, offset, length); /* make sure length = -1 fails */ + strbuf = (guint8 *)wmem_alloc(scope, length + 1); + if (length != 0) { + ptr = ensure_contiguous(tvb, offset, length); + memcpy(strbuf, ptr, length); + EBCDIC_to_ASCII(strbuf, length); + } + strbuf[length] = '\0'; + break; } return strbuf; } @@ -2298,7 +2347,7 @@ tvb_get_const_stringz(tvbuff_t *tvb, const gint offset, gint *lengthp) * Version of tvb_get_stringz() that handles the Basic Multilingual Plane * (plane 0) of Unicode, with each code point encoded in 16 bits. * - * Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN + * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN * * Returns an allocated UTF-8 string and updates lengthp pointer with length of string (in bytes) * @@ -2310,7 +2359,7 @@ static gchar * tvb_get_ucs_2_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding) { gunichar2 uchar; - gint size; /* Number of UTF-16 characters */ + gint size; /* Number of bytes in string */ gint i; /* Byte counter for tvbuff */ wmem_strbuf_t *strbuf; @@ -2357,6 +2406,52 @@ tvb_get_utf_16_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset return (gchar*)wmem_strbuf_get_str(strbuf); } +/* + * Version of tvb_get_stringz() that handles Unicode, with each code point + * encoded in 32 bits. + * + * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN + * + * Returns an allocated UTF-8 string and updates lengthp pointer with length of string (in bytes) + * + * XXX - needs to map values that are not valid Unicode characters (such as, + * I think, values used as the components of a UTF-16 surrogate pair) to a + * "substitute" UTF-8 character. 
+ */ +static gchar * +tvb_get_ucs_4_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding) +{ + gunichar uchar; + gint size; /* Number of bytes in string */ + gint i; /* Byte counter for tvbuff */ + wmem_strbuf_t *strbuf; + + DISSECTOR_ASSERT(tvb && tvb->initialized); + + size = 0; + do { + /* An all-zero 32-bit value is 0 in either byte order, so endianness doesn't matter when looking for the terminator */ + uchar = tvb_get_ntohl(tvb, offset + size); + size += 4; + } while(uchar != 0); + + strbuf = wmem_strbuf_new(scope, NULL); + + for(i = 0; i < size; i += 4) { + if (encoding == ENC_BIG_ENDIAN) + uchar = tvb_get_ntohl(tvb, offset + i); + else + uchar = tvb_get_letohl(tvb, offset + i); + + wmem_strbuf_append_unichar(strbuf, uchar); + } + + if (lengthp) + *lengthp = i; /* Number of *bytes* processed, including the 4-byte terminator */ + + return (gchar*)wmem_strbuf_get_str(strbuf); +} + guint8 * tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding) { @@ -2400,22 +2495,9 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g encoding & ENC_LITTLE_ENDIAN); break; - case ENC_EBCDIC: - /* - * XXX - do the copy and conversion in one pass. - * - * XXX - multiple "dialects" of EBCDIC? 
- */ - size = tvb_strsize(tvb, offset); - strptr = (guint8 *)wmem_alloc(scope, size); - tvb_memcpy(tvb, strptr, offset, size); - EBCDIC_to_ASCII(strptr, size); - if (lengthp) - *lengthp = size; - break; - - case ENC_WINDOWS_1250: - strptr = tvb_get_stringz_unichar2(scope, tvb, offset, lengthp, charset_table_cp1250); + case ENC_UCS_4: + strptr = tvb_get_ucs_4_stringz(scope, tvb, offset, lengthp, + encoding & ENC_LITTLE_ENDIAN); break; case ENC_ISO_8859_1: @@ -2482,6 +2564,24 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g case ENC_ISO_8859_16: strptr = tvb_get_stringz_unichar2(scope, tvb, offset, lengthp, charset_table_iso_8859_16); break; + + case ENC_WINDOWS_1250: + strptr = tvb_get_stringz_unichar2(scope, tvb, offset, lengthp, charset_table_cp1250); + break; + + case ENC_EBCDIC: + /* + * XXX - do the copy and conversion in one pass. + * + * XXX - multiple "dialects" of EBCDIC? + */ + size = tvb_strsize(tvb, offset); + strptr = (guint8 *)wmem_alloc(scope, size); + tvb_memcpy(tvb, strptr, offset, size); + EBCDIC_to_ASCII(strptr, size); + if (lengthp) + *lengthp = size; + break; } return strptr; |