aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- epan/dissectors/packet-bacapp.c 84
-rw-r--r-- epan/proto.h 42
-rw-r--r-- epan/tvbuff.c 176
3 files changed, 159 insertions, 143 deletions
diff --git a/epan/dissectors/packet-bacapp.c b/epan/dissectors/packet-bacapp.c
index d8ddddd3a8..2382b9fd7c 100644
--- a/epan/dissectors/packet-bacapp.c
+++ b/epan/dissectors/packet-bacapp.c
@@ -2310,22 +2310,6 @@ proto_register_bacapp(void);
void
proto_reg_handoff_bacapp(void);
-/**
- * converts XXX coded strings to UTF-8
- * else 'in' is copied to 'out'
- * @param in -- pointer to string
- * @param inbytesleft size of int bytes
- * @param out -- pointer to string
- * @param outbytesleft size of out bytes
- * @param fromcoding coding type
- * @return count of modified characters of returned string, -1 for errors
- */
-static guint32
-fConvertXXXtoUTF8(gchar *in, gsize *inbytesleft, gchar *out, gsize *outbytesleft, const gchar *fromcoding);
-
-static void
-uni_to_string(char * data, gsize str_length, char *dest_buf);
-
/* <<<< formerly bacapp.h */
/* reassembly table for segmented messages */
@@ -6245,9 +6229,8 @@ fCharacterString(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint offs
{
guint8 tag_no, tag_info, character_set;
guint32 lvt, l;
- gsize inbytesleft, outbytesleft = 512;
+ gsize inbytesleft;
guint offs, extra = 1;
- guint8 *str_val;
const char *coding;
guint8 bf_arr[512], *out = &bf_arr[0];
proto_item *ti;
@@ -6299,8 +6282,7 @@ fCharacterString(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint offs
coding = "JIS C 6226";
break;
case ISO_10646_UCS4:
- str_val = tvb_get_string(wmem_packet_scope(), tvb, offset, l);
- fConvertXXXtoUTF8(str_val, &inbytesleft, out, &outbytesleft, "UCS-4BE");
+ out = tvb_get_string_enc(wmem_packet_scope(), tvb, offset, l, ENC_UCS_4|ENC_BIG_ENDIAN);
coding = "ISO 10646 UCS-4";
break;
case ISO_10646_UCS2:
@@ -11040,68 +11022,6 @@ bacapp_init_routine(void)
&addresses_reassembly_table_functions);
}
-static guint32
-fConvertXXXtoUTF8(gchar *in, gsize *inbytesleft, gchar *out, gsize *outbytesleft, const gchar *fromcoding)
-{
- guint32 i;
- GIConv icd;
-
- if ((icd = g_iconv_open("UTF-8", fromcoding)) != (GIConv) -1) {
- i = (guint32) g_iconv(icd, &in, inbytesleft, &out, outbytesleft);
- /* g_iconv incremented 'out'; now ensure it's NULL terminated */
- out[0] = '\0';
-
- g_iconv_close(icd);
- return i;
- }
-
- uni_to_string(in, *inbytesleft, out);
- out[*inbytesleft] = '\0';
- *outbytesleft -= *inbytesleft;
- *inbytesleft = 0;
-
- return 0;
-}
-
-static void
-uni_to_string(char * data, gsize str_length, char *dest_buf)
-{
- gint i;
- guint16 c_char;
- gsize length_remaining;
-
- length_remaining = str_length;
- dest_buf[0] = '\0';
- if (str_length == 0) {
- return;
- }
- for ( i = 0; i < (gint) str_length; i++ ) {
- c_char = data[i];
- if ((c_char < 0x20) || (c_char > 0x7e)) {
- if (c_char != 0x00) {
- c_char = '.';
- dest_buf[i] = c_char & 0xff;
- } else {
- i--;
- str_length--;
- }
- } else {
- dest_buf[i] = c_char & 0xff;
- }
- length_remaining--;
-
- if (length_remaining == 0) {
- dest_buf[i+1] = '\0';
- return;
- }
- }
- if (i < 0) {
- i = 0;
- }
- dest_buf[i] = '\0';
- return;
-}
-
void
proto_register_bacapp(void)
{
diff --git a/epan/proto.h b/epan/proto.h
index 8a299e397b..2c29514a73 100644
--- a/epan/proto.h
+++ b/epan/proto.h
@@ -270,10 +270,6 @@ WS_DLL_PUBLIC WS_MSVC_NORETURN void proto_report_dissector_bug(const char *messa
* For UTF-8, invalid UTF-8 sequences should be mapped to the same
* code point.
*
- * We also don't process UTF-16 or UCS-2 differently - we don't
- * handle surrogate pairs, and don't handle 2-byte values that
- * aren't valid in UTF-16 or UCS-2 strings.
- *
* For display, perhaps we should also map control characters to the
* Unicode glyphs showing the name of the control character in small
* caps, diagonally. (Unfortunately, those only exist for C0, not C1.)
@@ -283,31 +279,31 @@ WS_DLL_PUBLIC WS_MSVC_NORETURN void proto_report_dissector_bug(const char *messa
#define ENC_UTF_8 0x00000002
#define ENC_UTF_16 0x00000004
#define ENC_UCS_2 0x00000006
-#define ENC_EBCDIC 0x00000008
-#define ENC_WINDOWS_1250 0x0000000A
-#define ENC_ISO_8859_1 0x0000000C
-#define ENC_ISO_8859_2 0x0000000E
-#define ENC_ISO_8859_3 0x00000010
-#define ENC_ISO_8859_4 0x00000012
-#define ENC_ISO_8859_5 0x00000014
-#define ENC_ISO_8859_6 0x00000016
-#define ENC_ISO_8859_7 0x00000018
-#define ENC_ISO_8859_8 0x0000001A
-#define ENC_ISO_8859_9 0x0000001C
-#define ENC_ISO_8859_10 0x0000001E
-#define ENC_ISO_8859_11 0x00000020
-/* #define ENC_ISO_8859_12 0x00000022 ISO 8859-12 was abandoned */
-#define ENC_ISO_8859_13 0x00000024
-#define ENC_ISO_8859_14 0x00000026
-#define ENC_ISO_8859_15 0x00000028
-#define ENC_ISO_8859_16 0x0000002A
+#define ENC_UCS_4 0x00000008
+#define ENC_ISO_8859_1 0x0000000A
+#define ENC_ISO_8859_2 0x0000000C
+#define ENC_ISO_8859_3 0x0000000E
+#define ENC_ISO_8859_4 0x00000010
+#define ENC_ISO_8859_5 0x00000012
+#define ENC_ISO_8859_6 0x00000014
+#define ENC_ISO_8859_7 0x00000016
+#define ENC_ISO_8859_8 0x00000018
+#define ENC_ISO_8859_9 0x0000001A
+#define ENC_ISO_8859_10 0x0000001C
+#define ENC_ISO_8859_11 0x0000001E
+/* #define ENC_ISO_8859_12 0x00000020 ISO 8859-12 was abandoned */
+#define ENC_ISO_8859_13 0x00000022
+#define ENC_ISO_8859_14 0x00000024
+#define ENC_ISO_8859_15 0x00000026
+#define ENC_ISO_8859_16 0x00000028
+#define ENC_WINDOWS_1250 0x0000002A
+#define ENC_EBCDIC 0x0000002C
/*
* TODO:
*
* These could probably be used by existing code:
*
- * ENC_UCS_4 - UCS-4
* - "IBM MS DBCS"
* - JIS C 6226
* 7-bit encodings such as ETSI 03.38 (GSM SMS character set
diff --git a/epan/tvbuff.c b/epan/tvbuff.c
index c7846f05ce..442c0e6ed2 100644
--- a/epan/tvbuff.c
+++ b/epan/tvbuff.c
@@ -1924,7 +1924,7 @@ tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gin
* Basic Multilingual Plane (plane 0) of Unicode, return a UTF-8
* string with the same characters.
*
- * Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Specify length in bytes
*
@@ -1967,7 +1967,7 @@ tvb_get_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
* Given a UTF-16 encoded Unicode string, return a UTF-8 string with the
* same characters.
*
- * Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Specify length in bytes
*
@@ -2074,6 +2074,50 @@ tvb_get_utf_16_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
}
/*
+ * Given a UCS-4-encoded Unicode string, return a UTF-8 string with the
+ * same characters.
+ *
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
+ *
+ * Specify length in bytes; each code point occupies 4 bytes
+ *
+ * If scope is NULL, memory is allocated with g_malloc() and user must
+ * explicitly free it with g_free().
+ * If scope is not NULL, memory is allocated with the corresponding pool
+ * lifetime.
+ *
+ * XXX - should map lead and trail surrogate values, and code points beyond
+ * the maximum Unicode character, to a "substitute" UTF-8 character?
+ */
+static gchar *
+tvb_get_ucs_4_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding)
+{
+ gunichar uchar;
+ gint i; /* Byte counter for tvbuff */
+ wmem_strbuf_t *strbuf;
+
+ tvb_ensure_bytes_exist(tvb, offset, length);
+
+ strbuf = wmem_strbuf_new(scope, NULL);
+
+ for(i = 0; i + 3 < length; i += 4) { /* advance one whole 32-bit unit per pass */
+ if (encoding == ENC_BIG_ENDIAN)
+ uchar = tvb_get_ntohl(tvb, offset + i);
+ else
+ uchar = tvb_get_letohl(tvb, offset + i);
+
+ wmem_strbuf_append_unichar(strbuf, uchar);
+ }
+
+ /*
+ * XXX - if i < length, this means we were handed a number
+ * of bytes that's not a multiple of 4, so we're not a valid
+ * UCS-4 string; the 1-3 leftover bytes are silently ignored.
+ */
+ return (gchar*)wmem_strbuf_get_str(strbuf);
+}
+
+/*
* Given a tvbuff, an offset, a length, and an encoding, allocate a
* buffer big enough to hold a non-null-terminated string of that length
* at that offset, plus a trailing '\0', copy into the buffer the
@@ -2131,24 +2175,9 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
encoding & ENC_LITTLE_ENDIAN);
break;
- case ENC_EBCDIC:
- /*
- * XXX - do the copy and conversion in one pass.
- *
- * XXX - multiple "dialects" of EBCDIC?
- */
- tvb_ensure_bytes_exist(tvb, offset, length); /* make sure length = -1 fails */
- strbuf = (guint8 *)wmem_alloc(scope, length + 1);
- if (length != 0) {
- ptr = ensure_contiguous(tvb, offset, length);
- memcpy(strbuf, ptr, length);
- EBCDIC_to_ASCII(strbuf, length);
- }
- strbuf[length] = '\0';
- break;
-
- case ENC_WINDOWS_1250:
- strbuf = tvb_get_string_unichar2(scope, tvb, offset, length, charset_table_cp1250);
+ case ENC_UCS_4:
+ strbuf = tvb_get_ucs_4_string(scope, tvb, offset, length,
+ encoding & ENC_LITTLE_ENDIAN);
break;
case ENC_ISO_8859_1:
@@ -2215,6 +2244,26 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
case ENC_ISO_8859_16:
strbuf = tvb_get_string_unichar2(scope, tvb, offset, length, charset_table_iso_8859_16);
break;
+
+ case ENC_WINDOWS_1250:
+ strbuf = tvb_get_string_unichar2(scope, tvb, offset, length, charset_table_cp1250);
+ break;
+
+ case ENC_EBCDIC:
+ /*
+ * XXX - do the copy and conversion in one pass.
+ *
+ * XXX - multiple "dialects" of EBCDIC?
+ */
+ tvb_ensure_bytes_exist(tvb, offset, length); /* make sure length = -1 fails */
+ strbuf = (guint8 *)wmem_alloc(scope, length + 1);
+ if (length != 0) {
+ ptr = ensure_contiguous(tvb, offset, length);
+ memcpy(strbuf, ptr, length);
+ EBCDIC_to_ASCII(strbuf, length);
+ }
+ strbuf[length] = '\0';
+ break;
}
return strbuf;
}
@@ -2298,7 +2347,7 @@ tvb_get_const_stringz(tvbuff_t *tvb, const gint offset, gint *lengthp)
* Version of tvb_get_stringz() that handles the Basic Multilingual Plane
* (plane 0) of Unicode, with each code point encoded in 16 bits.
*
- * Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Returns an allocated UTF-8 string and updates lengthp pointer with length of string (in bytes)
*
@@ -2310,7 +2359,7 @@ static gchar *
tvb_get_ucs_2_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
gunichar2 uchar;
- gint size; /* Number of UTF-16 characters */
+ gint size; /* Number of bytes in string */
gint i; /* Byte counter for tvbuff */
wmem_strbuf_t *strbuf;
@@ -2357,6 +2406,52 @@ tvb_get_utf_16_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset
return (gchar*)wmem_strbuf_get_str(strbuf);
}
+/*
+ * Version of tvb_get_stringz() that handles Unicode, with each code point
+ * encoded in 32 bits.
+ *
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
+ *
+ * Returns an allocated UTF-8 string and updates lengthp pointer with length of string (in bytes)
+ *
+ * XXX - needs to map values that are not valid Unicode characters (such as,
+ * I think, values used as the components of a UTF-16 surrogate pair) to a
+ * "substitute" UTF-8 character.
+ */
+static gchar *
+tvb_get_ucs_4_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
+{
+ gunichar code_point;
+ gint nbytes = 0; /* Bytes in the string, terminator included */
+ gint pos; /* Byte position within the string */
+ wmem_strbuf_t *utf8;
+
+ DISSECTOR_ASSERT(tvb && tvb->initialized);
+
+ /* Locate the all-zero terminator; zero reads the same in either byte order */
+ for (;;) {
+ code_point = tvb_get_ntohl(tvb, offset + nbytes);
+ nbytes += 4;
+ if (code_point == 0)
+ break;
+ }
+
+ utf8 = wmem_strbuf_new(scope, NULL);
+
+ /* Re-read every 32-bit unit in the requested byte order and append it */
+ for (pos = 0; pos < nbytes; pos += 4) {
+ code_point = (encoding == ENC_BIG_ENDIAN)
+ ? tvb_get_ntohl(tvb, offset + pos)
+ : tvb_get_letohl(tvb, offset + pos);
+ wmem_strbuf_append_unichar(utf8, code_point);
+ }
+
+ if (lengthp != NULL)
+ *lengthp = pos; /* Number of *bytes* processed */
+
+ return (gchar*)wmem_strbuf_get_str(utf8);
+}
+
guint8 *
tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
@@ -2400,22 +2495,9 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
encoding & ENC_LITTLE_ENDIAN);
break;
- case ENC_EBCDIC:
- /*
- * XXX - do the copy and conversion in one pass.
- *
- * XXX - multiple "dialects" of EBCDIC?
- */
- size = tvb_strsize(tvb, offset);
- strptr = (guint8 *)wmem_alloc(scope, size);
- tvb_memcpy(tvb, strptr, offset, size);
- EBCDIC_to_ASCII(strptr, size);
- if (lengthp)
- *lengthp = size;
- break;
-
- case ENC_WINDOWS_1250:
- strptr = tvb_get_stringz_unichar2(scope, tvb, offset, lengthp, charset_table_cp1250);
+ case ENC_UCS_4:
+ strptr = tvb_get_ucs_4_stringz(scope, tvb, offset, lengthp,
+ encoding & ENC_LITTLE_ENDIAN);
break;
case ENC_ISO_8859_1:
@@ -2482,6 +2564,24 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
case ENC_ISO_8859_16:
strptr = tvb_get_stringz_unichar2(scope, tvb, offset, lengthp, charset_table_iso_8859_16);
break;
+
+ case ENC_WINDOWS_1250:
+ strptr = tvb_get_stringz_unichar2(scope, tvb, offset, lengthp, charset_table_cp1250);
+ break;
+
+ case ENC_EBCDIC:
+ /*
+ * XXX - do the copy and conversion in one pass.
+ *
+ * XXX - multiple "dialects" of EBCDIC?
+ */
+ size = tvb_strsize(tvb, offset);
+ strptr = (guint8 *)wmem_alloc(scope, size);
+ tvb_memcpy(tvb, strptr, offset, size);
+ EBCDIC_to_ASCII(strptr, size);
+ if (lengthp)
+ *lengthp = size;
+ break;
}
return strptr;