aboutsummaryrefslogtreecommitdiffstats
path: root/epan/charsets.h
diff options
context:
space:
mode:
author: Guy Harris <guy@alum.mit.edu> 2019-07-14 20:18:14 -0700
committer: Guy Harris <guy@alum.mit.edu> 2019-07-15 07:50:30 +0000
commit: e26e0b4de071caa9bbfbde61dcc682ab6ede099e (patch)
tree: 74d2e51abaddae97ce8add71750aa03b57fc7206 /epan/charsets.h
parent: 258a5f6a173f7ae5fa9ccf9e709b22bbaa4f190b (diff)
Add support for the ISO 646 "Basic code table" encoding.
The "Basic code table" in ISO 646 is mostly ASCII, but some code points either 1) have more than one glyph that can be assigned to them or 2) have no glyph assigned to them. National versions choose one of the two glyphs for the code points in group 1) and assign specific glyphs to the code points in group 2); the International Reference Version assigns the same glyphs to those code points as does ASCII.

For the "Basic code table" encoding, we map the code points in groups 1) and 2) to a REPLACEMENT CHARACTER; additional encodings can be added for the national versions.

Add ENC_ISO_646_IRV (International Reference Version) as an alias for ENC_ASCII.

Expand some comments, and add some comments, while we're at it.

Change-Id: I4f1b5e426ec193775e919731c5cae1224dc65115
Reviewed-on: https://code.wireshark.org/review/33941
Petri-Dish: Guy Harris <guy@alum.mit.edu>
Tested-by: Petri Dish Buildbot
Reviewed-by: Guy Harris <guy@alum.mit.edu>
Diffstat (limited to 'epan/charsets.h')
-rw-r--r-- epan/charsets.h | 88
1 files changed, 88 insertions, 0 deletions
diff --git a/epan/charsets.h b/epan/charsets.h
index 2a5306a757..9e5dd239fa 100644
--- a/epan/charsets.h
+++ b/epan/charsets.h
@@ -53,6 +53,12 @@ extern const gunichar2 charset_table_cp437[0x80];
extern const gunichar2 charset_table_cp855[0x80];
extern const gunichar2 charset_table_cp866[0x80];
+/*
+ * Translation tables that map the lower 128 code points in single-byte
+ * ISO 646-based character encodings to Unicode code points in the
+ * Basic Multilingual Plane.
+ */
+extern const gunichar2 charset_table_iso_646_basic[0x80];
/* Tables for EBCDIC code pages */
extern const gunichar2 charset_table_ebcdic[256];
@@ -70,18 +76,92 @@ extern const gunichar2 charset_table_ebcdic_cp037[256];
WS_DLL_PUBLIC guint8 *
get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
+/*
+ * Given a wmem scope, a pointer, a length, and a translation table with
+ * 128 entries, treat the string of bytes referred to by the pointer and
+ * length as a string encoded using one octet per character, with octets
+ * with the high-order bit clear being mapped by the translation table
+ * to 2-byte Unicode Basic Multilingual Plane characters (including
+ * REPLACEMENT CHARACTER) and octets with the high-order bit set being
+ * mapped to REPLACEMENT CHARACTER, and return a pointer to a UTF-8
+ * string, allocated using the wmem scope.
+ */
+WS_DLL_PUBLIC guint8 *
+get_iso_646_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80]);
+
+/*
+ * Given a wmem scope, a pointer, and a length, treat the string of bytes
+ * referred to by the pointer and length as an ISO 8859/1 string, and
+ * return a pointer to a UTF-8 string, allocated using the wmem scope.
+ */
WS_DLL_PUBLIC guint8 *
get_8859_1_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
+/*
+ * Given a wmem scope, a pointer, a length, and a translation table with
+ * 128 entries, treat the string of bytes referred to by the pointer and
+ * length as a string encoded using one octet per character, with octets
+ * with the high-order bit clear being ASCII and octets with the high-order
+ * bit set being mapped by the translation table to 2-byte Unicode Basic
+ * Multilingual Plane characters (including REPLACEMENT CHARACTER), and
+ * return a pointer to a UTF-8 string, allocated using the wmem scope.
+ */
WS_DLL_PUBLIC guint8 *
get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80]);
+/*
+ * Given a wmem scope, a pointer, and a length, treat the string of bytes
+ * referred to by the pointer and length as a UCS-2 encoded string
+ * containing characters from the Basic Multilingual Plane (plane 0) of
+ * Unicode, and return a pointer to a UTF-8 string, allocated with the
+ * wmem scope.
+ *
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
+ *
+ * Specify length in bytes.
+ *
+ * XXX - should map lead and trail surrogate values to REPLACEMENT
+ * CHARACTERs (0xFFFD)?
+ * XXX - if there are an odd number of bytes, should put a
+ * REPLACEMENT CHARACTER at the end.
+ */
WS_DLL_PUBLIC guint8 *
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
+/*
+ * Given a wmem scope, a pointer, and a length, treat the string of bytes
+ * referred to by the pointer and length as a UTF-16 encoded string, and
+ * return a pointer to a UTF-8 string, allocated with the wmem scope.
+ *
+ * See RFC 2781 section 2.2.
+ *
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
+ *
+ * Specify length in bytes.
+ *
+ * XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
+ * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
+ * XXX - if there are an odd number of bytes, should put a
+ * REPLACEMENT CHARACTER at the end.
+ */
WS_DLL_PUBLIC guint8 *
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
+/*
+ * Given a wmem scope, a pointer, and a length, treat the string of bytes
+ * referred to by the pointer and length as a UCS-4 encoded string, and
+ * return a pointer to a UTF-8 string, allocated with the wmem scope.
+ *
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
+ *
+ * Specify length in bytes.
+ *
+ * XXX - should map lead and trail surrogate values to REPLACEMENT
+ * CHARACTERs (0xFFFD)?
+ * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
+ * XXX - if the number of bytes isn't a multiple of 4, should put a
+ * REPLACEMENT CHARACTER at the end.
+ */
WS_DLL_PUBLIC guint8 *
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
@@ -93,6 +173,14 @@ WS_DLL_PUBLIC guint8 *
get_ascii_7bits_string(wmem_allocator_t *scope, const guint8 *ptr,
const gint bit_offset, gint no_of_chars);
+/*
+ * Given a wmem scope, a pointer, a length, and a translation table with
+ * 256 entries, treat the string of bytes referred to by the pointer and
+ * length as a string encoded using one octet per character, with octets
+ * being mapped by the translation table to 2-byte Unicode Basic Multilingual
+ * Plane characters (including REPLACEMENT CHARACTER), and return a
+ * pointer to a UTF-8 string, allocated using the wmem scope.
+ */
WS_DLL_PUBLIC guint8 *
get_nonascii_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[256]);