aboutsummaryrefslogtreecommitdiffstats
path: root/epan/charsets.c
diff options
context:
space:
mode:
author Guy Harris <guy@alum.mit.edu> 2019-07-14 20:18:14 -0700
committer Guy Harris <guy@alum.mit.edu> 2019-07-15 07:50:30 +0000
commit e26e0b4de071caa9bbfbde61dcc682ab6ede099e (patch)
tree 74d2e51abaddae97ce8add71750aa03b57fc7206 /epan/charsets.c
parent 258a5f6a173f7ae5fa9ccf9e709b22bbaa4f190b (diff)
Add support for the ISO 646 "Basic code table" encoding.
The "Basic code table" in ISO 646 is mostly ASCII, but some code points either 1) have more than one glyph that can be assigned to them or 2) have no glyph assigned to them. National versions choose one of the two glyphs for the code points in group 1) and assign specific glyphs to the code points in group 2); the International Reference Version assigns the same glyphs to those code points as does ASCII. For the "Basic code table" encoding, we map the code points in groups 1) and 2) to a REPLACEMENT CHARACTER; additional encodings can be added for the national versions. Add ENC_ISO_646_IRV (International Reference Version) as an alias for ENC_ASCII. Expand some comments, and add some comments, while we're at it. Change-Id: I4f1b5e426ec193775e919731c5cae1224dc65115 Reviewed-on: https://code.wireshark.org/review/33941 Petri-Dish: Guy Harris <guy@alum.mit.edu> Tested-by: Petri Dish Buildbot Reviewed-by: Guy Harris <guy@alum.mit.edu>
Diffstat (limited to 'epan/charsets.c')
-rw-r--r-- epan/charsets.c 71
1 files changed, 62 insertions, 9 deletions
diff --git a/epan/charsets.c b/epan/charsets.c
index f953a4727e..aa6c237703 100644
--- a/epan/charsets.c
+++ b/epan/charsets.c
@@ -81,6 +81,59 @@ get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
}
/*
+ * ISO 646 "Basic code table".
+ */
+const gunichar2 charset_table_iso_646_basic[0x80] = { /* one entry per 7-bit code point; UNREPL marks code points with more than one, or no, assigned glyph */
+ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, /* 0x00 - */
+ 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* - 0x0F */
+ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, /* 0x10 - */
+ 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* - 0x1F */
+ 0x0020, 0x0021, 0x0022, UNREPL, UNREPL, 0x0025, 0x0026, 0x0027, /* 0x20 - (0x23 and 0x24 replaced) */
+ 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* - 0x2F */
+ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0x30 - */
+ 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* - 0x3F */
+ UNREPL, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 0x40 - (0x40 replaced) */
+ 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* - 0x4F */
+ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, /* 0x50 - */
+ 0x0058, 0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, 0x005f, /* - 0x5F (0x5B through 0x5E replaced) */
+ UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 0x60 - (0x60 replaced) */
+ 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* - 0x6F */
+ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, /* 0x70 - */
+ 0x0078, 0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, 0x007f, /* - 0x7F (0x7B through 0x7E replaced) */
+};
+
+/*
+ * Given a wmem scope, a pointer, a length, and a translation table,
+ * treat the string of bytes referred to by the pointer and length as a
+ * string encoded using one octet per character, with octets with the
+ * high-order bit clear being mapped by the translation table to 2-byte
+ * Unicode Basic Multilingual Plane characters (including REPLACEMENT
+ * CHARACTER) and octets with the high-order bit set being mapped to
+ * REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
+ * allocated using the wmem scope.
+ */
+guint8 *
+get_iso_646_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80])
+{
+ wmem_strbuf_t *str; /* accumulates the converted output */
+
+ str = wmem_strbuf_sized_new(scope, length+1, 0); /* NOTE(review): length+1 looks like an initial size hint (one byte per input octet plus a terminator) - confirm against wmem_strbuf_sized_new */
+
+ while (length > 0) { /* a zero or negative length appends nothing */
+ guint8 ch = *ptr;
+
+ if (ch < 0x80) /* high-order bit clear: ch is a valid index into the 0x80-entry table */
+ wmem_strbuf_append_unichar(str, table[ch]);
+ else /* high-order bit set: outside the 7-bit table, so append UNREPL instead */
+ wmem_strbuf_append_unichar(str, UNREPL);
+ ptr++;
+ length--;
+ }
+
+ return (guint8 *) wmem_strbuf_finalize(str); /* result of finalizing the buffer, cast to guint8 * */
+}
+
+/*
* Given a wmem scope, a pointer, and a length, treat the string of bytes
* referred to by the pointer and length as an ISO 8859/1 string, and
* return a pointer to a UTF-8 string, allocated using the wmem scope.
@@ -577,11 +630,11 @@ const gunichar2 charset_table_cp866[0x80] = {
};
/*
- * Given a wmem scope, a pointer, a length, and a translation table,
- * treat the string of bytes referred to by the pointer and length as a
- * string encoded using one octet per character, with octets with the
- * high-order bit clear being ASCII and octets with the high-order bit
- * set being mapped by the translation table to 2-byte Unicode Basic
+ * Given a wmem scope, a pointer, a length, and a translation table with
+ * 128 entries, treat the string of bytes referred to by the pointer and
+ * length as a string encoded using one octet per character, with octets
+ * with the high-order bit clear being ASCII and octets with the high-order
+ * bit set being mapped by the translation table to 2-byte Unicode Basic
* Multilingual Plane characters (including REPLACEMENT CHARACTER), and
* return a pointer to a UTF-8 string, allocated using the wmem scope.
*/
@@ -1196,10 +1249,10 @@ const gunichar2 charset_table_ebcdic_cp037[256] = {
};
/*
- * Given a wmem scope, a pointer, a length, and a translation table,
- * treat the string of bytes referred to by the pointer and length as a
- * string encoded using one octet per character, with octets being
- * mapped by the translation table to 2-byte Unicode Basic Multilingual
+ * Given a wmem scope, a pointer, a length, and a translation table with
+ * 256 entries, treat the string of bytes referred to by the pointer and
+ * length as a string encoded using one octet per character, with octets
+ * being mapped by the translation table to 2-byte Unicode Basic Multilingual
* Plane characters (including REPLACEMENT CHARACTER), and return a
* pointer to a UTF-8 string, allocated using the wmem scope.
*/