make-manuf.py: Improve truncation.

If the PyICU module is available, use it to truncate manufacturer names by grapheme clusters.

Change-Id: Ib7dcbb126809df496a534f44a47871a1b28dc539
Reviewed-on: https://code.wireshark.org/review/29660
Reviewed-by: Gerald Combs <gerald@wireshark.org>
author: Gerald Combs <gerald.combs@riverbed.com> 2018-09-14 17:41:28 +0000
committer: Gerald Combs <gerald@wireshark.org> 2018-09-14 18:01:49 +0000
commit: 56a30766ef40812fe79a0b04461d91a7aa170825 (patch)
tree: 55d83265a2f3501d14e56e598d78f86e3a891d28
parent: 17c81011079acd905b082b8fc8caaef0156fad59 (diff)
1 files changed, 31 insertions, 8 deletions
diff --git a/tools/make-manuf.py b/tools/make-manuf.py
index 693cacd112..a60fa946dd 100755
--- a/tools/make-manuf.py
+++ b/tools/make-manuf.py
@@ -32,6 +32,14 @@ if sys.version_info[0] >= 3:
 else:
     import urllib
 
+have_icu = False
+try:
+    # Use the grapheme or segments module instead?
+    import icu
+    have_icu = True
+except ImportError:
+    pass
+
 def exit_msg(msg=None, status=1):
     if msg is not None:
         sys.stderr.write(msg + '\n\n')
@@ -69,15 +77,30 @@ def shorten(manuf):
     manuf = re.sub('\W(the|incorporated|inc|plc|systems|corporation|corp|s/a|a/s|ab|ag|kg|gmbh|company|co|limited|ltd|holding|spa)(?= )', '', manuf, flags=re.IGNORECASE)
     # Remove all spaces
     manuf = re.sub('\s+', '', manuf)
-    # Truncate all names to a reasonable length, say, 8 characters.
-    # If the string contains UTF-8, this may be substantially more than 8
-    # bytes. It might also be less than 8 visible characters. Python slices
-    # unicode strings by code point, which is better than raw bytes but not
-    # as good as grapheme clusters. https://bugs.python.org/issue30717
+
+    # Truncate names to a reasonable length, say, 8 characters. If
+    # the string contains UTF-8, this may be substantially more than
+    # 8 bytes. It might also be less than 8 visible characters. Plain
+    # Python slices Unicode strings by code point, which is better
+    # than raw bytes but not as good as grapheme clusters. PyICU
+    # supports grapheme clusters. https://bugs.python.org/issue30717
     #
-    # In our case 'Savroni̇k Elektroni̇k' is truncated to 'Savroni̇', which
-    # is 7 visible characters, 8 code points, and 9 bytes.
-    manuf = manuf[:8]
+    # In our case plain Python truncates 'Savroni̇k Elektroni̇k'
+    # to 'Savroni̇', which is 7 visible characters, 8 code points,
+    # and 9 bytes.
+
+    # Truncate by code points
+    trunc_len = 8
+
+    if have_icu:
+        # Truncate by grapheme clusters
+        bi_ci = icu.BreakIterator.createCharacterInstance(icu.Locale('en_US'))
+        bi_ci.setText(manuf)
+        bounds = list(bi_ci)
+        bounds = bounds[0:8]
+        trunc_len = bounds[-1]
+
+    manuf = manuf[:trunc_len]
 
     if manuf.lower() == orig_manuf.lower():
         # Original manufacturer name was short and simple.
author	Gerald Combs <gerald.combs@riverbed.com>	2018-09-14 17:41:28 +0000
committer	Gerald Combs <gerald@wireshark.org>	2018-09-14 18:01:49 +0000
commit	56a30766ef40812fe79a0b04461d91a7aa170825 (patch)
tree	55d83265a2f3501d14e56e598d78f86e3a891d28
parent	17c81011079acd905b082b8fc8caaef0156fad59 (diff)