aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Gerald Combs <gerald.combs@riverbed.com> 2018-09-14 17:41:28 +0000
committer Gerald Combs <gerald@wireshark.org> 2018-09-14 18:01:49 +0000
commit 56a30766ef40812fe79a0b04461d91a7aa170825 (patch)
tree 55d83265a2f3501d14e56e598d78f86e3a891d28
parent 17c81011079acd905b082b8fc8caaef0156fad59 (diff)
make-manuf.py: Improve truncation.
If the PyICU module is available, use it to truncate manufacturer names by grapheme clusters.

Change-Id: Ib7dcbb126809df496a534f44a47871a1b28dc539
Reviewed-on: https://code.wireshark.org/review/29660
Reviewed-by: Gerald Combs <gerald@wireshark.org>
-rwxr-xr-x tools/make-manuf.py 39
1 files changed, 31 insertions, 8 deletions
diff --git a/tools/make-manuf.py b/tools/make-manuf.py
index 693cacd112..a60fa946dd 100755
--- a/tools/make-manuf.py
+++ b/tools/make-manuf.py
@@ -32,6 +32,14 @@ if sys.version_info[0] >= 3:
else:
import urllib
+have_icu = False
+try:
+ # Use the grapheme or segments module instead?
+ import icu
+ have_icu = True
+except ImportError:
+ pass
+
def exit_msg(msg=None, status=1):
if msg is not None:
sys.stderr.write(msg + '\n\n')
@@ -69,15 +77,30 @@ def shorten(manuf):
manuf = re.sub('\W(the|incorporated|inc|plc|systems|corporation|corp|s/a|a/s|ab|ag|kg|gmbh|company|co|limited|ltd|holding|spa)(?= )', '', manuf, flags=re.IGNORECASE)
# Remove all spaces
manuf = re.sub('\s+', '', manuf)
- # Truncate all names to a reasonable length, say, 8 characters.
- # If the string contains UTF-8, this may be substantially more than 8
- # bytes. It might also be less than 8 visible characters. Python slices
- # unicode strings by code point, which is better than raw bytes but not
- # as good as grapheme clusters. https://bugs.python.org/issue30717
+
+ # Truncate names to a reasonable length, say, 8 characters. If
+ # the string contains UTF-8, this may be substantially more than
+ # 8 bytes. It might also be less than 8 visible characters. Plain
+ # Python slices Unicode strings by code point, which is better
+ # than raw bytes but not as good as grapheme clusters. PyICU
+ # supports grapheme clusters. https://bugs.python.org/issue30717
#
- # In our case 'Savroni̇k Elektroni̇k' is truncated to 'Savroni̇', which
- # is 7 visible characters, 8 code points, and 9 bytes.
- manuf = manuf[:8]
+ # In our case plain Python truncates 'Savroni̇k Elektroni̇k'
+ # to 'Savroni̇', which is 7 visible characters, 8 code points,
+ # and 9 bytes.
+
+ # Truncate by code points
+ trunc_len = 8
+
+ if have_icu:
+ # Truncate by grapheme clusters
+ bi_ci = icu.BreakIterator.createCharacterInstance(icu.Locale('en_US'))
+ bi_ci.setText(manuf)
+ bounds = list(bi_ci)
+ bounds = bounds[0:8]
+ trunc_len = bounds[-1]
+
+ manuf = manuf[:trunc_len]
if manuf.lower() == orig_manuf.lower():
# Original manufacturer name was short and simple.