aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Peter Wu <peter@lekensteyn.nl> 2015-03-25 20:51:01 +0100
committer Anders Broman <a.broman58@gmail.com> 2015-03-26 07:14:20 +0000
commit 13c486c33f9d3a015aaf0e4ee0c99643575cd235 (patch)
tree e642365ba3a64564c35810d70100b70da09e8330
parent 785fab7f01c98c54295596d4a25a2d2bc7343335 (diff)
html2text.py: Python 2.5 compatibility, improve Unicode support
Add support for Python 2.5 (remove unicode_literals import which does not seem to be necessary), check before using break_on_hyphens (without this option in Python 2.5, the output is uglier though) and avoid the 'with' keyword.

While at it, fix reading Unicode text from file in Python 2 (tested with: echo € | tools/html2text.py) and support reading from stdin using the '-' filename.

Tested against Python 2.5.6, 2.6.6, 2.7.9, 3.2.6, 3.4.3 with the commands from the previous html2text.py commit message, and additionally with the Unicode character € as input (instead of the faq.py output).

Change-Id: I3de3f7a4e7cf7d702463c3a59758803843338a54
Reviewed-on: https://code.wireshark.org/review/7823
Reviewed-by: Peter Wu <peter@lekensteyn.nl>
Tested-by: Peter Wu <peter@lekensteyn.nl>
Petri-Dish: Peter Wu <peter@lekensteyn.nl>
Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org>
Reviewed-by: Anders Broman <a.broman58@gmail.com>
-rwxr-xr-x tools/html2text.py 29
1 files changed, 16 insertions, 13 deletions
diff --git a/tools/html2text.py b/tools/html2text.py
index 54180fabcf..3f81b191bb 100755
--- a/tools/html2text.py
+++ b/tools/html2text.py
@@ -20,8 +20,6 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-from __future__ import unicode_literals
-
__author__ = "Peter Wu <peter@lekensteyn.nl>"
__copyright__ = "Copyright 2015, Peter Wu"
__license__ = "GPL (v2 or later)"
@@ -68,8 +66,14 @@ class TextHTMLParser(HTMLParser):
if self.list_item_prefix:
initial_indent += self.list_item_prefix
indent += ' '
- wrapper = TextWrapper(width=66, break_on_hyphens=False,
- initial_indent=initial_indent, subsequent_indent=indent)
+ kwargs = {
+ 'width': 66,
+ 'initial_indent': initial_indent,
+ 'subsequent_indent': indent
+ }
+ if sys.version_info[0:2] >= (2, 6):
+ kwargs['break_on_hyphens'] = False
+ wrapper = TextWrapper(**kwargs)
return '\n'.join(wrapper.wrap(text))
def _commit_block(self, newline='\n\n'):
@@ -148,22 +152,21 @@ class TextHTMLParser(HTMLParser):
def main():
htmlparser = TextHTMLParser()
- if len(sys.argv) > 1:
- if sys.version_info[0] >= 3:
- # Python 3: read file as utf-8
- kwargs = { 'encoding': 'utf-8' }
- else:
- kwargs = {}
- with open(sys.argv[1], **kwargs) as f:
- for line in f:
- htmlparser.feed(line)
+ if len(sys.argv) > 1 and sys.argv[1] != '-':
+ filename = sys.argv[1]
+ f = open(filename, 'rb')
else:
+ filename = None
f = sys.stdin
+ try:
if hasattr(f, 'buffer'):
# Access raw (byte) buffer in Python 3 instead of decoded one
f = f.buffer
# Read stdin as Unicode string
htmlparser.feed(f.read().decode('utf-8'))
+ finally:
+ if filename is not None:
+ f.close()
htmlparser.close()
if __name__ == '__main__':