diff options
author | Martin Kacer <kacer.martin@gmail.com> | 2020-06-03 07:34:43 +0200 |
---|---|---|
committer | Dario Lombardo <lomato@gmail.com> | 2020-07-16 14:50:09 +0000 |
commit | 9b5f07d8295d1857d2039f6b868e1848ec13bc60 (patch) | |
tree | b5407e54c250ce60a27552982698f75dd1709854 /tools | |
parent | 3dedaf80648c00a5c1e7b6498d553f793600cc23 (diff) |
json2pcap: Added pcap masking and anonymization support
The script includes the following changes:
- Added pcap masking and anonymization support
- Support to mask/anonymize only portion of field
- Added reading from stdin
- Changed json to ijson library to support large files
- Migrated from text2pcap to scapy for pcap generation
- Added version to script
The development repo is located here:
https://github.com/H21lab/json2pcap
Change-Id: I8fc5e282caa604e188f05818f7a2f8875afb8b73
Reviewed-on: https://code.wireshark.org/review/37371
Reviewed-by: Dario Lombardo <lomato@gmail.com>
Diffstat (limited to 'tools')
-rw-r--r-- | tools/json2pcap/json2pcap.py | 367 |
1 file changed, 270 insertions, 97 deletions
diff --git a/tools/json2pcap/json2pcap.py b/tools/json2pcap/json2pcap.py index 0f9cebdc0b..3c551be1a2 100644 --- a/tools/json2pcap/json2pcap.py +++ b/tools/json2pcap/json2pcap.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Copyright 2017, Martin Kacer <kacer.martin[AT]gmail.com> +# Copyright 2020, Martin Kacer <kacer.martin[AT]gmail.com> and contributors # # Wireshark - Network traffic analyzer # By Gerald Combs <gerald@wireshark.org> @@ -11,7 +11,7 @@ # SPDX-License-Identifier: GPL-2.0-or-later import sys -import json +import ijson import operator import copy import os @@ -19,7 +19,89 @@ import binascii import array import argparse import subprocess +import string +import random +import math +import hashlib +import re from collections import OrderedDict +from scapy import all as scapy + +try: + # Python 2 forward compatibility + range = xrange +except NameError: + pass + +# Field anonymization class +class AnonymizedField: + ''' + The Anonymization field object specifying anonymization + :filed arg: field name + :type arg: anonymization type [0 masking 0xff, 1 anonymization shake_256] + :start arg: If specified, the anonymization starts at given byte number + :end arg: If specified, the anonymization ends at given byte number + ''' + def __init__(self, field, type): + self.field = field + self.type = type + self.start = None + self.end = None + + match = re.search(r'(\S+)\[(-?\d+)?:(-?\d+)?\]', field) + if match: + self.field = match.group(1) + self.start = match.group(2) + if self.start is not None: + self.start = int(self.start) + self.end = match.group(3) + if self.end is not None: + self.end = int(self.end) + + # Returns the new field value after anonymization + def anonymize_field_shake256(self, field, type, salt): + shake = hashlib.shake_256(str(field + ':' + salt).encode('utf-8')) + + # String type, output should be ASCII + if type in [26, 27, 28]: + length = math.ceil(len(field)/4) + shake_hash = shake.hexdigest(length) + ret_string = array.array('B', 
str.encode(shake_hash)) + ret_string = ''.join('{:02x}'.format(x) for x in ret_string) + # Other types, output could be HEX + else: + length = math.ceil(len(field)/2) + shake_hash = shake.hexdigest(length) + ret_string = shake_hash + + # Correct the string length + if (len(ret_string) < len(field)): + ret_string = ret_string.ljust(len(field)) + if (len(ret_string) > len(field)): + ret_string = ret_string[:len(field)] + + return ret_string + + def anonymize_field(self, _h, _t, salt): + s = 0 + e = None + if self.start: + s = self.start + if self.end: + e = self.end + if e < 0: + e = len(_h) + e + else: + e = len(_h) + h = _h[s:e] + if self.type == 0: + h = 'f' * len(h) + elif self.type == 1: + h = self.anonymize_field_shake256(h, _t, salt) + + h_mask = '0' * len(_h[0:s]) + 'f' * len(h) + '0' * len(_h[e:]) + h = _h[0:s] + h + _h[e:] + return [h, h_mask] def make_unique(key, dct): counter = 0 @@ -69,7 +151,7 @@ py_header = """#!/usr/bin/env python # -*- coding: utf-8 -*- # File generated by json2pcap.py -# json2pcap.py created by Martin Kacer, 2017 +# json2pcap.py created by Martin Kacer, 2020 import os import binascii @@ -77,6 +159,13 @@ import array import sys import subprocess from collections import OrderedDict +from scapy import all as scapy + +try: + # Python 2 forward compatibility + range = xrange +except NameError: + pass # ***************************************************** # * PACKET PAYLOAD GENERATED FROM INPUT PCAP * @@ -94,10 +183,9 @@ py_footer = """ generate_pcap(d) # ***************************************************** """ -py_footer = py_footer + read_py_function("to_pcap_file") -py_footer = py_footer + read_py_function("hex_to_txt") py_footer = py_footer + read_py_function("to_bytes") py_footer = py_footer + read_py_function("lsb") +py_footer = py_footer + read_py_function("multiply_strings") py_footer = py_footer + read_py_function("rewrite_frame") py_footer = py_footer + read_py_function("assemble_frame") py_footer = py_footer + 
read_py_function("generate_pcap") @@ -116,25 +204,6 @@ if __name__ == '__main__': # # ********** FUNCTIONS *********** # -def to_pcap_file(filename, output_pcap_file): - subprocess.call(["text2pcap", filename, output_pcap_file]) - -def hex_to_txt(hexstring, output_file): - h = hexstring.lower() - - file = open(output_file, 'a') - - for i in range(0, len(h), 2): - if(i % 32 == 0): - file.write(format(i / 2, '06x') + ' ') - - file.write(h[i:i + 2] + ' ') - - if(i % 32 == 30): - file.write('\n') - - file.write('\n') - file.close() def raw_flat_collector(dict): if hasattr(dict, 'items'): @@ -245,31 +314,46 @@ def py_generator(d, r, frame_name='frame_raw', frame_position=0): for _v in v: py_generator(_v, r, frame_name, frame_position) - - - # To emulate Python 3.2 def to_bytes(n, length, endianess='big'): h = '%x' % n - s = ('0' * (len(h) % 2) + h).zfill(length * 2).decode('hex') + s = bytearray.fromhex(('0' * (len(h) % 2) + h).zfill(length * 2)) return s if endianess == 'big' else s[::-1] # Returns the index, counting from 0, of the least significant set bit in x def lsb(x): return (x & -x).bit_length() - 1 +# Replace parts of original_string by new_string, only if mask in the byte is not ff +def multiply_strings(original_string, new_string, mask): + + ret_string = new_string + if mask == None: + return ret_string + for i in range(0, min(len(original_string), len(new_string), len(mask)), 2): + if mask[i:i + 2] == 'ff': + #print("ff") + ret_string = ret_string[:i] + original_string[i:i + 2] + ret_string[i + 2:] + + return ret_string + # Rewrite frame # h - hex bytes # p - position # l - length # b - bitmask # t - type -def rewrite_frame(frame_raw, h, p, l, b, t): +# frame_amask - optional, anonymization mask (00 - not anonymized byte, ff - anonymized byte) +def rewrite_frame(frame_raw, h, p, l, b, t, frame_amask = None): + if p < 0 or l < 0 or h is None: + return frame_raw + # no bitmask if(b == 0): if (len(h) != l): l = len(h) - return frame_raw[:p] + h + frame_raw[p + 
l:] + frame_raw_new = frame_raw[:p] + h + frame_raw[p + l:] + return multiply_strings(frame_raw, frame_raw_new, frame_amask) # bitmask else: # get hex string from frame which will be replaced @@ -283,7 +367,7 @@ def rewrite_frame(frame_raw, h, p, l, b, t): # Only replace bits defined by mask # new_hex = (old_hex & !mask) | (new_hex & mask) - _H = _h.decode("hex") + _H = bytearray.fromhex(_h) _H = array.array('B', _H) M = to_bytes(b, len(_H)) @@ -291,11 +375,11 @@ def rewrite_frame(frame_raw, h, p, l, b, t): # shift mask aligned to position for i in range(len(M)): if (i + p / 2) < len(M): - M[i] = M[i + p / 2] + M[i] = M[i + int(p / 2)] else: M[i] = 0x00 - H = h.decode("hex") + H = bytearray.fromhex(h) H = array.array('B', H) # for i in range(len(_H)): @@ -319,17 +403,20 @@ def rewrite_frame(frame_raw, h, p, l, b, t): # print masked_h = binascii.hexlify(_H) + masked_h = masked_h.decode('ascii') - return frame_raw[:p] + masked_h + frame_raw[p + l:] + frame_raw_new = frame_raw[:p] + str(masked_h) + frame_raw[p + l:] + return multiply_strings(frame_raw, frame_raw_new, frame_amask) -def assemble_frame(d): +def assemble_frame(d, frame_time): input = d['frame_raw'][1] isFlat = False linux_cooked_header = False; while(isFlat == False): isFlat = True - for key, val in d.items(): + _d = d.copy() + for key, val in _d.items(): h = str(val[1]) # hex p = val[2] * 2 # position l = val[3] * 2 # length @@ -353,7 +440,7 @@ def assemble_frame(d): output = d['frame_raw'][1] - # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame using text2pcap + # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame if (linux_cooked_header): output = "000000000000" + output[6*2:] # replce dest MAC output = output[:12*2] + "" + output[14*2:] # remove two bytes before Protocol @@ -363,10 +450,9 @@ def assemble_frame(d): def generate_pcap(d): # 1. 
Assemble frame input = d['frame_raw'][1] - output = assemble_frame(d) + output = assemble_frame(d, None) print(input) print(output) - # 2. Testing: compare input and output for not modified json if (input != output): print("Modified frames: ") @@ -375,24 +461,23 @@ def generate_pcap(d): print(s1) print(s2) if (len(s1) == len(s2)): - d = [i for i in xrange(len(s1)) if s1[i] != s2[i]] + d = [i for i in range(len(s1)) if s1[i] != s2[i]] print(d) - - # 3. Open TMP file used by text2pcap - file = sys.argv[0] + '.tmp' - f = open(file,'w') - hex_to_txt(output, file) - f.close() - - # 4. Generate pcap - to_pcap_file(sys.argv[0] + '.tmp', sys.argv[0] + '.pcap') - print("Generated " + sys.argv[0] + ".tmp") - print("Generated " + sys.argv[0] + ".pcap") + # 3. Generate pcap + outfile = sys.argv[0] + ".pcap" + pcap_out = scapy.PcapWriter(outfile, append=False, sync=False) + new_packet = scapy.Packet(bytearray.fromhex(output)) + pcap_out.write(new_packet) + print("Generated " + outfile) # # ************ MAIN ************** # +VERSION = "1.1" + parser = argparse.ArgumentParser(description=""" +json2pcap {version} + Utility to generate pcap from json format. Packet modification: @@ -416,41 +501,107 @@ encode the packet variables. The assembling algorithm is different, because the decoded packet fields are relative and points to parent node with their position (compared to input json which has absolute positions). -""", formatter_class=argparse.RawTextHelpFormatter) -parser.add_argument('infile', nargs='+', help='json generated by tshark -T jsonraw or by tshark -T json -x') +Pcap masking and anonymization with -m and -a switch: +The script allows to mask or anonymize the selected json raw fields. If the +The fields are selected and located on lower protocol layers, they are not +The overwritten by upper fields which are not marked by these switches. 
+The pcap masking and anonymization can be performed in the following way: + +tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw" +-a "ip.dst_raw" -o anonymized.pcap +In this example the ip.src_raw field is masked with ffffffff by byte values +and ip.dst_raw is hashed by randomly generated salt. + +Additionally the following syntax is valid to anonymize portion of field +tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw[2:]" +-a "ip.dst_raw[:-2]" -o anonymized.pcap +Where the src_ip first byte is preserved and dst_ip last byte is preserved. +And the same can be achieved by +tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw[2:8]" +-a "ip.dst_raw[0:6]" -o anonymized.pcap + +Masking and anonymization limitations are mainly the following: +- In case the tshark is performing reassembling from multiple frames, the +backward pcap reconstruction is not properly performed and can result in +malformed frames. +- The new values in the fields could violate the field format, as the +json2pcap is no performing correct protocol encoding with respect to +allowed values of the target field and field encoding. + +""".format(version=VERSION), formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION) +parser.add_argument('-i', '--infile', nargs='?', help='json generated by tshark -T json -x\nor by tshark -T jsonraw (not preserving frame timestamps).\nIf no inpout file is specified script reads from stdin.') +parser.add_argument('-o', '--outfile', required=True, help='output pcap filename') parser.add_argument('-p', '--python', help='generate python payload instead of pcap (only 1st packet)', default=False, action='store_true') +parser.add_argument('-m', '--mask', help='mask the specific raw field (e.g. 
-m "ip.src_raw" -m "ip.dst_raw[2:6]")', action='append', metavar='MASKED_FIELD') +parser.add_argument('-a', '--anonymize', help='anonymize the specific raw field (e.g. -a "ip.src_raw[2:]" -a "ip.dst_raw[:-2]")', action='append', metavar='ANONYMIZED_FIELD') +parser.add_argument('-s', '--salt', help='salt use for anonymization. If no value is provided it is randomized.', default=None) +parser.add_argument('-v', '--verbose', help='verbose output', default=False, action='store_true') args = parser.parse_args() # read JSON -infile = args.infile[0] +infile = args.infile +outfile = args.outfile -with open(infile) as data_file: - #json = json.load(data_file, object_pairs_hook=OrderedDict) - json = json.load(data_file, object_pairs_hook=parse_object_pairs) +# Read from input file +if infile: + data_file = open(infile) +# Read from pipe +else: + data_file = sys.stdin + +# Parse anonymization fields +anonymize = {} +if args.mask: + for m in args.mask: + if not '_raw' in m: + print("Error: The specified fields by -m switch should be raw fields. " + m + " does not have _raw suffix") + sys.exit() + af = AnonymizedField(m, 0) + anonymize[af.field] = af +if args.anonymize: + for a in args.anonymize: + if not '_raw' in a: + print("Error: The specified fields by -a switch should be raw fields. 
" + a + " does not have _raw suffix") + sys.exit() + af = AnonymizedField(a, 1) + anonymize[af.field] = af input_frame_raw = '' frame_raw = '' +frame_time = None + +salt = args.salt +if salt is None: + # generate random salt if no salt was provided + salt = ''.join(random.SystemRandom().choice(string.ascii_letters + string.digits) for _ in range(10)) # Generate pcap if args.python == False: - # open TMP file used by text2pcap - file = infile + '.tmp' - f = open(file, 'w') + pcap_out = scapy.PcapWriter(outfile, append=False, sync=False) # Iterate over packets in JSON - for packet in json: + for packet in ijson.items(data_file, "item", buf_size=200000): _list = [] linux_cooked_header = False; # get flat raw fields into _list for raw in raw_flat_collector(packet['_source']['layers']): - if (raw[0] == "frame_raw"): - frame_raw = raw[1][0] - input_frame_raw = copy.copy(frame_raw) - else: - _list.append(raw[1]) - if (raw[0] == "sll_raw"): - linux_cooked_header = True + if len(raw) >= 2: + if (raw[0] == "frame_raw"): + frame_raw = raw[1][0] + frame_amask = "0"*len(frame_raw) # initialize anonymization mask + input_frame_raw = copy.copy(frame_raw) + frame_time = None + if 'frame.time_epoch' in packet['_source']['layers']['frame']: + frame_time = packet['_source']['layers']['frame']['frame.time_epoch'] + else: + # add into value list into raw[5] the field name + if isinstance(raw[1], list): + raw[1].append(raw[0]) + _list.append(raw[1]) + if (raw[0] == "sll_raw"): + linux_cooked_header = True # sort _list sorted_list = sorted(_list, key=operator.itemgetter(1), reverse=False) @@ -459,25 +610,47 @@ if args.python == False: # rewrite frame for raw in sorted_list: - h = str(raw[0]) # hex - p = raw[1] * 2 # position - l = raw[2] * 2 # length - b = raw[3] # bitmask - t = raw[4] # type - - if (isinstance(p, (list, tuple)) or isinstance(l, (list, tuple))): - for r in raw: - _h = str(r[0]) # hex - _p = r[1] * 2 # position - _l = r[2] * 2 # length - _b = r[3] # bitmask - _t = r[4] # 
type + if len(raw) >= 6: + h = str(raw[0]) # hex + p = raw[1] * 2 # position + l = raw[2] * 2 # length + b = raw[3] # bitmask + t = raw[4] # type + # raw[5] # field_name (added by script) + h_mask = h # hex for anonymization mask + + # anonymize fields + if (raw[5] in anonymize): + [h, h_mask] = anonymize[raw[5]].anonymize_field(h, t, salt) + + if (isinstance(p, (list, tuple)) or isinstance(l, (list, tuple))): + for r in raw: + _h = str(r[0]) # hex + _p = r[1] * 2 # position + _l = r[2] * 2 # length + _b = r[3] # bitmask + _t = r[4] # type + # raw[5] # field_name (added by script) + _h_mask = _h # hex for anonymization mask + + # anonymize fields + if (raw[5] in anonymize): + [_h, _h_mask] = anonymize[raw[5]].anonymize_field(_h, _t, salt) + + # print("Debug: " + str(raw)) + frame_raw = rewrite_frame(frame_raw, _h, _p, _l, _b, _t, frame_amask) + + # update anonymization mask + if (raw[5] in anonymize): + frame_amask = rewrite_frame(frame_amask, _h_mask, _p, _l, _b, _t) + + else: # print("Debug: " + str(raw)) - frame_raw = rewrite_frame(frame_raw, _h, _p, _l, _b, _t) + frame_raw = rewrite_frame(frame_raw, h, p, l, b, t, frame_amask) - else: - # print("Debug: " + str(raw)) - frame_raw = rewrite_frame(frame_raw, h, p, l, b, t) + # update anonymization mask + if (raw[5] in anonymize): + frame_amask = rewrite_frame(frame_amask, h_mask, p, l, b, t) # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame using text2pcap if (linux_cooked_header): @@ -485,28 +658,28 @@ if args.python == False: frame_raw = frame_raw[:12 * 2] + "" + frame_raw[14 * 2:] # remove two bytes before Protocol # Testing: remove comment to compare input and output for not modified json - if (input_frame_raw != frame_raw): + if (args.verbose and input_frame_raw != frame_raw): print("Modified frames: ") s1 = input_frame_raw s2 = frame_raw print(s1) print(s2) if (len(s1) == len(s2)): - d = [i for i in xrange(len(s1)) if s1[i] != s2[i]] + d = [i for i in range(len(s1)) 
if s1[i] != s2[i]] print(d) - hex_to_txt(frame_raw, file) - - f.close() - to_pcap_file(infile + '.tmp', sys.argv[1] + '.pcap') - os.remove(infile + '.tmp') + new_packet = scapy.Packet(bytearray.fromhex(frame_raw)) + if frame_time: + new_packet.time = float(frame_time) + pcap_out.write(new_packet) # Generate python payload only for first packet else: - file = infile + '.py' - f = open(file, 'w') + py_outfile = outfile + '.py' + f = open(py_outfile, 'w') - for packet in json: + #for packet in json: + for packet in ijson.items(data_file, "item", buf_size=200000): f.write(py_header) r = OrderedDict({}) @@ -514,7 +687,7 @@ else: #print "packet = " + str(packet['_source']['layers']) py_generator(packet['_source']['layers'], r) - for key, value in r.iteritems() : + for key, value in r.items() : f.write(" d['" + key + "'] =",) f.write(" " + str(value) + "\n") @@ -523,6 +696,6 @@ else: # Currently only first packet is used from pcap f.close - print("Generated " + infile + '.py') + print("Generated " + py_outfile) break |