diff options
author | Martin Kacer <kacer.martin@gmail.com> | 2020-06-03 07:34:43 +0200 |
---|---|---|
committer | Dario Lombardo <lomato@gmail.com> | 2020-07-16 14:50:09 +0000 |
commit | 9b5f07d8295d1857d2039f6b868e1848ec13bc60 (patch) | |
tree | b5407e54c250ce60a27552982698f75dd1709854 /tools | |
parent | 3dedaf80648c00a5c1e7b6498d553f793600cc23 (diff) |
json2pcap: Added pcap masking and anonymization support
The script includes the following changes:
- Added pcap masking and anonymization support
- Support to mask/anonymize only portion of field
- Added reading from stdin
- Changed json to ijson library to support large files
- Migrated from text2pcap to scapy for pcap generation
- Added version to script
The development repo is located here:
https://github.com/H21lab/json2pcap
Change-Id: I8fc5e282caa604e188f05818f7a2f8875afb8b73
Reviewed-on: https://code.wireshark.org/review/37371
Reviewed-by: Dario Lombardo <lomato@gmail.com>
Diffstat (limited to 'tools')
-rw-r--r-- | tools/json2pcap/json2pcap.py | 367 |
1 file changed, 270 insertions, 97 deletions
diff --git a/tools/json2pcap/json2pcap.py b/tools/json2pcap/json2pcap.py index 0f9cebdc0b..3c551be1a2 100644 --- a/tools/json2pcap/json2pcap.py +++ b/tools/json2pcap/json2pcap.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Copyright 2017, Martin Kacer <kacer.martin[AT]gmail.com> +# Copyright 2020, Martin Kacer <kacer.martin[AT]gmail.com> and contributors # # Wireshark - Network traffic analyzer # By Gerald Combs <gerald@wireshark.org> @@ -11,7 +11,7 @@ # SPDX-License-Identifier: GPL-2.0-or-later import sys -import json +import ijson import operator import copy import os @@ -19,7 +19,89 @@ import binascii import array import argparse import subprocess +import string +import random +import math +import hashlib +import re from collections import OrderedDict +from scapy import all as scapy + +try: + # Python 2 forward compatibility + range = xrange +except NameError: + pass + +# Field anonymization class +class AnonymizedField: + ''' + The Anonymization field object specifying anonymization + :filed arg: field name + :type arg: anonymization type [0 masking 0xff, 1 anonymization shake_256] + :start arg: If specified, the anonymization starts at given byte number + :end arg: If specified, the anonymization ends at given byte number + ''' + def __init__(self, field, type): + self.field = field + self.type = type + self.start = None + self.end = None + + match = re.search(r'(\S+)\[(-?\d+)?:(-?\d+)?\]', field) + if match: + self.field = match.group(1) + self.start = match.group(2) + if self.start is not None: + self.start = int(self.start) + self.end = match.group(3) + if self.end is not None: + self.end = int(self.end) + + # Returns the new field value after anonymization + def anonymize_field_shake256(self, field, type, salt): + shake = hashlib.shake_256(str(field + ':' + salt).encode('utf-8')) + + # String type, output should be ASCII + if type in [26, 27, 28]: + length = math.ceil(len(field)/4) + shake_hash = shake.hexdigest(length) + ret_string = array.array('B', 
str.encode(shake_hash)) + ret_string = ''.join('{:02x}'.format(x) for x in ret_string) + # Other types, output could be HEX + else: + length = math.ceil(len(field)/2) + shake_hash = shake.hexdigest(length) + ret_string = shake_hash + + # Correct the string length + if (len(ret_string) < len(field)): + ret_string = ret_string.ljust(len(field)) + if (len(ret_string) > len(field)): + ret_string = ret_string[:len(field)] + + return ret_string + + def anonymize_field(self, _h, _t, salt): + s = 0 + e = None + if self.start: + s = self.start + if self.end: + e = self.end + if e < 0: + e = len(_h) + e + else: + e = len(_h) + h = _h[s:e] + if self.type == 0: + h = 'f' * len(h) + elif self.type == 1: + h = self.anonymize_field_shake256(h, _t, salt) + + h_mask = '0' * len(_h[0:s]) + 'f' * len(h) + '0' * len(_h[e:]) + h = _h[0:s] + h + _h[e:] + return [h, h_mask] def make_unique(key, dct): counter = 0 @@ -69,7 +151,7 @@ py_header = """#!/usr/bin/env python # -*- coding: utf-8 -*- # File generated by json2pcap.py -# json2pcap.py created by Martin Kacer, 2017 +# json2pcap.py created by Martin Kacer, 2020 import os import binascii @@ -77,6 +159,13 @@ import array import sys import subprocess from collections import OrderedDict +from scapy import all as scapy + +try: + # Python 2 forward compatibility + range = xrange +except NameError: + pass # ***************************************************** # * PACKET PAYLOAD GENERATED FROM INPUT PCAP * @@ -94,10 +183,9 @@ py_footer = """ generate_pcap(d) # ***************************************************** """ -py_footer = py_footer + read_py_function("to_pcap_file") -py_footer = py_footer + read_py_function("hex_to_txt") py_footer = py_footer + read_py_function("to_bytes") py_footer = py_footer + read_py_function("lsb") +py_footer = py_footer + read_py_function("multiply_strings") py_footer = py_footer + read_py_function("rewrite_frame") py_footer = py_footer + read_py_function("assemble_frame") py_footer = py_footer + 
read_py_function("generate_pcap") @@ -116,25 +204,6 @@ if __name__ == '__main__': # # ********** FUNCTIONS *********** # -def to_pcap_file(filename, output_pcap_file): - subprocess.call(["text2pcap", filename, output_pcap_file]) - -def hex_to_txt(hexstring, output_file): - h = hexstring.lower() - - file = open(output_file, 'a') - - for i in range(0, len(h), 2): - if(i % 32 == 0): - file.write(format(i / 2, '06x') + ' ') - - file.write(h[i:i + 2] + ' ') - - if(i % 32 == 30): - file.write('\n') - - file.write('\n') - file.close() def raw_flat_collector(dict): if hasattr(dict, 'items'): @@ -245,31 +314,46 @@ def py_generator(d, r, frame_name='frame_raw', frame_position=0): for _v in v: py_generator(_v, r, frame_name, frame_position) - - - # To emulate Python 3.2 def to_bytes(n, length, endianess='big'): h = '%x' % n - s = ('0' * (len(h) % 2) + h).zfill(length * 2).decode('hex') + s = bytearray.fromhex(('0' * (len(h) % 2) + h).zfill(length * 2)) return s if endianess == 'big' else s[::-1] # Returns the index, counting from 0, of the least significant set bit in x def lsb(x): return (x & -x).bit_length() - 1 +# Replace parts of original_string by new_string, only if mask in the byte is not ff +def multiply_strings(original_string, new_string, mask): + + ret_string = new_string + if mask == None: + return ret_string + for i in range(0, min(len(original_string), len(new_string), len(mask)), 2): + if mask[i:i + 2] == 'ff': + #print("ff") + ret_string = ret_string[:i] + original_string[i:i + 2] + ret_string[i + 2:] + + return ret_string + # Rewrite frame # h - hex bytes # p - position # l - length # b - bitmask # t - type -def rewrite_frame(frame_raw, h, p, l, b, t): +# frame_amask - optional, anonymization mask (00 - not anonymized byte, ff - anonymized byte) +def rewrite_frame(frame_raw, h, p, l, b, t, frame_amask = None): + if p < 0 or l < 0 or h is None: + return frame_raw + # no bitmask if(b == 0): if (len(h) != l): l = len(h) - return frame_raw[:p] + h + frame_raw[p + 
l:] + frame_raw_new = frame_raw[:p] + h + frame_raw[p + l:] + return multiply_strings(frame_raw, frame_raw_new, frame_amask) # bitmask else: # get hex string from frame which will be replaced @@ -283,7 +367,7 @@ def rewrite_frame(frame_raw, h, p, l, b, t): # Only replace bits defined by mask # new_hex = (old_hex & !mask) | (new_hex & mask) - _H = _h.decode("hex") + _H = bytearray.fromhex(_h) _H = array.array('B', _H) M = to_bytes(b, len(_H)) @@ -291,11 +375,11 @@ def rewrite_frame(frame_raw, h, p, l, b, t): # shift mask aligned to position for i in range(len(M)): if (i + p / 2) < len(M): - M[i] = M[i + p / 2] + M[i] = M[i + int(p / 2)] else: M[i] = 0x00 - H = h.decode("hex") + H = bytearray.fromhex(h) H = array.array('B', H) # for i in range(len(_H)): @@ -319,17 +403,20 @@ def rewrite_frame(frame_raw, h, p, l, b, t): # print masked_h = binascii.hexlify(_H) + masked_h = masked_h.decode('ascii') - return frame_raw[:p] + masked_h + frame_raw[p + l:] + frame_raw_new = frame_raw[:p] + str(masked_h) + frame_raw[p + l:] + return multiply_strings(frame_raw, frame_raw_new, frame_amask) -def assemble_frame(d): +def assemble_frame(d, frame_time): input = d['frame_raw'][1] isFlat = False linux_cooked_header = False; while(isFlat == False): isFlat = True - for key, val in d.items(): + _d = d.copy() + for key, val in _d.items(): h = str(val[1]) # hex p = val[2] * 2 # position l = val[3] * 2 # length @@ -353,7 +440,7 @@ def assemble_frame(d): output = d['frame_raw'][1] - # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame using text2pcap + # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame if (linux_cooked_header): output = "000000000000" + output[6*2:] # replce dest MAC output = output[:12*2] + "" + output[14*2:] # remove two bytes before Protocol @@ -363,10 +450,9 @@ def assemble_frame(d): def generate_pcap(d): # 1. 
Assemble frame input = d['frame_raw'][1] - output = assemble_frame(d) + output = assemble_frame(d, None) print(input) print(output) - # 2. Testing: compare input and output for not modified json if (input != output): print("Modified frames: ") @@ -375,24 +461,23 @@ def generate_pcap(d): print(s1) print(s2) if (len(s1) == len(s2)): - d = [i for i in xrange(len(s1)) if s1[i] != s2[i]] + d = [i for i in range(len(s1)) if s1[i] != s2[i]] print(d) - - # 3. Open TMP file used by text2pcap - file = sys.argv[0] + '.tmp' - f = open(file,'w') - hex_to_txt(output, file) - f.close() - - # 4. Generate pcap - to_pcap_file(sys.argv[0] + '.tmp', sys.argv[0] + '.pcap') - print("Generated " + sys.argv[0] + ".tmp") - print("Generated " + sys.argv[0] + ".pcap") + # 3. Generate pcap + outfile = sys.argv[0] + ".pcap" + pcap_out = scapy.PcapWriter(outfile, append=False, sync=False) + new_packet = scapy.Packet(bytearray.fromhex(output)) + pcap_out.write(new_packet) + print("Generated " + outfile) # # ************ MAIN ************** # +VERSION = "1.1" + parser = argparse.ArgumentParser(description=""" +json2pcap {version} + Utility to generate pcap from json format. Packet modification: @@ -416,41 +501,107 @@ encode the packet variables. The assembling algorithm is different, because the decoded packet fields are relative and points to parent node with their position (compared to input json which has absolute positions). -""", formatter_class=argparse.RawTextHelpFormatter) -parser.add_argument('infile', nargs='+', help='json generated by tshark -T jsonraw or by tshark -T json -x') +Pcap masking and anonymization with -m and -a switch: +The script allows to mask or anonymize the selected json raw fields. If the +The fields are selected and located on lower protocol layers, they are not +The overwritten by upper fields which are not marked by these switches. 
+The pcap masking and anonymization can be performed in the following way: + +tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw" +-a "ip.dst_raw" -o anonymized.pcap +In this example the ip.src_raw field is masked with ffffffff by byte values +and ip.dst_raw is hashed by randomly generated salt. + +Additionally the following syntax is valid to anonymize portion of field +tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw[2:]" +-a "ip.dst_raw[:-2]" -o anonymized.pcap +Where the src_ip first byte is preserved and dst_ip last byte is preserved. +And the same can be achieved by +tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw[2:8]" +-a "ip.dst_raw[0:6]" -o anonymized.pcap + +Masking and anonymization limitations are mainly the following: +- In case the tshark is performing reassembling from multiple frames, the +backward pcap reconstruction is not properly performed and can result in +malformed frames. +- The new values in the fields could violate the field format, as the +json2pcap is no performing correct protocol encoding with respect to +allowed values of the target field and field encoding. + +""".format(version=VERSION), formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION) +parser.add_argument('-i', '--infile', nargs='?', help='json generated by tshark -T json -x\nor by tshark -T jsonraw (not preserving frame timestamps).\nIf no inpout file is specified script reads from stdin.') +parser.add_argument('-o', '--outfile', required=True, help='output pcap filename') parser.add_argument('-p', '--python', help='generate python payload instead of pcap (only 1st packet)', default=False, action='store_true') +parser.add_argument('-m', '--mask', help='mask the specific raw field (e.g. 
-m "ip.src_raw" -m "ip.dst_raw[2:6]")', action='append', metavar='MASKED_FIELD') +parser.add_argument('-a', '--anonymize', help='anonymize the specific raw field (e.g. -a "ip.src_raw[2:]" -a "ip.dst_raw[:-2]")', action='append', metavar='ANONYMIZED_FIELD') +parser.add_argument('-s', '--salt', help='salt use for anonymization. If no value is provided it is randomized.', default=None) +parser.add_argument('-v', '--verbose', help='verbose output', default=False, action='store_true') args = parser.parse_args() # read JSON -infile = args.infile[0] +infile = args.infile +outfile = args.outfile -with open(infile) as data_file: - #json = json.load(data_file, object_pairs_hook=OrderedDict) - json = json.load(data_file, object_pairs_hook=parse_object_pairs) +# Read from input file +if infile: + data_file = open(infile) +# Read from pipe +else: + data_file = sys.stdin + +# Parse anonymization fields +anonymize = {} +if args.mask: + for m in args.mask: + if not '_raw' in m: + print("Error: The specified fields by -m switch should be raw fields. " + m + " does not have _raw suffix") + sys.exit() + af = AnonymizedField(m, 0) + anonymize[af.field] = af +if args.anonymize: + for a in args.anonymize: + if not '_raw' in a: + print("Error: The specified fields by -a switch should be raw fields. 
" + a + " does not have _raw suffix") + sys.exit() + af = AnonymizedField(a, 1) + anonymize[af.field] = af input_frame_raw = '' frame_raw = '' +frame_time = None + +salt = args.salt +if salt is None: + # generate random salt if no salt was provided + salt = ''.join(random.SystemRandom().choice(string.ascii_letters + string.digits) for _ in range(10)) # Generate pcap if args.python == False: - # open TMP file used by text2pcap - file = infile + '.tmp' - f = open(file, 'w') + pcap_out = scapy.PcapWriter(outfile, append=False, sync=False) # Iterate over packets in JSON - for packet in json: + for packet in ijson.items(data_file, "item", buf_size=200000): _list = [] linux_cooked_header = False; # get flat raw fields into _list for raw in raw_flat_collector(packet['_source']['layers']): - if (raw[0] == "frame_raw"): - frame_raw = raw[1][0] - input_frame_raw = copy.copy(frame_raw) - else: - _list.append(raw[1]) - if (raw[0] == "sll_raw"): - linux_cooked_header = True + if len(raw) >= 2: + if (raw[0] == "frame_raw"): + frame_raw = raw[1][0] + frame_amask = "0"*len(frame_raw) # initialize anonymization mask + input_frame_raw = copy.copy(frame_raw) + frame_time = None + if 'frame.time_epoch' in packet['_source']['layers']['frame']: + frame_time = packet['_source']['layers']['frame']['frame.time_epoch'] + else: + # add into value list into raw[5] the field name + if isinstance(raw[1], list): + raw[1].append(raw[0]) + _list.append(raw[1]) + if (raw[0] == "sll_raw"): + linux_cooked_header = True # sort _list sorted_list = sorted(_list, key=operator.itemgetter(1), reverse=False) @@ -459,25 +610,47 @@ if args.python == False: # rewrite frame for raw in sorted_list: - h = str(raw[0]) # hex - p = raw[1] * 2 # position - l = raw[2] * 2 # length - b = raw[3] # bitmask - t = raw[4] # type - - if (isinstance(p, (list, tuple)) or isinstance(l, (list, tuple))): - for r in raw: - _h = str(r[0]) # hex - _p = r[1] * 2 # position - _l = r[2] * 2 # length - _b = r[3] # bitmask - _t = r[4] # 
type + if len(raw) >= 6: + h = str(raw[0]) # hex + p = raw[1] * 2 # position + l = raw[2] * 2 # length + b = raw[3] # bitmask + t = raw[4] # type + # raw[5] # field_name (added by script) + h_mask = h # hex for anonymization mask + + # anonymize fields + if (raw[5] in anonymize): + [h, h_mask] = anonymize[raw[5]].anonymize_field(h, t, salt) + + if (isinstance(p, (list, tuple)) or isinstance(l, (list, tuple))): + for r in raw: + _h = str(r[0]) # hex + _p = r[1] * 2 # position + _l = r[2] * 2 # length + _b = r[3] # bitmask + _t = r[4] # type + # raw[5] # field_name (added by script) + _h_mask = _h # hex for anonymization mask + + # anonymize fields + if (raw[5] in anonymize): + [_h, _h_mask] = anonymize[raw[5]].anonymize_field(_h, _t, salt) + + # print("Debug: " + str(raw)) + frame_raw = rewrite_frame(frame_raw, _h, _p, _l, _b, _t, frame_amask) + + # update anonymization mask + if (raw[5] in anonymize): + frame_amask = rewrite_frame(frame_amask, _h_mask, _p, _l, _b, _t) + + else: # print("Debug: " + str(raw)) - frame_raw = rewrite_frame(frame_raw, _h, _p, _l, _b, _t) + frame_raw = rewrite_frame(frame_raw, h, p, l, b, t, frame_amask) - else: - # print("Debug: " + str(raw)) - frame_raw = rewrite_frame(frame_raw, h, p, l, b, t) + # update anonymization mask + if (raw[5] in anonymize): + frame_amask = rewrite_frame(frame_amask, h_mask, p, l, b, t) # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame using text2pcap if (linux_cooked_header): @@ -485,28 +658,28 @@ if args.python == False: frame_raw = frame_raw[:12 * 2] + "" + frame_raw[14 * 2:] # remove two bytes before Protocol # Testing: remove comment to compare input and output for not modified json - if (input_frame_raw != frame_raw): + if (args.verbose and input_frame_raw != frame_raw): print("Modified frames: ") s1 = input_frame_raw s2 = frame_raw print(s1) print(s2) if (len(s1) == len(s2)): - d = [i for i in xrange(len(s1)) if s1[i] != s2[i]] + d = [i for i in range(len(s1)) 
if s1[i] != s2[i]] print(d) - hex_to_txt(frame_raw, file) - - f.close() - to_pcap_file(infile + '.tmp', sys.argv[1] + '.pcap') - os.remove(infile + '.tmp') + new_packet = scapy.Packet(bytearray.fromhex(frame_raw)) + if frame_time: + new_packet.time = float(frame_time) + pcap_out.write(new_packet) # Generate python payload only for first packet else: - file = infile + '.py' - f = open(file, 'w') + py_outfile = outfile + '.py' + f = open(py_outfile, 'w') - for packet in json: + #for packet in json: + for packet in ijson.items(data_file, "item", buf_size=200000): f.write(py_header) r = OrderedDict({}) @@ -514,7 +687,7 @@ else: #print "packet = " + str(packet['_source']['layers']) py_generator(packet['_source']['layers'], r) - for key, value in r.iteritems() : + for key, value in r.items() : f.write(" d['" + key + "'] =",) f.write(" " + str(value) + "\n") @@ -523,6 +696,6 @@ else: # Currently only first packet is used from pcap f.close - print("Generated " + infile + '.py') + print("Generated " + py_outfile) break |