aboutsummaryrefslogtreecommitdiffstats
path: root/tools
diff options
context:
space:
mode:
authorMartin Kacer <kacer.martin@gmail.com>2020-06-03 07:34:43 +0200
committerDario Lombardo <lomato@gmail.com>2020-07-16 14:50:09 +0000
commit9b5f07d8295d1857d2039f6b868e1848ec13bc60 (patch)
treeb5407e54c250ce60a27552982698f75dd1709854 /tools
parent3dedaf80648c00a5c1e7b6498d553f793600cc23 (diff)
json2pcap: Added pcap masking and anonymization support
The script includes the following changes: - Added pcap masking and anonymization support - Support to mask/anonymize only portion of field - Added reading from stdin - Changed json to ijson library to support large files - Migrated from text2pcap to scapy for pcap generation - Added version to script The development repo is located here https://github.com/H21lab/json2pcap Change-Id: I8fc5e282caa604e188f05818f7a2f8875afb8b73 Reviewed-on: https://code.wireshark.org/review/37371 Reviewed-by: Dario Lombardo <lomato@gmail.com>
Diffstat (limited to 'tools')
-rw-r--r--tools/json2pcap/json2pcap.py367
1 files changed, 270 insertions, 97 deletions
diff --git a/tools/json2pcap/json2pcap.py b/tools/json2pcap/json2pcap.py
index 0f9cebdc0b..3c551be1a2 100644
--- a/tools/json2pcap/json2pcap.py
+++ b/tools/json2pcap/json2pcap.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
#
-# Copyright 2017, Martin Kacer <kacer.martin[AT]gmail.com>
+# Copyright 2020, Martin Kacer <kacer.martin[AT]gmail.com> and contributors
#
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
@@ -11,7 +11,7 @@
# SPDX-License-Identifier: GPL-2.0-or-later
import sys
-import json
+import ijson
import operator
import copy
import os
@@ -19,7 +19,89 @@ import binascii
import array
import argparse
import subprocess
+import string
+import random
+import math
+import hashlib
+import re
from collections import OrderedDict
+from scapy import all as scapy
+
+try:
+ # Python 2 forward compatibility
+ range = xrange
+except NameError:
+ pass
+
+# Field anonymization class
+class AnonymizedField:
+ '''
+ The Anonymization field object specifying anonymization
+ :filed arg: field name
+ :type arg: anonymization type [0 masking 0xff, 1 anonymization shake_256]
+ :start arg: If specified, the anonymization starts at given byte number
+ :end arg: If specified, the anonymization ends at given byte number
+ '''
+ def __init__(self, field, type):
+ self.field = field
+ self.type = type
+ self.start = None
+ self.end = None
+
+ match = re.search(r'(\S+)\[(-?\d+)?:(-?\d+)?\]', field)
+ if match:
+ self.field = match.group(1)
+ self.start = match.group(2)
+ if self.start is not None:
+ self.start = int(self.start)
+ self.end = match.group(3)
+ if self.end is not None:
+ self.end = int(self.end)
+
+ # Returns the new field value after anonymization
+ def anonymize_field_shake256(self, field, type, salt):
+ shake = hashlib.shake_256(str(field + ':' + salt).encode('utf-8'))
+
+ # String type, output should be ASCII
+ if type in [26, 27, 28]:
+ length = math.ceil(len(field)/4)
+ shake_hash = shake.hexdigest(length)
+ ret_string = array.array('B', str.encode(shake_hash))
+ ret_string = ''.join('{:02x}'.format(x) for x in ret_string)
+ # Other types, output could be HEX
+ else:
+ length = math.ceil(len(field)/2)
+ shake_hash = shake.hexdigest(length)
+ ret_string = shake_hash
+
+ # Correct the string length
+ if (len(ret_string) < len(field)):
+ ret_string = ret_string.ljust(len(field))
+ if (len(ret_string) > len(field)):
+ ret_string = ret_string[:len(field)]
+
+ return ret_string
+
+ def anonymize_field(self, _h, _t, salt):
+ s = 0
+ e = None
+ if self.start:
+ s = self.start
+ if self.end:
+ e = self.end
+ if e < 0:
+ e = len(_h) + e
+ else:
+ e = len(_h)
+ h = _h[s:e]
+ if self.type == 0:
+ h = 'f' * len(h)
+ elif self.type == 1:
+ h = self.anonymize_field_shake256(h, _t, salt)
+
+ h_mask = '0' * len(_h[0:s]) + 'f' * len(h) + '0' * len(_h[e:])
+ h = _h[0:s] + h + _h[e:]
+ return [h, h_mask]
def make_unique(key, dct):
counter = 0
@@ -69,7 +151,7 @@ py_header = """#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File generated by json2pcap.py
-# json2pcap.py created by Martin Kacer, 2017
+# json2pcap.py created by Martin Kacer, 2020
import os
import binascii
@@ -77,6 +159,13 @@ import array
import sys
import subprocess
from collections import OrderedDict
+from scapy import all as scapy
+
+try:
+ # Python 2 forward compatibility
+ range = xrange
+except NameError:
+ pass
# *****************************************************
# * PACKET PAYLOAD GENERATED FROM INPUT PCAP *
@@ -94,10 +183,9 @@ py_footer = """ generate_pcap(d)
# *****************************************************
"""
-py_footer = py_footer + read_py_function("to_pcap_file")
-py_footer = py_footer + read_py_function("hex_to_txt")
py_footer = py_footer + read_py_function("to_bytes")
py_footer = py_footer + read_py_function("lsb")
+py_footer = py_footer + read_py_function("multiply_strings")
py_footer = py_footer + read_py_function("rewrite_frame")
py_footer = py_footer + read_py_function("assemble_frame")
py_footer = py_footer + read_py_function("generate_pcap")
@@ -116,25 +204,6 @@ if __name__ == '__main__':
#
# ********** FUNCTIONS ***********
#
-def to_pcap_file(filename, output_pcap_file):
- subprocess.call(["text2pcap", filename, output_pcap_file])
-
-def hex_to_txt(hexstring, output_file):
- h = hexstring.lower()
-
- file = open(output_file, 'a')
-
- for i in range(0, len(h), 2):
- if(i % 32 == 0):
- file.write(format(i / 2, '06x') + ' ')
-
- file.write(h[i:i + 2] + ' ')
-
- if(i % 32 == 30):
- file.write('\n')
-
- file.write('\n')
- file.close()
def raw_flat_collector(dict):
if hasattr(dict, 'items'):
@@ -245,31 +314,46 @@ def py_generator(d, r, frame_name='frame_raw', frame_position=0):
for _v in v:
py_generator(_v, r, frame_name, frame_position)
-
-
-
# To emulate Python 3.2
def to_bytes(n, length, endianess='big'):
h = '%x' % n
- s = ('0' * (len(h) % 2) + h).zfill(length * 2).decode('hex')
+ s = bytearray.fromhex(('0' * (len(h) % 2) + h).zfill(length * 2))
return s if endianess == 'big' else s[::-1]
# Returns the index, counting from 0, of the least significant set bit in x
def lsb(x):
return (x & -x).bit_length() - 1
+# Replace parts of original_string by new_string, only if mask in the byte is not ff
+def multiply_strings(original_string, new_string, mask):
+
+ ret_string = new_string
+ if mask == None:
+ return ret_string
+ for i in range(0, min(len(original_string), len(new_string), len(mask)), 2):
+ if mask[i:i + 2] == 'ff':
+ #print("ff")
+ ret_string = ret_string[:i] + original_string[i:i + 2] + ret_string[i + 2:]
+
+ return ret_string
+
# Rewrite frame
# h - hex bytes
# p - position
# l - length
# b - bitmask
# t - type
-def rewrite_frame(frame_raw, h, p, l, b, t):
+# frame_amask - optional, anonymization mask (00 - not anonymized byte, ff - anonymized byte)
+def rewrite_frame(frame_raw, h, p, l, b, t, frame_amask = None):
+ if p < 0 or l < 0 or h is None:
+ return frame_raw
+
# no bitmask
if(b == 0):
if (len(h) != l):
l = len(h)
- return frame_raw[:p] + h + frame_raw[p + l:]
+ frame_raw_new = frame_raw[:p] + h + frame_raw[p + l:]
+ return multiply_strings(frame_raw, frame_raw_new, frame_amask)
# bitmask
else:
# get hex string from frame which will be replaced
@@ -283,7 +367,7 @@ def rewrite_frame(frame_raw, h, p, l, b, t):
# Only replace bits defined by mask
# new_hex = (old_hex & !mask) | (new_hex & mask)
- _H = _h.decode("hex")
+ _H = bytearray.fromhex(_h)
_H = array.array('B', _H)
M = to_bytes(b, len(_H))
@@ -291,11 +375,11 @@ def rewrite_frame(frame_raw, h, p, l, b, t):
# shift mask aligned to position
for i in range(len(M)):
if (i + p / 2) < len(M):
- M[i] = M[i + p / 2]
+ M[i] = M[i + int(p / 2)]
else:
M[i] = 0x00
- H = h.decode("hex")
+ H = bytearray.fromhex(h)
H = array.array('B', H)
# for i in range(len(_H)):
@@ -319,17 +403,20 @@ def rewrite_frame(frame_raw, h, p, l, b, t):
# print
masked_h = binascii.hexlify(_H)
+ masked_h = masked_h.decode('ascii')
- return frame_raw[:p] + masked_h + frame_raw[p + l:]
+ frame_raw_new = frame_raw[:p] + str(masked_h) + frame_raw[p + l:]
+ return multiply_strings(frame_raw, frame_raw_new, frame_amask)
-def assemble_frame(d):
+def assemble_frame(d, frame_time):
input = d['frame_raw'][1]
isFlat = False
linux_cooked_header = False;
while(isFlat == False):
isFlat = True
- for key, val in d.items():
+ _d = d.copy()
+ for key, val in _d.items():
h = str(val[1]) # hex
p = val[2] * 2 # position
l = val[3] * 2 # length
@@ -353,7 +440,7 @@ def assemble_frame(d):
output = d['frame_raw'][1]
- # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame using text2pcap
+ # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame
if (linux_cooked_header):
output = "000000000000" + output[6*2:] # replce dest MAC
output = output[:12*2] + "" + output[14*2:] # remove two bytes before Protocol
@@ -363,10 +450,9 @@ def assemble_frame(d):
def generate_pcap(d):
# 1. Assemble frame
input = d['frame_raw'][1]
- output = assemble_frame(d)
+ output = assemble_frame(d, None)
print(input)
print(output)
-
# 2. Testing: compare input and output for not modified json
if (input != output):
print("Modified frames: ")
@@ -375,24 +461,23 @@ def generate_pcap(d):
print(s1)
print(s2)
if (len(s1) == len(s2)):
- d = [i for i in xrange(len(s1)) if s1[i] != s2[i]]
+ d = [i for i in range(len(s1)) if s1[i] != s2[i]]
print(d)
-
- # 3. Open TMP file used by text2pcap
- file = sys.argv[0] + '.tmp'
- f = open(file,'w')
- hex_to_txt(output, file)
- f.close()
-
- # 4. Generate pcap
- to_pcap_file(sys.argv[0] + '.tmp', sys.argv[0] + '.pcap')
- print("Generated " + sys.argv[0] + ".tmp")
- print("Generated " + sys.argv[0] + ".pcap")
+ # 3. Generate pcap
+ outfile = sys.argv[0] + ".pcap"
+ pcap_out = scapy.PcapWriter(outfile, append=False, sync=False)
+ new_packet = scapy.Packet(bytearray.fromhex(output))
+ pcap_out.write(new_packet)
+ print("Generated " + outfile)
#
# ************ MAIN **************
#
+VERSION = "1.1"
+
parser = argparse.ArgumentParser(description="""
+json2pcap {version}
+
Utility to generate pcap from json format.
Packet modification:
@@ -416,41 +501,107 @@ encode the packet variables. The assembling algorithm is different, because
the decoded packet fields are relative and points to parent node with their
position (compared to input json which has absolute positions).
-""", formatter_class=argparse.RawTextHelpFormatter)
-parser.add_argument('infile', nargs='+', help='json generated by tshark -T jsonraw or by tshark -T json -x')
+Pcap masking and anonymization with -m and -a switch:
+The script allows to mask or anonymize the selected json raw fields. If the
+fields are selected and located on lower protocol layers, they are not
+overwritten by upper fields which are not marked by these switches.
+The pcap masking and anonymization can be performed in the following way:
+
+tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw"
+-a "ip.dst_raw" -o anonymized.pcap
+In this example the ip.src_raw field is masked with ffffffff by byte values
+and ip.dst_raw is hashed by randomly generated salt.
+
+Additionally the following syntax is valid to anonymize portion of field
+tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw[2:]"
+-a "ip.dst_raw[:-2]" -o anonymized.pcap
+Where the src_ip first byte is preserved and dst_ip last byte is preserved.
+And the same can be achieved by
+tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw[2:8]"
+-a "ip.dst_raw[0:6]" -o anonymized.pcap
+
+Masking and anonymization limitations are mainly the following:
+- In case the tshark is performing reassembling from multiple frames, the
+backward pcap reconstruction is not properly performed and can result in
+malformed frames.
+- The new values in the fields could violate the field format, as the
+json2pcap is not performing correct protocol encoding with respect to
+allowed values of the target field and field encoding.
+
+""".format(version=VERSION), formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
+parser.add_argument('-i', '--infile', nargs='?', help='json generated by tshark -T json -x\nor by tshark -T jsonraw (not preserving frame timestamps).\nIf no input file is specified script reads from stdin.')
+parser.add_argument('-o', '--outfile', required=True, help='output pcap filename')
parser.add_argument('-p', '--python', help='generate python payload instead of pcap (only 1st packet)', default=False, action='store_true')
+parser.add_argument('-m', '--mask', help='mask the specific raw field (e.g. -m "ip.src_raw" -m "ip.dst_raw[2:6]")', action='append', metavar='MASKED_FIELD')
+parser.add_argument('-a', '--anonymize', help='anonymize the specific raw field (e.g. -a "ip.src_raw[2:]" -a "ip.dst_raw[:-2]")', action='append', metavar='ANONYMIZED_FIELD')
+parser.add_argument('-s', '--salt', help='salt used for anonymization. If no value is provided it is randomized.', default=None)
+parser.add_argument('-v', '--verbose', help='verbose output', default=False, action='store_true')
args = parser.parse_args()
# read JSON
-infile = args.infile[0]
+infile = args.infile
+outfile = args.outfile
-with open(infile) as data_file:
- #json = json.load(data_file, object_pairs_hook=OrderedDict)
- json = json.load(data_file, object_pairs_hook=parse_object_pairs)
+# Read from input file
+if infile:
+ data_file = open(infile)
+# Read from pipe
+else:
+ data_file = sys.stdin
+
+# Parse anonymization fields
+anonymize = {}
+if args.mask:
+ for m in args.mask:
+ if not '_raw' in m:
+ print("Error: The specified fields by -m switch should be raw fields. " + m + " does not have _raw suffix")
+ sys.exit()
+ af = AnonymizedField(m, 0)
+ anonymize[af.field] = af
+if args.anonymize:
+ for a in args.anonymize:
+ if not '_raw' in a:
+ print("Error: The specified fields by -a switch should be raw fields. " + a + " does not have _raw suffix")
+ sys.exit()
+ af = AnonymizedField(a, 1)
+ anonymize[af.field] = af
input_frame_raw = ''
frame_raw = ''
+frame_time = None
+
+salt = args.salt
+if salt is None:
+ # generate random salt if no salt was provided
+ salt = ''.join(random.SystemRandom().choice(string.ascii_letters + string.digits) for _ in range(10))
# Generate pcap
if args.python == False:
- # open TMP file used by text2pcap
- file = infile + '.tmp'
- f = open(file, 'w')
+ pcap_out = scapy.PcapWriter(outfile, append=False, sync=False)
# Iterate over packets in JSON
- for packet in json:
+ for packet in ijson.items(data_file, "item", buf_size=200000):
_list = []
linux_cooked_header = False;
# get flat raw fields into _list
for raw in raw_flat_collector(packet['_source']['layers']):
- if (raw[0] == "frame_raw"):
- frame_raw = raw[1][0]
- input_frame_raw = copy.copy(frame_raw)
- else:
- _list.append(raw[1])
- if (raw[0] == "sll_raw"):
- linux_cooked_header = True
+ if len(raw) >= 2:
+ if (raw[0] == "frame_raw"):
+ frame_raw = raw[1][0]
+ frame_amask = "0"*len(frame_raw) # initialize anonymization mask
+ input_frame_raw = copy.copy(frame_raw)
+ frame_time = None
+ if 'frame.time_epoch' in packet['_source']['layers']['frame']:
+ frame_time = packet['_source']['layers']['frame']['frame.time_epoch']
+ else:
+ # add into value list into raw[5] the field name
+ if isinstance(raw[1], list):
+ raw[1].append(raw[0])
+ _list.append(raw[1])
+ if (raw[0] == "sll_raw"):
+ linux_cooked_header = True
# sort _list
sorted_list = sorted(_list, key=operator.itemgetter(1), reverse=False)
@@ -459,25 +610,47 @@ if args.python == False:
# rewrite frame
for raw in sorted_list:
- h = str(raw[0]) # hex
- p = raw[1] * 2 # position
- l = raw[2] * 2 # length
- b = raw[3] # bitmask
- t = raw[4] # type
-
- if (isinstance(p, (list, tuple)) or isinstance(l, (list, tuple))):
- for r in raw:
- _h = str(r[0]) # hex
- _p = r[1] * 2 # position
- _l = r[2] * 2 # length
- _b = r[3] # bitmask
- _t = r[4] # type
+ if len(raw) >= 6:
+ h = str(raw[0]) # hex
+ p = raw[1] * 2 # position
+ l = raw[2] * 2 # length
+ b = raw[3] # bitmask
+ t = raw[4] # type
+ # raw[5] # field_name (added by script)
+ h_mask = h # hex for anonymization mask
+
+ # anonymize fields
+ if (raw[5] in anonymize):
+ [h, h_mask] = anonymize[raw[5]].anonymize_field(h, t, salt)
+
+ if (isinstance(p, (list, tuple)) or isinstance(l, (list, tuple))):
+ for r in raw:
+ _h = str(r[0]) # hex
+ _p = r[1] * 2 # position
+ _l = r[2] * 2 # length
+ _b = r[3] # bitmask
+ _t = r[4] # type
+ # raw[5] # field_name (added by script)
+ _h_mask = _h # hex for anonymization mask
+
+ # anonymize fields
+ if (raw[5] in anonymize):
+ [_h, _h_mask] = anonymize[raw[5]].anonymize_field(_h, _t, salt)
+
+ # print("Debug: " + str(raw))
+ frame_raw = rewrite_frame(frame_raw, _h, _p, _l, _b, _t, frame_amask)
+
+ # update anonymization mask
+ if (raw[5] in anonymize):
+ frame_amask = rewrite_frame(frame_amask, _h_mask, _p, _l, _b, _t)
+
+ else:
# print("Debug: " + str(raw))
- frame_raw = rewrite_frame(frame_raw, _h, _p, _l, _b, _t)
+ frame_raw = rewrite_frame(frame_raw, h, p, l, b, t, frame_amask)
- else:
- # print("Debug: " + str(raw))
- frame_raw = rewrite_frame(frame_raw, h, p, l, b, t)
+ # update anonymization mask
+ if (raw[5] in anonymize):
+ frame_amask = rewrite_frame(frame_amask, h_mask, p, l, b, t)
# for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame using text2pcap
if (linux_cooked_header):
@@ -485,28 +658,28 @@ if args.python == False:
frame_raw = frame_raw[:12 * 2] + "" + frame_raw[14 * 2:] # remove two bytes before Protocol
# Testing: remove comment to compare input and output for not modified json
- if (input_frame_raw != frame_raw):
+ if (args.verbose and input_frame_raw != frame_raw):
print("Modified frames: ")
s1 = input_frame_raw
s2 = frame_raw
print(s1)
print(s2)
if (len(s1) == len(s2)):
- d = [i for i in xrange(len(s1)) if s1[i] != s2[i]]
+ d = [i for i in range(len(s1)) if s1[i] != s2[i]]
print(d)
- hex_to_txt(frame_raw, file)
-
- f.close()
- to_pcap_file(infile + '.tmp', sys.argv[1] + '.pcap')
- os.remove(infile + '.tmp')
+ new_packet = scapy.Packet(bytearray.fromhex(frame_raw))
+ if frame_time:
+ new_packet.time = float(frame_time)
+ pcap_out.write(new_packet)
# Generate python payload only for first packet
else:
- file = infile + '.py'
- f = open(file, 'w')
+ py_outfile = outfile + '.py'
+ f = open(py_outfile, 'w')
- for packet in json:
+ #for packet in json:
+ for packet in ijson.items(data_file, "item", buf_size=200000):
f.write(py_header)
r = OrderedDict({})
@@ -514,7 +687,7 @@ else:
#print "packet = " + str(packet['_source']['layers'])
py_generator(packet['_source']['layers'], r)
- for key, value in r.iteritems() :
+ for key, value in r.items() :
f.write(" d['" + key + "'] =",)
f.write(" " + str(value) + "\n")
@@ -523,6 +696,6 @@ else:
# Currently only first packet is used from pcap
f.close
- print("Generated " + infile + '.py')
+ print("Generated " + py_outfile)
break