From 263ebc20a3c19b6071ea387b23e5d46324f6a82a Mon Sep 17 00:00:00 2001 From: Rameen Mahmood Date: Fri, 15 Nov 2024 15:33:56 -0500 Subject: [PATCH 1/3] Add workflows and dependabot config --- .github/dependabot.yaml | 14 +++++++++++ .github/workflows/ci-parse.yml | 45 ++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 .github/dependabot.yaml create mode 100644 .github/workflows/ci-parse.yml diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml new file mode 100644 index 0000000..c3816b9 --- /dev/null +++ b/.github/dependabot.yaml @@ -0,0 +1,14 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" + assignees: + - "Rameen-Mahmood" + versioning-strategy: "increase-if-necessary" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" \ No newline at end of file diff --git a/.github/workflows/ci-parse.yml b/.github/workflows/ci-parse.yml new file mode 100644 index 0000000..be7e2a4 --- /dev/null +++ b/.github/workflows/ci-parse.yml @@ -0,0 +1,45 @@ +name: CI + +on: + push: + branches: + - main + - add-ci + pull_request: + branches: + - main + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Check out the repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Cache Python packages + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Install tshark + run: | + sudo apt-get update + sudo apt-get install -y tshark + + - name: Run parse.py on sample data + run: python parse.py output.csv ./tests/sample_pcap_directory \ No newline at end of file From 177ab16f38d68e60ee107f4ba7af3d8938014e5e Mon Sep 
17 00:00:00 2001 From: Rameen Mahmood Date: Fri, 15 Nov 2024 16:12:38 -0500 Subject: [PATCH 2/3] Improve hostname resolution --- parse.py | 207 +++++++++++++++++++------------------------------------ 1 file changed, 69 insertions(+), 138 deletions(-) diff --git a/parse.py b/parse.py index 3e7cad2..d7c80e9 100644 --- a/parse.py +++ b/parse.py @@ -1,171 +1,102 @@ -""" -Parses pcap files specified by the user, either as individual files or all files in a directory, -fills in the hostnames, and outputs the results to a csv file. - -Usage: - python parse.py - -Examples: - python parse.py output.csv /path/to/single.pcap - python parse.py output.csv /path/to/pcap_files - -This script uses tshark to parse the pcap files, and verifies that tshark is installed. This script works for *nix. - - -TODO: - - Add support for dealing with ARP spoofing (e.g., as a result of output from IoT Inspector.) -""" import subprocess import pandas as pd from io import StringIO import sys import os import glob -import pathlib import platform import shutil - - +import shelve +import socket if platform.system() == "Darwin": - # Define the path to tshark within the Wireshark.app package TSHARK_PATH = "/Applications/Wireshark.app/Contents/MacOS/tshark" elif os.name == "posix": assert (TSHARK_PATH := shutil.which("tshark", os.X_OK)), "couldn't find tshark" else: sys.exit("This script requires *nix.") -def main(): - # Parse the command line arguments - if len(sys.argv) != 3: - print("Usage: python parse.py ") - return - - output_csv_file = sys.argv[1] - pcap_path = sys.argv[2] - - # Check if the path is a directory or a single file - if os.path.isdir(pcap_path): - pcap_files = glob.glob(os.path.join(pcap_path, '*.pcap')) - elif os.path.isfile(pcap_path) and pcap_path.endswith('.pcap'): - pcap_files = [pcap_path] - else: - print(f"No valid pcap files found at the specified path: {pcap_path}") - return - - if not pcap_files: - print("No pcap files found.") - return - - # Process each pcap file and 
concatenate the resultant DataFrames - df_list = [] - for pcap_file in pcap_files: - print(f"Parsing pcap file: {pcap_file}") - df = run_tshark(pcap_file) - if df is not None: - df_list.append(df) - - if not df_list: - print("Failed to parse any pcap files.") - return - - # Combine into a single DataFrame - combined_df = pd.concat(df_list).sort_values(by='frame.time_epoch') - - # Maps IP addresses to hostnames - ip_hostname_dict = {} - - # Extract all IP -> hostname mappings from SNI fields - sni_df = combined_df[ - combined_df['tls.handshake.extensions_server_name'].notna() - ] - for (_, row) in sni_df.iterrows(): - ip = row['ip.dst'] - hostname = row['tls.handshake.extensions_server_name'] - ip_hostname_dict[ip] = hostname - - # Extract all IP -> hostname mappings from DNS fields - dns_df = combined_df[ - combined_df['dns.qry.name'].notna() & - combined_df['dns.a'].notna() - ] - for (_, row) in dns_df.iterrows(): - for ip in row['dns.a'].split(','): - hostname = row['dns.qry.name'] - ip_hostname_dict[ip] = hostname - - # Remove the SNI and DNS fields - del combined_df['tls.handshake.extensions_server_name'] - del combined_df['dns.qry.name'] - del combined_df['dns.a'] - - # Fill in the hostnames for each IP address - combined_df['src_hostname'] = combined_df['ip.src'].map( - lambda x: ip_hostname_dict.get(x, None) - ) - combined_df['dst_hostname'] = combined_df['ip.dst'].map( - lambda x: ip_hostname_dict.get(x, None) - ) - - # Write the results to a CSV file - combined_df.to_csv(output_csv_file, index=False) - - -def run_tshark(pcap_file): - """ - Run tshark on a pcap file and return the output as a Pandas DataFrame. 
- """ - - # Define the fields to extract - fields = [ - 'frame.time_epoch', - 'eth.src', 'eth.dst', - 'ip.src', 'ip.dst', - 'tcp.srcport', 'tcp.dstport', - 'udp.srcport', 'udp.dstport', - '_ws.col.Protocol', 'frame.len', - 'dns.qry.name', 'dns.a', - 'tls.handshake.extensions_server_name' - ] - - # Create the command to run tshark - command = [ - TSHARK_PATH, - '-r', pcap_file, - '-T', 'fields', - '-E', 'header=y', - '-E', 'separator=,', - '-E', 'quote=d', - '-E', 'occurrence=a', - '-2', - '-R', 'not tcp.analysis.retransmission' - ] +unresolvable_ips = [] # List to keep track of unresolvable IP addresses +def main(): + ip_shelve_path = 'ip_hostname_db' + with shelve.open(ip_shelve_path) as ip_shelve: + if len(sys.argv) != 3: + print("Usage: python parse.py ") + return + + output_csv_file = sys.argv[1] + pcap_path = sys.argv[2] + + if os.path.isdir(pcap_path): + pcap_files = glob.glob(os.path.join(pcap_path, '*.pcap')) + elif os.path.isfile(pcap_path) and pcap_path.endswith('.pcap'): + pcap_files = [pcap_path] + else: + print(f"No valid pcap files found at the specified path: {pcap_path}") + return + + if not pcap_files: + print("No pcap files found.") + return + + df_list = [] + for pcap_file in pcap_files: + df = run_tshark(pcap_file, ip_shelve) + if df is not None: + df_list.append(df) + + if not df_list: + print("Failed to parse any pcap files.") + return + + combined_df = pd.concat(df_list).sort_values(by='frame.time_epoch') + combined_df.to_csv(output_csv_file, index=False) + print(f"Output file created: {output_csv_file}") + if unresolvable_ips: + print("Unresolvable IP addresses:", unresolvable_ips) + +def run_tshark(pcap_file, ip_shelve): + command = [TSHARK_PATH, '-r', pcap_file, '-T', 'fields', '-E', 'header=y', '-E', 'separator=,', '-E', 'quote=d', '-E', 'occurrence=a', '-2', '-R', 'not tcp.analysis.retransmission'] + fields = ['frame.time_epoch', 'eth.src', 'eth.dst', 'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport', 
'_ws.col.Protocol', 'frame.len', 'dns.qry.name', 'dns.a', 'tls.handshake.extensions_server_name'] for field in fields: command += ['-e', field] - - # Run the tshark command and capture the output process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output, error = process.communicate() - if process.returncode != 0: print(f"Error running tshark on pcap file: {pcap_file}") print(error.decode()) return None - # Decode the output and read it into a Pandas DataFrame output = output.decode() data = StringIO(output) df = pd.read_csv(data, low_memory=False) - - # Make sure the ports are integers - port_columns = ['tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport'] - for column in port_columns: - if column in df: - df[column] = df[column].fillna(0).astype(int) - + update_ip_hostname_mappings(df, ip_shelve) return df +def update_ip_hostname_mappings(df, ip_shelve): + dns_df = df[df['dns.qry.name'].notna() & df['dns.a'].notna()] + for _, row in dns_df.iterrows(): + ips = row['dns.a'].split(',') + for ip in ips: + ip_shelve[ip] = row['dns.qry.name'] + + sni_df = df[df['tls.handshake.extensions_server_name'].notna()] + for _, row in sni_df.iterrows(): + ip_shelve[row['ip.dst']] = row['tls.handshake.extensions_server_name'] + + df['src_hostname'] = df['ip.src'].map(lambda x: ip_shelve.get(x, reverse_dns(x) if x else 'No IP')) + df['dst_hostname'] = df['ip.dst'].map(lambda x: ip_shelve.get(x, reverse_dns(x) if x else 'No IP')) + df.drop(['dns.qry.name', 'dns.a', 'tls.handshake.extensions_server_name'], axis=1, inplace=True) + +def reverse_dns(ip_address): + if not ip_address: + return '' + try: + hostname = socket.gethostbyaddr(ip_address)[0] + return hostname + except (socket.herror, socket.gaierror): + return '' if __name__ == "__main__": - main() + main() \ No newline at end of file From 1f85054944bf551d3e2e62352e9075c0d664bc72 Mon Sep 17 00:00:00 2001 From: Rameen Mahmood Date: Fri, 15 Nov 2024 16:24:44 -0500 Subject: [PATCH 3/3] Add 
comments --- parse.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/parse.py b/parse.py index d7c80e9..384d6ff 100644 --- a/parse.py +++ b/parse.py @@ -1,3 +1,20 @@ +""" +Parses pcap files specified by the user, either as individual files or all files in a directory, +fills in the hostnames, and outputs the results to a csv file. + +Usage: + python parse.py <output_csv_file> <pcap_path> + +Examples: + python parse.py output.csv /path/to/single.pcap + python parse.py output.csv /path/to/pcap_files + +This script uses tshark to parse the pcap files, and verifies that tshark is installed. This script works for *nix. + + +TODO: + - Add support for dealing with ARP spoofing (e.g., as a result of output from IoT Inspector.) +""" import subprocess import pandas as pd from io import StringIO @@ -10,6 +27,7 @@ import socket if platform.system() == "Darwin": + # Define the path to tshark within the Wireshark.app package TSHARK_PATH = "/Applications/Wireshark.app/Contents/MacOS/tshark" elif os.name == "posix": assert (TSHARK_PATH := shutil.which("tshark", os.X_OK)), "couldn't find tshark" @@ -19,6 +37,7 @@ unresolvable_ips = [] # List to keep track of unresolvable IP addresses def main(): + # Parse the command line arguments ip_shelve_path = 'ip_hostname_db' with shelve.open(ip_shelve_path) as ip_shelve: if len(sys.argv) != 3: @@ -39,7 +58,7 @@ def main(): if not pcap_files: print("No pcap files found.") return - + # Process each pcap file and concatenate the resultant DataFrames df_list = [] for pcap_file in pcap_files: df = run_tshark(pcap_file, ip_shelve) @@ -57,6 +76,9 @@ def main(): print("Unresolvable IP addresses:", unresolvable_ips) def run_tshark(pcap_file, ip_shelve): + """ + Run tshark on a pcap file and return the output as a Pandas DataFrame. 
+ """ command = [TSHARK_PATH, '-r', pcap_file, '-T', 'fields', '-E', 'header=y', '-E', 'separator=,', '-E', 'quote=d', '-E', 'occurrence=a', '-2', '-R', 'not tcp.analysis.retransmission'] fields = ['frame.time_epoch', 'eth.src', 'eth.dst', 'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport', '_ws.col.Protocol', 'frame.len', 'dns.qry.name', 'dns.a', 'tls.handshake.extensions_server_name'] for field in fields: @@ -67,7 +89,8 @@ def run_tshark(pcap_file, ip_shelve): print(f"Error running tshark on pcap file: {pcap_file}") print(error.decode()) return None - + + # Decode the output and read it into a Pandas DataFrame output = output.decode() data = StringIO(output) df = pd.read_csv(data, low_memory=False) @@ -90,6 +113,10 @@ def update_ip_hostname_mappings(df, ip_shelve): df.drop(['dns.qry.name', 'dns.a', 'tls.handshake.extensions_server_name'], axis=1, inplace=True) def reverse_dns(ip_address): + """ + Attempts to resolve an IP address to a hostname using a reverse DNS lookup. + This function is used as a fallback mechanism in the event that an IP address does not have a corresponding hostname entry in the shelve database. + """ if not ip_address: return '' try: