Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement persistent IP-to-hostname mapping #15

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/dependabot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Dependabot configuration: daily update checks for Python packages and
# for the GitHub Actions used by the workflows in this repository.
version: 2
updates:
  # Python dependencies declared at the repository root.
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "daily"
    assignees:
      - "Rameen-Mahmood"
    versioning-strategy: "increase-if-necessary"

  # Action versions referenced from .github/workflows.
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "daily"
45 changes: 45 additions & 0 deletions .github/workflows/ci-parse.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Continuous integration: install the dependencies and tshark, then run
# parse.py against the sample pcap directory as a smoke test.
name: CI

on:
  push:
    branches:
      - main
      - add-ci
  pull_request:
    branches:
      - main
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Check out the repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      # Reuse pip's download cache between runs; the key changes whenever
      # any requirements.txt changes.
      - name: Cache Python packages
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # parse.py shells out to tshark, so it must be present on the runner.
      - name: Install tshark
        run: |
          sudo apt-get update
          sudo apt-get install -y tshark

      - name: Run parse.py on sample data
        run: python parse.py output.csv ./tests/sample_pcap_directory
188 changes: 73 additions & 115 deletions parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,10 @@
import sys
import os
import glob
import pathlib
import platform
import shutil


import shelve
import socket

if platform.system() == "Darwin":
# Define the path to tshark within the Wireshark.app package
Expand All @@ -35,137 +34,96 @@
else:
sys.exit("This script requires *nix.")

unresolvable_ips = [] # List to keep track of unresolvable IP addresses

def main():
    """
    Parse one pcap file, or every ``*.pcap`` file in a directory, into a CSV.

    Usage: ``python parse.py <output_csv_file> <path_to_pcap_file_or_directory>``

    Each pcap file is passed to ``run_tshark`` together with a persistent
    ``shelve`` database (``ip_hostname_db`` in the current working
    directory), so IP -> hostname mappings learned in one file or one run
    are available to later ones. The per-file DataFrames are concatenated,
    sorted by ``frame.time_epoch`` and written to the output CSV.

    Problems (wrong argument count, no pcap files, no file parsed) are
    reported on stdout and the function returns without writing anything.
    """
    # Parse the command line arguments
    if len(sys.argv) != 3:
        print("Usage: python parse.py <output_csv_file> <path_to_pcap_file_or_directory>")
        return

    output_csv_file = sys.argv[1]
    pcap_path = sys.argv[2]

    # Check if the path is a directory or a single file
    if os.path.isdir(pcap_path):
        # Sorted so files are processed in the same order on every run.
        pcap_files = sorted(glob.glob(os.path.join(pcap_path, '*.pcap')))
    elif os.path.isfile(pcap_path) and pcap_path.endswith('.pcap'):
        pcap_files = [pcap_path]
    else:
        print(f"No valid pcap files found at the specified path: {pcap_path}")
        return

    if not pcap_files:
        print("No pcap files found.")
        return

    # Process each pcap file and collect the resultant DataFrames. The
    # shelve is opened only after the input has been validated, so a bad
    # invocation does not create the database file as a side effect.
    df_list = []
    ip_shelve_path = 'ip_hostname_db'
    with shelve.open(ip_shelve_path) as ip_shelve:
        for pcap_file in pcap_files:
            print(f"Parsing pcap file: {pcap_file}")
            df = run_tshark(pcap_file, ip_shelve)
            if df is not None:
                df_list.append(df)

    if not df_list:
        print("Failed to parse any pcap files.")
        return

    # Combine into a single DataFrame and write the results to a CSV file
    combined_df = pd.concat(df_list).sort_values(by='frame.time_epoch')
    combined_df.to_csv(output_csv_file, index=False)
    print(f"Output file created: {output_csv_file}")
    if unresolvable_ips:
        print("Unresolvable IP addresses:", unresolvable_ips)

def run_tshark(pcap_file, ip_shelve):
    """
    Run tshark on a pcap file and return the output as a Pandas DataFrame.

    ``pcap_file`` is the path handed to ``tshark -r``. ``ip_shelve`` is the
    mapping of IP address -> hostname that is passed on to
    ``update_ip_hostname_mappings``, which records new mappings and adds
    the ``src_hostname`` / ``dst_hostname`` columns.

    Returns the DataFrame, or ``None`` (after printing tshark's stderr)
    when tshark exits with a non-zero status.
    """
    # Define the fields to extract
    fields = [
        'frame.time_epoch',
        'eth.src', 'eth.dst',
        'ip.src', 'ip.dst',
        'tcp.srcport', 'tcp.dstport',
        'udp.srcport', 'udp.dstport',
        '_ws.col.Protocol', 'frame.len',
        'dns.qry.name', 'dns.a',
        'tls.handshake.extensions_server_name',
    ]

    # Create the command to run tshark: CSV output with a header row,
    # filtered with 'not tcp.analysis.retransmission'
    command = [
        TSHARK_PATH,
        '-r', pcap_file,
        '-T', 'fields',
        '-E', 'header=y',
        '-E', 'separator=,',
        '-E', 'quote=d',
        '-E', 'occurrence=a',
        '-2',
        '-R', 'not tcp.analysis.retransmission',
    ]
    for field in fields:
        command += ['-e', field]

    # Run the tshark command and capture the output
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, error = process.communicate()

    if process.returncode != 0:
        print(f"Error running tshark on pcap file: {pcap_file}")
        print(error.decode(errors='replace'))
        return None

    # Decode the output and read it into a Pandas DataFrame
    df = pd.read_csv(StringIO(output.decode()), low_memory=False)

    # Make sure the ports are integers. Missing ports become 0, and so does
    # any cell that is not a single number, instead of raising.
    port_columns = ['tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport']
    for column in port_columns:
        if column in df:
            df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0).astype(int)

    update_ip_hostname_mappings(df, ip_shelve)
    return df
def update_ip_hostname_mappings(df, ip_shelve):
    """
    Record the IP -> hostname mappings found in ``df`` and label every row.

    ``df`` is modified in place: ``src_hostname`` and ``dst_hostname``
    columns are added, and the ``dns.qry.name``, ``dns.a`` and
    ``tls.handshake.extensions_server_name`` columns are dropped.

    ``ip_shelve`` is any mapping with string keys (the shelve opened by the
    caller). Mappings come from DNS answers first and then from TLS SNI, so
    an SNI name overrides a DNS name for the same address.

    An address with no stored mapping falls back to ``reverse_dns``. Rows
    without an address get ``'No IP'``. Addresses that cannot be resolved
    at all are appended to the module-level ``unresolvable_ips`` list.
    """
    # DNS answers: every address in the comma-separated 'dns.a' cell maps
    # to the queried name.
    dns_rows = df[df['dns.qry.name'].notna() & df['dns.a'].notna()]
    for hostname, answers in zip(dns_rows['dns.qry.name'], dns_rows['dns.a']):
        for ip in str(answers).split(','):
            ip = ip.strip()
            if ip:
                ip_shelve[ip] = hostname

    # TLS SNI: the destination address of the handshake maps to the server
    # name. Rows with no 'ip.dst' value are skipped because a missing
    # address (NaN) is not a usable key.
    sni_rows = df[df['tls.handshake.extensions_server_name'].notna()]
    for ip, hostname in zip(sni_rows['ip.dst'],
                            sni_rows['tls.handshake.extensions_server_name']):
        if isinstance(ip, str) and ip:
            ip_shelve[ip] = hostname

    # Resolve each distinct address once per call. The reverse lookup is a
    # blocking network call, so it runs only for addresses that have no
    # stored mapping.
    resolved = {}

    def lookup(ip):
        if not isinstance(ip, str) or not ip:
            return 'No IP'
        if ip not in resolved:
            hostname = ip_shelve.get(ip)
            if hostname is None:
                hostname = reverse_dns(ip)
                if not hostname and ip not in unresolvable_ips:
                    unresolvable_ips.append(ip)
            resolved[ip] = hostname
        return resolved[ip]

    df['src_hostname'] = df['ip.src'].map(lookup)
    df['dst_hostname'] = df['ip.dst'].map(lookup)
    df.drop(['dns.qry.name', 'dns.a', 'tls.handshake.extensions_server_name'], axis=1, inplace=True)

def reverse_dns(ip_address):
    """
    Attempts to resolve an IP address to a hostname using a reverse DNS lookup.

    This function is used as a fallback mechanism in the event that an IP
    address does not have a corresponding hostname entry in the shelve
    database.

    Returns the hostname, or an empty string when ``ip_address`` is missing,
    blank, not a string, or cannot be resolved.
    """
    # pandas represents a missing address as NaN, a truthy float, so a plain
    # truthiness check would let it through to gethostbyaddr.
    if not isinstance(ip_address, (str, bytes)):
        return ''
    ip_address = ip_address.strip()
    if not ip_address:
        return ''
    try:
        return socket.gethostbyaddr(ip_address)[0]
    except (OSError, UnicodeError):
        # OSError covers socket.herror, socket.gaierror and timeouts;
        # UnicodeError is raised for names that cannot be IDNA-encoded.
        return ''

# Run the parser once, and only when this file is executed as a script.
if __name__ == "__main__":
    main()
Loading