diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml
new file mode 100644
index 0000000..c3816b9
--- /dev/null
+++ b/.github/dependabot.yaml
@@ -0,0 +1,14 @@
+version: 2
+updates:
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "daily"
+    assignees:
+      - "Rameen-Mahmood"
+    versioning-strategy: "increase-if-necessary"
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "daily"
\ No newline at end of file
diff --git a/.github/workflows/ci-parse.yml b/.github/workflows/ci-parse.yml
new file mode 100644
index 0000000..be7e2a4
--- /dev/null
+++ b/.github/workflows/ci-parse.yml
@@ -0,0 +1,45 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - main
+      - add-ci
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.9'
+
+      - name: Cache Python packages
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Install tshark
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y tshark
+
+      - name: Run parse.py on sample data
+        run: python parse.py output.csv ./tests/sample_pcap_directory
\ No newline at end of file
diff --git a/parse.py b/parse.py
index 3e7cad2..384d6ff 100644
--- a/parse.py
+++ b/parse.py
@@ -21,11 +21,10 @@
 import sys
 import os
 import glob
-import pathlib
 import platform
 import shutil
-
-
+import shelve
+import socket
 
 if platform.system() == "Darwin":
     # Define the path to tshark within the Wireshark.app package
@@ -35,137 +34,96 @@
 else:
     sys.exit("This script requires *nix.")
 
+unresolvable_ips = []  # List to keep track of unresolvable IP addresses
+
 def main():
     # Parse the command line arguments
-    if len(sys.argv) != 3:
-        print("Usage: python parse.py <output_csv_file> <pcap_path>")
-        return
-
-    output_csv_file = sys.argv[1]
-    pcap_path = sys.argv[2]
-
-    # Check if the path is a directory or a single file
-    if os.path.isdir(pcap_path):
-        pcap_files = glob.glob(os.path.join(pcap_path, '*.pcap'))
-    elif os.path.isfile(pcap_path) and pcap_path.endswith('.pcap'):
-        pcap_files = [pcap_path]
-    else:
-        print(f"No valid pcap files found at the specified path: {pcap_path}")
-        return
-
-    if not pcap_files:
-        print("No pcap files found.")
-        return
-
-    # Process each pcap file and concatenate the resultant DataFrames
-    df_list = []
-    for pcap_file in pcap_files:
-        print(f"Parsing pcap file: {pcap_file}")
-        df = run_tshark(pcap_file)
-        if df is not None:
-            df_list.append(df)
-
-    if not df_list:
-        print("Failed to parse any pcap files.")
-        return
-
-    # Combine into a single DataFrame
-    combined_df = pd.concat(df_list).sort_values(by='frame.time_epoch')
-
-    # Maps IP addresses to hostnames
-    ip_hostname_dict = {}
-
-    # Extract all IP -> hostname mappings from SNI fields
-    sni_df = combined_df[
-        combined_df['tls.handshake.extensions_server_name'].notna()
-    ]
-    for (_, row) in sni_df.iterrows():
-        ip = row['ip.dst']
-        hostname = row['tls.handshake.extensions_server_name']
-        ip_hostname_dict[ip] = hostname
-
-    # Extract all IP -> hostname mappings from DNS fields
-    dns_df = combined_df[
-        combined_df['dns.qry.name'].notna() &
-        combined_df['dns.a'].notna()
-    ]
-    for (_, row) in dns_df.iterrows():
-        for ip in row['dns.a'].split(','):
-            hostname = row['dns.qry.name']
-            ip_hostname_dict[ip] = hostname
-
-    # Remove the SNI and DNS fields
-    del combined_df['tls.handshake.extensions_server_name']
-    del combined_df['dns.qry.name']
-    del combined_df['dns.a']
-
-    # Fill in the hostnames for each IP address
-    combined_df['src_hostname'] = combined_df['ip.src'].map(
-        lambda x: ip_hostname_dict.get(x, None)
-    )
-    combined_df['dst_hostname'] = combined_df['ip.dst'].map(
-        lambda x: ip_hostname_dict.get(x, None)
-    )
-
-    # Write the results to a CSV file
-    combined_df.to_csv(output_csv_file, index=False)
-
-
-def run_tshark(pcap_file):
+    ip_shelve_path = 'ip_hostname_db'
+    with shelve.open(ip_shelve_path) as ip_shelve:
+        if len(sys.argv) != 3:
+            print("Usage: python parse.py <output_csv_file> <pcap_path>")
+            return
+
+        output_csv_file = sys.argv[1]
+        pcap_path = sys.argv[2]
+
+        if os.path.isdir(pcap_path):
+            pcap_files = glob.glob(os.path.join(pcap_path, '*.pcap'))
+        elif os.path.isfile(pcap_path) and pcap_path.endswith('.pcap'):
+            pcap_files = [pcap_path]
+        else:
+            print(f"No valid pcap files found at the specified path: {pcap_path}")
+            return
+
+        if not pcap_files:
+            print("No pcap files found.")
+            return
+        # Process each pcap file and concatenate the resultant DataFrames
+        df_list = []
+        for pcap_file in pcap_files:
+            df = run_tshark(pcap_file, ip_shelve)
+            if df is not None:
+                df_list.append(df)
+
+        if not df_list:
+            print("Failed to parse any pcap files.")
+            return
+
+        combined_df = pd.concat(df_list).sort_values(by='frame.time_epoch')
+        combined_df.to_csv(output_csv_file, index=False)
+        print(f"Output file created: {output_csv_file}")
+        if unresolvable_ips:
+            print("Unresolvable IP addresses:", unresolvable_ips)
+
+def run_tshark(pcap_file, ip_shelve):
     """
     Run tshark on a pcap file and return the output as a Pandas DataFrame.
""" - - # Define the fields to extract - fields = [ - 'frame.time_epoch', - 'eth.src', 'eth.dst', - 'ip.src', 'ip.dst', - 'tcp.srcport', 'tcp.dstport', - 'udp.srcport', 'udp.dstport', - '_ws.col.Protocol', 'frame.len', - 'dns.qry.name', 'dns.a', - 'tls.handshake.extensions_server_name' - ] - - # Create the command to run tshark - command = [ - TSHARK_PATH, - '-r', pcap_file, - '-T', 'fields', - '-E', 'header=y', - '-E', 'separator=,', - '-E', 'quote=d', - '-E', 'occurrence=a', - '-2', - '-R', 'not tcp.analysis.retransmission' - ] - + command = [TSHARK_PATH, '-r', pcap_file, '-T', 'fields', '-E', 'header=y', '-E', 'separator=,', '-E', 'quote=d', '-E', 'occurrence=a', '-2', '-R', 'not tcp.analysis.retransmission'] + fields = ['frame.time_epoch', 'eth.src', 'eth.dst', 'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport', '_ws.col.Protocol', 'frame.len', 'dns.qry.name', 'dns.a', 'tls.handshake.extensions_server_name'] for field in fields: command += ['-e', field] - - # Run the tshark command and capture the output process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output, error = process.communicate() - if process.returncode != 0: print(f"Error running tshark on pcap file: {pcap_file}") print(error.decode()) return None - + # Decode the output and read it into a Pandas DataFrame output = output.decode() data = StringIO(output) df = pd.read_csv(data, low_memory=False) + update_ip_hostname_mappings(df, ip_shelve) + return df - # Make sure the ports are integers - port_columns = ['tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport'] - for column in port_columns: - if column in df: - df[column] = df[column].fillna(0).astype(int) +def update_ip_hostname_mappings(df, ip_shelve): + dns_df = df[df['dns.qry.name'].notna() & df['dns.a'].notna()] + for _, row in dns_df.iterrows(): + ips = row['dns.a'].split(',') + for ip in ips: + ip_shelve[ip] = row['dns.qry.name'] - return df + sni_df = df[df['tls.handshake.extensions_server_name'].notna()] + for _, row in sni_df.iterrows(): + ip_shelve[row['ip.dst']] = row['tls.handshake.extensions_server_name'] + df['src_hostname'] = df['ip.src'].map(lambda x: ip_shelve.get(x, reverse_dns(x) if x else 'No IP')) + df['dst_hostname'] = df['ip.dst'].map(lambda x: ip_shelve.get(x, reverse_dns(x) if x else 'No IP')) + df.drop(['dns.qry.name', 'dns.a', 'tls.handshake.extensions_server_name'], axis=1, inplace=True) + +def reverse_dns(ip_address): + """ + Attempts to resolve an IP address to a hostname using a reverse DNS lookup; + This function is used as a fallback mechanism in the event that an IP address does not have a corresponding hostname entry in the shelve database. + """ + if not ip_address: + return '' + try: + hostname = socket.gethostbyaddr(ip_address)[0] + return hostname + except (socket.herror, socket.gaierror): + return '' if __name__ == "__main__": - main() + main() \ No newline at end of file