diff --git a/setup.py b/setup.py index 484c5f04..d563088c 100644 --- a/setup.py +++ b/setup.py @@ -93,6 +93,7 @@ def get_version(version_file): py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], include_package_data=True, install_requires=[ + "beautifulsoup4", "boto3 == 1.21.10", "botocore == 1.24.10", "chevron == 0.14.0", @@ -109,6 +110,7 @@ def get_version(version_file): "importlib_resources == 5.4.0", "matplotlib == 3.3.4", "mongo-db-from-config@http://github.com/cisagov/mongo-db-from-config/tarball/develop", + "nltk", "openpyxl", "pandas == 1.1.5", "psutil", @@ -125,6 +127,7 @@ def get_version(version_file): "schema == 0.7.5", "setuptools == 58.1.0", "shodan ==1.27.0", + "spacy", "sublist3r", "types-PyYAML == 6.0.4", "urllib3 == 1.26.7", diff --git a/src/adhoc/__init__.py b/src/adhoc/__init__.py new file mode 100644 index 00000000..ababb128 --- /dev/null +++ b/src/adhoc/__init__.py @@ -0,0 +1 @@ +"""Init file for module implementation.""" diff --git a/src/adhoc/data/__init__.py b/src/adhoc/data/__init__.py new file mode 100644 index 00000000..8c5bf486 --- /dev/null +++ b/src/adhoc/data/__init__.py @@ -0,0 +1 @@ +"""Adhoc data init.""" diff --git a/src/adhoc/data/config.py b/src/adhoc/data/config.py new file mode 100644 index 00000000..ae4dc5a7 --- /dev/null +++ b/src/adhoc/data/config.py @@ -0,0 +1,39 @@ +"""Configure database connection.""" +# Standard Python Libraries +from configparser import ConfigParser + + +def config(filename="/home/ubuntu/adhoc/data/database.ini", section="postgresql"): + """Configure postgres.""" + # create a parser + parser = ConfigParser() + # read config file + parser.read(filename) + + # get section, default to postgresql + db = {} + if parser.has_section(section): + params = parser.items(section) + for param in params: + db[param[0]] = param[1] + else: + raise Exception("Section {} not found in the {} file".format(section, filename)) + return db + + +def config2(filename="/home/ubuntu/adhoc/data/database.ini", section="crossfeedDB"): + """Configure Crossfeed.""" + # create a parser + parser = ConfigParser() + # read config file + parser.read(filename) + + # get section, default to postgresql + db = {} + if parser.has_section(section): + params = parser.items(section) + for param in params: + db[param[0]] = param[1] + else: + raise Exception("Section {} not found in the {} file".format(section, filename)) + return db diff --git a/src/adhoc/data/database.ini b/src/adhoc/data/database.ini new file mode 100644 index 00000000..55b95c17 --- /dev/null +++ b/src/adhoc/data/database.ini @@ -0,0 +1,13 @@ +[postgresql] +host= +database= +user= +password= +port= + +[crossfeedDB] +host= +database= +user= +password= +port= diff --git a/src/adhoc/data/run.py b/src/adhoc/data/run.py new file mode 100644 index 00000000..4dc7729a --- /dev/null +++ b/src/adhoc/data/run.py @@ -0,0 +1,335 @@ +"""Database queries.""" +# Standard Python Libraries +import sys + +# Third-Party Libraries +import pandas as pd +import psycopg2 +from psycopg2 import OperationalError +import psycopg2.extras as extras + +from .config import config + +CONN_PARAMS_DIC = config() + + +def show_psycopg2_exception(err): + """Error handleing for postgres issues.""" + err_type, traceback = sys.exc_info() + line_n = traceback.tb_lineno + print("\npsycopg2 ERROR:", err, "on line number:", line_n) + print("psycopg2 traceback:", traceback, "-- type:", err_type) + print("\nextensions.Diagnostics:", err) + print("pgerror:", err) + print("pgcode:", err, "\n") + + +def connect(thread): + 
"""Connect to postgres database.""" + conn = None + try: + conn = psycopg2.connect(**CONN_PARAMS_DIC) + except OperationalError as err: + show_psycopg2_exception(err) + conn = None + return conn + + +def close(conn): + """Close connection.""" + conn.close() + return + + +def execute_values(conn, dataframe, table, except_condition=";"): + """Insert into datafame.""" + tpls = [tuple(x) for x in dataframe.to_numpy()] + cols = ",".join(list(dataframe.columns)) + sql = "INSERT INTO {}({}) VALUES %s" + sql = sql + except_condition + cursor = conn.cursor() + try: + extras.execute_values(cursor, sql.format(table, cols), tpls) + conn.commit() + print("Data inserted using execute_values() successfully..") + except (Exception, psycopg2.DatabaseError) as err: + show_psycopg2_exception(err) + cursor.close() + + +def query_values(conn, table, where=";"): + """Insert of a datafame.""" + sql = "SELECT * FROM {}" + sql = sql + where + # try just pandas... pd..read_sql_query(sql, conn) + df = pd.read_sql_query(sql.format(table), conn) + conn.close() + return df + + +def query_orgs(thread): + """Query orgs.""" + conn = connect(thread) + orgs = query_values(conn, "organizations", " WHERE report_on is True;") + close(conn) + print(orgs) + return orgs + + +def query_roots(conn, org_uid): + """Insert into datafame.""" + sql = "SELECT * FROM root_domains WHERE organizations_uid = '{}'" + # try just pandas... pd..read_sql_query(sql, conn) + df = pd.read_sql_query(sql.format(org_uid), conn) + return df + + +def query_null_roots(conn, org_uid): + """Insert into datafame.""" + sql = "SELECT * FROM root_domains WHERE root_domain = 'Null_Root'" + # try just pandas... pd..read_sql_query(sql, conn) + df = pd.read_sql_query(sql, conn) + return df + + +def execute_hibp_breach_values(conn, dataframe, table): + """Insert into datafame.""" + tpls = [tuple(x) for x in dataframe.to_numpy()] + cols = ",".join(list(dataframe.columns)) + sql = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (breach_name) + DO UPDATE SET modified_date = EXCLUDED.modified_date;""" + cursor = conn.cursor() + try: + extras.execute_values( + cursor, + sql.format( + table, + cols, + ), + tpls, + ) + conn.commit() + print("Data inserted using execute_values() successfully..") + except (Exception, psycopg2.DatabaseError) as err: + show_psycopg2_exception(err) + cursor.close() + + +def execute_hibp_emails_values(conn, dataframe, table): + """Insert into datafame.""" + tpls = [tuple(x) for x in dataframe.to_numpy()] + cols = ",".join(list(dataframe.columns)) + sql = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (email, breach_name) + DO NOTHING;""" + cursor = conn.cursor() + try: + extras.execute_values( + cursor, + sql.format( + table, + cols, + ), + tpls, + ) + conn.commit() + print("Data inserted using execute_values() successfully..") + except (Exception, psycopg2.DatabaseError) as err: + show_psycopg2_exception(err) + cursor.close() + + +# No longer in use +def query_null_subs(conn): + """Insert into datafame.""" + sql = """SELECT o.name, o.organizations_uid, rd.root_domain, rd.root_domain_uid, sd.sub_domain, sd.sub_domain_uid FROM sub_domains as sd + JOIN root_domains as rd ON sd.root_domain_uid = rd.root_domain_uid + JOIN organizations as o ON o.organizations_uid = rd.organizations_uid + WHERE sub_domain = 'Null_Sub'""" + # try just pandas... 
pd..read_sql_query(sql, conn) + df = pd.read_sql_query(sql, conn) + return df + + +def execute_shodan_data(dataframe, table, thread, org_name, failed): + """Insert shodan data into db.""" + conn = connect(thread) + tpls = [tuple(x) for x in dataframe.to_numpy()] + cols = ",".join(list(dataframe.columns)) + sql = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (organizations_uid, ip, port, protocol, timestamp) + DO NOTHING;""" + cursor = conn.cursor() + try: + extras.execute_values( + cursor, + sql.format( + table, + cols, + ), + tpls, + ) + conn.commit() + print( + f"{thread} Data inserted using execute_values() successfully - {org_name}" + ) + except Exception as e: + print(f"{org_name} failed inserting into {table}") + print(f"{thread} {e} - {org_name}") + failed.append(f"{org_name} failed inserting into {table}") + conn.rollback() + cursor.close() + cursor.close() + return failed + + +def execute_dnsmonitor_data(dataframe, table): + """Execute dns monitor data.""" + conn = connect("") + tpls = [tuple(x) for x in dataframe.to_numpy()] + cols = ",".join(list(dataframe.columns)) + sql = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (domain_permutation, organizations_uid) + DO UPDATE SET ipv4 = EXCLUDED.ipv4, + ipv6 = EXCLUDED.ipv6, + date_observed = EXCLUDED.date_observed, + mail_server = EXCLUDED.mail_server, + name_server = EXCLUDED.name_server, + sub_domain_uid = EXCLUDED.sub_domain_uid, + data_source_uid = EXCLUDED.data_source_uid;""" + cursor = conn.cursor() + extras.execute_values( + cursor, + sql.format( + table, + cols, + ), + tpls, + ) + conn.commit() + print("DNSMonitor Data inserted using execute_values() successfully..") + + +def execute_dnsmonitor_alert_data(dataframe, table): + """Execute alert data.""" + conn = connect("") + tpls = [tuple(x) for x in dataframe.to_numpy()] + cols = ",".join(list(dataframe.columns)) + sql = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (alert_type, sub_domain_uid, date, new_value) + DO NOTHING;""" + cursor = conn.cursor() + extras.execute_values( + cursor, + sql.format( + table, + cols, + ), + tpls, + ) + conn.commit() + print("DNSMonitor Alert Data inserted using execute_values() successfully..") + + +def query_ips(org_id): + """Query IPs.""" + conn = connect("") + sql = """SELECT wa.asset as ip_address + FROM web_assets wa + WHERE wa.organizations_uid = '{}' + and wa.report_on = True + and wa.asset_type = 'ipv4' + """ + # to just return ipv4 change last line to the following: + # and wa.asset_type = 'ipv4' + df = pd.read_sql(sql.format(org_id), conn) + conn.close() + return df + + +def query_orgs_rev(): + """Query orgs in reverse.""" + conn = connect("") + sql = "SELECT * FROM organizations WHERE report_on is True ORDER BY organizations_uid DESC;" + df = pd.read_sql_query(sql, conn) + close(conn) + return df + + +def query_web_assets(conn, org_id): + """Query web assets.""" + sql = """SELECT o.name, o.organizations_uid, wa.asset_type, wa.asset, wa.ip_type, + wa.asset_origin, wa.report_on, wa.last_scanned + FROM web_assets as wa + JOIN organizations o on o.organizations_uid = wa.organizations_uid + WHERE wa.report_on = True + and o.organizations_uid = '{}' + """ + df = pd.read_sql(sql.format(org_id), conn) + + conn.close() + return df + + +# No longer in use +def check_ip(ip): + """Check IPs.""" + conn = connect("") + sql = """SELECT wa.asset as ip, o.name as org FROM web_assets wa + JOIN organizations o on o.organizations_uid = wa.organizations_uid + WHERE wa.asset = '{}'""" + df = pd.read_sql_query(sql.format(ip), conn) + close(conn) + return 
df + + +def getSubdomain(domain): + """Get subdomain.""" + conn = connect("") + cur = conn.cursor() + sql = """SELECT * FROM sub_domains sd + WHERE sd.sub_domain = '{}'""" + cur.execute(sql.format(domain)) + sub = cur.fetchone() + cur.close() + return sub + + +def getRootdomain(domain): + """Get root domain.""" + conn = connect("") + cur = conn.cursor() + sql = """SELECT * FROM root_domains rd + WHERE rd.root_domain = '{}'""" + cur.execute(sql.format(domain)) + root = cur.fetchone() + cur.close() + return root + + +# ***Scpecifically for DNSMonitor +# TODO: Don't hardcode subdomain uid +def addRootToSubdomain(domain): + """Add root to subdomain.""" + # TODO: getDataSource() + root_domain_uid = getRootdomain(domain)[0] + conn = connect("") + sql = """insert into sub_domains(sub_domain, root_domain_uid, root_domain, data_source_uid) + values ('{}', '{}', '{}','f7229dcc-98a9-11ec-a1c4-02589a36c9d7');""" + cur = conn.cursor() + cur.execute(sql.format(domain, root_domain_uid, domain)) + conn.commit() + close(conn) + print(f"Success adding root domain, {domain}, to subdomains table.") + + +def getDataSource(source): + """Get data source.""" + conn = connect("") + cur = conn.cursor() + sql = """SELECT * FROM data_source WHERE name='{}'""" + cur.execute(sql.format(source)) + source = cur.fetchone() + cur.close() + return source diff --git a/src/adhoc/enumerate_subs_from_root.py b/src/adhoc/enumerate_subs_from_root.py new file mode 100644 index 00000000..67a60bed --- /dev/null +++ b/src/adhoc/enumerate_subs_from_root.py @@ -0,0 +1,111 @@ +"""Script to enumerate subs based on a provided root domain.""" +# Standard Python Libraries +import datetime +import json + +# Third-Party Libraries +import pandas as pd +import requests + +# cisagov Libraries +from pe_reports.data.db_query import connect, execute_values, get_orgs + +API_WHOIS = "at_k5eJoD6do4NSnXL2BY3o1e9BH1t2b" + + +def query_roots(org_uid): + """Query all ips that link to a cidr related to a specific org.""" + print(org_uid) + conn = connect() + sql = """SELECT r.root_domain_uid, r.root_domain FROM root_domains r + where r.organizations_uid = %(org_uid)s + and r.enumerate_subs = True + """ + df = pd.read_sql(sql, conn, params={"org_uid": org_uid}) + conn.close() + return df + + +def execute_subs(conn, dataframe): + """Save subdomains dataframe to the P&E DB.""" + df = dataframe.drop_duplicates() + except_clause = """ ON CONFLICT (sub_domain, root_domain_uid) + DO + NOTHING;""" + execute_values(conn, df, "public.sub_domains", except_clause) + + +def get_data_source_uid(source): + """Get data source uid.""" + conn = connect() + cur = conn.cursor() + sql = """SELECT * FROM data_source WHERE name = '{}'""" + cur.execute(sql.format(source)) + source = cur.fetchone()[0] + cur.close() + cur = conn.cursor() + # Update last_run in data_source table + date = datetime.datetime.today().strftime("%Y-%m-%d") + sql = """update data_source set last_run = '{}' + where name = '{}';""" + cur.execute(sql.format(date, source)) + cur.close() + conn.close() + return source + + +def getSubdomain(domain, root_uid): + """Get all sub-domains from passed in root domain.""" + url = "https://domains-subdomains-discovery.whoisxmlapi.com/api/v1" + payload = json.dumps( + { + "apiKey": f"{API_WHOIS}", + "domains": {"include": [f"{domain}"]}, + "subdomains": {"include": ["*"], "exclude": []}, + } + ) + headers = {"Content-Type": "application/json"} + response = requests.request("POST", url, headers=headers, data=payload) + data = response.json() + subdomains = 
data["domainsList"] + print(subdomains) + + data_source = get_data_source_uid("WhoisXML") + found_subs = [ + { + "sub_domain": domain, + "root_domain_uid": root_uid, + "data_source_uid": data_source, + } + ] + for sub in subdomains: + if sub != f"www.{domain}": + found_subs.append( + { + "sub_domain": sub, + "root_domain_uid": root_uid, + "data_source_uid": data_source, + } + ) + return found_subs + + +def enumerate_and_save_subs(root_uid, root_domain): + """Enumerate subdomains basedon on a private root.""" + subs = getSubdomain(root_domain, root_uid) + subs = pd.DataFrame(subs) + conn = connect() + execute_subs(conn, subs) + + +def main(): + """Query orgs and run them through the enuemeration function.""" + orgs = get_orgs("") + for i, org in orgs.iterrows(): + roots = query_roots(org["organizations_uid"]) + for j, root in roots.iterrows(): + enumerate_and_save_subs(root["root_domain_uid"], root["root_domain"]) + + +if __name__ == "__main__": + main() diff --git a/src/adhoc/fill_cidrs_from_cyhy_assets.py b/src/adhoc/fill_cidrs_from_cyhy_assets.py new file mode 100644 index 00000000..9b7c3899 --- /dev/null +++ b/src/adhoc/fill_cidrs_from_cyhy_assets.py @@ -0,0 +1,52 @@ +"""Fill CIDRs table from cyhy assets.""" + +# Third-Party Libraries +import pandas as pd + +# cisagov Libraries +from pe_reports.data.db_query import connect + + +def query_cyhy_assets(cyhy_db_id, conn): + """Query cyhy assets.""" + sql = """ + SELECT * + FROM cyhy_db_assets ca + where ca.org_id = %(org_id)s; + """ + + df = pd.read_sql_query(sql, conn, params={"org_id": cyhy_db_id}) + + return df + + +def fill_cidrs(orgs): + """Fill CIDRs.""" + network_count = 0 + + for i, org in orgs.iterrows(): + conn = connect() + # if org['cyhy_db_name'] not in ['DOC', 'DOC_CENSUS']: + # continue + org_id = org["organizations_uid"] + print(org) + networks = query_cyhy_assets(org["cyhy_db_name"], conn) + print(networks) + for j, network in networks.iterrows(): + network_count += 1 + net = network["network"] + print(net) + cur = conn.cursor() + try: + cur.callproc("insert_cidr", (network["network"], org_id, "cyhy_db")) + except Exception as e: + print(e) + continue + + row = cur.fetchone() + print(row) + conn.commit() + cur.close() + conn.close() + + print(network_count) diff --git a/src/adhoc/fill_ips_from_cidrs.py b/src/adhoc/fill_ips_from_cidrs.py new file mode 100644 index 00000000..d00f708d --- /dev/null +++ b/src/adhoc/fill_ips_from_cidrs.py @@ -0,0 +1,86 @@ +"""Fill IPs table from CIDR blocks.""" +# Standard Python Libraries +import hashlib +import ipaddress +import logging + +# Third-Party Libraries +import pandas as pd +import psycopg2 + +# cisagov Libraries +from pe_reports.data.db_query import connect, show_psycopg2_exception + + +def execute_ips(conn, dataframe): + """Insert the ips into the ips table in the database and link them to the associated cidr.""" + for i, row in dataframe.iterrows(): + try: + cur = conn.cursor() + sql = """ + INSERT INTO ips(ip_hash, ip, origin_cidr) VALUES (%s, %s, %s) + ON CONFLICT (ip) + DO + UPDATE SET origin_cidr = UUID(EXCLUDED.origin_cidr); """ + cur.execute(sql, (row["ip_hash"], row["ip"], row["origin_cidr"])) + conn.commit() + except (Exception, psycopg2.DatabaseError) as err: + show_psycopg2_exception(err) + cur.close() + continue + print("IPs inserted using execute_values() successfully..") + + +def query_cidrs(): + """Query all cidrs ordered by length.""" + conn = connect() + sql = """SELECT tc.cidr_uid, tc.network, tc.organizations_uid, tc.insert_alert + FROM cidrs tc + ORDER BY 
masklen(tc.network) + """ + df = pd.read_sql(sql, conn) + conn.close() + return df + + +def enumerate_ips(cidr, cidr_uid): + """Enumerate all ips for a provided cidr.""" + ips_from_cidrs = [] + print(cidr) + for ip in ipaddress.IPv4Network(cidr): + hash_object = hashlib.sha256(str(ip).encode("utf-8")) + ip_obj = { + "ip_hash": hash_object.hexdigest(), + "ip": str(ip), + "origin_cidr": cidr_uid, + } + ips_from_cidrs.append(ip_obj) + return ips_from_cidrs + + +def fill_ips_from_cidrs(): + """For each cidr enumerate all ips and add them to the ips table.""" + cidrs = query_cidrs() + ips_from_cidrs = [] + for i, cidr in cidrs.iterrows(): + + if cidr["insert_alert"] is not None: + continue + ips_from_cidrs = ips_from_cidrs + enumerate_ips( + cidr["network"], cidr["cidr_uid"] + ) + ips_from_cidrs = pd.DataFrame(ips_from_cidrs) + logging.info(ips_from_cidrs) + logging.info(ips_from_cidrs.drop_duplicates(subset=["ip"])) + conn = connect() + execute_ips(conn, ips_from_cidrs) + print("Succuss adding IPS to Cidrs") + + +def main(): + """Separate and fill database table from cidr block.""" + fill_ips_from_cidrs() + + +if __name__ == "__main__": + main() diff --git a/src/adhoc/link_subs_and_ips_from_ips.py b/src/adhoc/link_subs_and_ips_from_ips.py new file mode 100644 index 00000000..71758d3e --- /dev/null +++ b/src/adhoc/link_subs_and_ips_from_ips.py @@ -0,0 +1,160 @@ +"""Link sub-domains and IPs from IP lookups.""" +# Standard Python Libraries +import datetime +import threading +import time + +# Third-Party Libraries +import numpy as np +import pandas as pd +import requests + +# cisagov Libraries +from pe_reports.data.db_query import connect + + +def reverseLookup(ip, failed_ips): + """Take an ip and find all associated subdomains.""" + # TODO: Add API key + api = "at_k5eJoD6do4NSnXL2BY3o1e9BH1t2b" + url = f"https://dns-history.whoisxmlapi.com/api/v1?apiKey={api}&ip={ip}" + payload = {} + headers = {} + response = requests.request("GET", url, headers=headers, data=payload).json() + if response.get("code") == 429: + response = requests.request("GET", url, headers=headers, data=payload).json() + if response.get("code") == 429: + response = requests.request( + "GET", url, headers=headers, data=payload + ).json() + if response.get("code") == 429: + failed_ips.append(ip) + found_domains = [] + try: + try: + # update last_reverse_lookup field + conn = connect() + cur = conn.cursor() + date = datetime.datetime.today().strftime("%Y-%m-%d") + sql = """update ips set last_reverse_lookup = %s + where ip = %s;""" + cur.execute(sql, (date, str(ip))) + conn.commit() + cur.close() + conn.close() + except Exception as e: + print("failed to update timestamp field") + print(e) + if response["size"] > 0: + + result = response["result"] + for domain in result: + print(domain) + try: + found_domains.append( + { + "sub_domain": domain["name"], + "root": ".".join(domain["name"].rsplit(".")[-2:]), + } + ) + except KeyError: + continue + + except Exception as e: + print(response) + print("failed to return response") + print(e) + return found_domains + + +def query_ips(org_uid): + """Query all ips that link to a cidr related to a specific org.""" + print(org_uid) + conn = connect() + sql = """SELECT i.ip_hash, i.ip, ct.network FROM ips i + JOIN cidrs ct on ct.cidr_uid = i.origin_cidr + where ct.organizations_uid = %(org_uid)s + and i.origin_cidr is not null + and (i.last_reverse_lookup < current_date - interval '7 days' or i.last_reverse_lookup is null) + """ + df = pd.read_sql(sql, conn, params={"org_uid": org_uid}) + 
conn.close() + return df + + +def link_domain_from_ip(ip_hash, ip, org_uid, data_source, failed_ips): + """From a provided ip find domains and link them in the db.""" + conn = connect() + found_domains = reverseLookup(ip, failed_ips) + for domain in found_domains: + cur = conn.cursor() + cur.callproc( + "link_ips_and_subs", + ( + ip_hash, + ip, + org_uid, + domain["sub_domain"], + data_source, + None, + domain["root"], + ), + ) + row = cur.fetchone() + print(row) + conn.commit() + cur.close() + return 1 + + +def run_ip_chunk(org, ips, thread): + """Run the provided chunk through the linking process.""" + org_uid = org["organizations_uid"] + count = 0 + start_time = time.time() + last_50 = time.time() + failed_ips = [] + for j, ip in ips.iterrows(): + count += 1 + if count % 50 == 0: + print(f"{thread} Currently Running ips: {count}/{len(ips)}") + print(f"{thread} {time.time() - last_50} seconds for the last 50 IPs") + last_50 = time.time() + try: + link_domain_from_ip( + ip["ip_hash"], ip["ip"], org_uid, "WhoisXML", failed_ips + ) + except requests.exceptions.SSLError as e: + print(e) + time.sleep(1) + continue + print(f"{thread} Ips took {time.time() - start_time} to link to subs") + + +def connect_subs_from_ips(orgs): + """For each org find all domains that are associated to an ip and create link in the ip_subs table.""" + for i, org in orgs.iterrows(): + print(f"Running on {org['name']}") + org_uid = org["organizations_uid"] + ips = query_ips(org_uid) + print(ips) + # run_ip_chunk(org,ips,"") + num_chunks = 8 + ips_split = np.array_split(ips, num_chunks) + + x = 0 + thread_list = [] + while x < len(ips_split): + thread_name = f"Thread {x+1}: " + # start thread + t = threading.Thread( + target=run_ip_chunk, args=(org, ips_split[x], thread_name) + ) + t.start() + thread_list.append(t) + x += 1 + + for thread in thread_list: + thread.join() + + print("All threads have finished.") diff --git a/src/adhoc/link_subs_and_ips_from_subs.py b/src/adhoc/link_subs_and_ips_from_subs.py new file mode 100644 index 00000000..dd98574c --- /dev/null +++ b/src/adhoc/link_subs_and_ips_from_subs.py @@ -0,0 +1,65 @@ +"""Link sub-domains and IPs from sub-domain lookups.""" +# Standard Python Libraries +import hashlib +import socket + +# Third-Party Libraries +import pandas as pd + +# cisagov Libraries +from pe_reports.data.db_query import connect + + +def find_ips(domain): + """Find the ip for a provided domain.""" + try: + ip = socket.gethostbyname(domain) + except Exception: + ip = None + print(ip) + return ip + + +def query_subs(org_uid): + """Query all subs for an organization.""" + conn = connect() + sql = """SELECT sd.* FROM sub_domains sd + JOIN root_domains rd on rd.root_domain_uid = sd.root_domain_uid + where rd.organizations_uid = %(org_uid)s + """ + df = pd.read_sql(sql, conn, params={"org_uid": org_uid}) + conn.close() + return df + + +def link_ip_from_domain(sub, root_uid, org_uid, data_source): + """Link IP from domain.""" + conn = connect() + ip = find_ips(sub) + if not ip: + return 0 + hash_object = hashlib.sha256(str(ip).encode("utf-8")) + ip_hash = hash_object.hexdigest() + cur = conn.cursor() + cur.callproc( + "link_ips_and_subs", (ip_hash, ip, org_uid, sub, data_source, root_uid, None) + ) + row = cur.fetchone() + print(row) + conn.commit() + cur.close() + return 1 + + +def connect_ips_from_subs(orgs): + """For each org, find all ips associated with its sub_domains and link them in the ips_subs table.""" + for i, org in orgs.iterrows(): + org_uid = org["organizations_uid"] + subs = 
query_subs(str(org_uid)) + for i, sub in subs.iterrows(): + sub_domain = sub["sub_domain"] + root_uid = sub["root_domain_uid"] + if sub_domain == "Null_Sub": + continue + link_ip_from_domain(sub_domain, root_uid, org_uid, "unknown") + print("Finished connecting ips from subs") diff --git a/src/adhoc/shodan_dedupe.py b/src/adhoc/shodan_dedupe.py new file mode 100644 index 00000000..a0c5f915 --- /dev/null +++ b/src/adhoc/shodan_dedupe.py @@ -0,0 +1,400 @@ +"""Shodan dedupe script.""" +# Standard Python Libraries +import hashlib +import logging +import time + +# Third-Party Libraries +import pandas as pd +import shodan + +# cisagov Libraries +from pe_reports.data.db_query import close, connect, execute_values, get_orgs_df + +states = [ + "AL", + "AK", + "AZ", + "AR", + "CA", + "CO", + "CT", + "DC", + "DE", + "FL", + "GA", + "HI", + "ID", + "IL", + "IN", + "IA", + "KS", + "KY", + "LA", + "ME", + "MD", + "MA", + "MI", + "MN", + "MS", + "MO", + "MT", + "NE", + "NV", + "NH", + "NJ", + "NM", + "NY", + "NC", + "ND", + "OH", + "OK", + "OR", + "PA", + "RI", + "SC", + "SD", + "TN", + "TX", + "UT", + "VT", + "VA", + "WA", + "WV", + "WI", + "WY", +] +state_names = [ + "Alaska", + "Alabama", + "Arkansas", + "American Samoa", + "Arizona", + "California", + "Colorado", + "Connecticut", + "Delaware", + "Florida", + "Georgia", + "Guam", + "Hawaii", + "Iowa", + "Idaho", + "Illinois", + "Indiana", + "Kansas", + "Kentucky", + "Louisiana", + "Massachusetts", + "Maryland", + "Maine", + "Michigan", + "Minnesota", + "Missouri", + "Mississippi", + "Montana", + "North Carolina", + "North Dakota", + "Nebraska", + "New Hampshire", + "New Jersey", + "New Mexico", + "Nevada", + "New York", + "Ohio", + "Oklahoma", + "Oregon", + "Pennsylvania", + "Puerto Rico", + "Rhode Island", + "South Carolina", + "South Dakota", + "Tennessee", + "Texas", + "Utah", + "Virginia", + "Virgin Islands", + "Vermont", + "Washington", + "Wisconsin", + "West Virginia", + "Wyoming", +] + + +def state_check(host_org): + """Check state.""" + found = False + if host_org: + for state in state_names: + if state in host_org: + return state + return found + + +def query_floating_ips(conn, org_id): + """Query floating IPs.""" + sql = """ + SELECT i.ip + FROM ips i + join ips_subs ip_s on ip_s.ip_hash = i.ip_hash + join sub_domains sd on sd.sub_domain_uid = ip_s.sub_domain_uid + join root_domains rd on rd.root_domain_uid = sd.root_domain_uid + WHERE rd.organizations_uid = %(org_id)s + AND i.origin_cidr is null; + """ + df = pd.read_sql(sql, conn, params={"org_id": org_id}) + ips = set(df["ip"]) + conn.close() + return ips + + +def query_cidrs(conn, org_id): + """Query Cidr.""" + print(org_id) + sql = """ + SELECT network, cidr_uid + FROM cidrs ct + join organizations o on o.organizations_uid = ct.organizations_uid + WHERE o.organizations_uid = %(org_id)s; + """ + df = pd.read_sql(sql, conn, params={"org_id": org_id}) + conn.close() + return df + + +def cidr_dedupe(cidrs, api, org_type): + """Dedupe CIDR.""" + ip_obj = [] + results = [] + for i, cidr in cidrs.iterrows(): + query = f"net:{cidr['network']}" + result = search(api, query, ip_obj, cidr["cidr_uid"], org_type) + results.append(result) + found = len([i for i in results if i != 0]) + logging.info(f"CIDRs with IPs found: {found}") + new_ips = pd.DataFrame(ip_obj) + if len(new_ips) > 0: + new_ips = new_ips.drop_duplicates(subset="ip", keep="first") + conn = connect() + except_clause = """ ON CONFLICT (ip) + DO + UPDATE SET shodan_results = EXCLUDED.shodan_results""" + execute_values(conn, new_ips, 
"public.ips", except_clause) + close(conn) + + +def ip_dedupe(api, ips, agency_type): + """Count number of IPs with data on Shodan.""" + matched = 0 + ips = list(ips) + float_ips = [] + for i in range(int(len(ips) / 100) + 1): + if (i + 1) * 100 > len(ips): + try: + hosts = api.host(ips[i * 100 : len(ips)]) + except shodan.exception.APIError: + try: + time.sleep(2) + hosts = api.host(ips[i * 100 : len(ips)]) + except Exception: + logging.error(f"{i} failed again") + continue + except shodan.APIError as e: + logging.error("Error: {}".format(e)) + else: + try: + hosts = api.host(ips[i * 100 : (i + 1) * 100]) + except shodan.exception.APIError: + time.sleep(2) + try: + hosts = api.host(ips[i * 100 : (i + 1) * 100]) + except shodan.APIError as err: + print("Error: {}".format(err)) + continue + if isinstance(hosts, list): + for h in hosts: + state = state_check(h["org"]) + hash_object = hashlib.sha256(str(h["ip_str"]).encode("utf-8")) + ip_hash = hash_object.hexdigest() + if state and agency_type == "FEDERAL": + float_ips.append( + { + "ip_hash": ip_hash, + "ip": h["ip_str"], + "shodan_results": False, + "origin_cidr": None, + } + ) + else: + float_ips.append( + { + "ip_hash": ip_hash, + "ip": h["ip_str"], + "shodan_results": True, + "origin_cidr": None, + } + ) + else: + state = state_check(hosts["org"]) + hash_object = hashlib.sha256(str(hosts["ip_str"]).encode("utf-8")) + ip_hash = hash_object.hexdigest() + if state and agency_type == "FEDERAL": + float_ips.append( + { + "ip_hash": ip_hash, + "ip": hosts["ip_str"], + "shodan_results": False, + "origin_cidr": None, + } + ) + else: + float_ips.append( + { + "ip_hash": ip_hash, + "ip": hosts["ip_str"], + "shodan_results": True, + "origin_cidr": None, + } + ) + matched = matched + len(hosts) + new_ips = pd.DataFrame(float_ips) + if len(new_ips) > 0: + new_ips = new_ips.drop_duplicates(subset="ip", keep="first") + conn = connect() + except_clause = """ ON CONFLICT (ip) + DO + UPDATE SET shodan_results = EXCLUDED.shodan_results""" + execute_values(conn, new_ips, "public.ips", except_clause) + close(conn) + + +def search(api, query, ip_obj, cidr_uid, org_type): + """Search Shodan API using query and add IPs to set.""" + # Wrap the request in a try/ except block to catch errors + try: + logging.info(query) + # Search Shodan + try: + results = api.search(query) + except shodan.exception.APIError: + time.sleep(2) + results = api.search(query) + # Show the results + for result in results["matches"]: + # if ":" in result["ip_str"]: + # print("ipv6 found ", result["ip_str"]) + # ip_type = "ipv6" + # else: + # ip_type = "ipv4" + state = state_check(result["org"]) + hash_object = hashlib.sha256(str(result["ip_str"]).encode("utf-8")) + ip_hash = hash_object.hexdigest() + if state and org_type == "FEDERAL": + ip_obj.append( + { + "ip_hash": ip_hash, + "ip": result["ip_str"], + "shodan_results": False, + "origin_cidr": cidr_uid, + } + ) + else: + ip_obj.append( + { + "ip_hash": ip_hash, + "ip": result["ip_str"], + "shodan_results": True, + "origin_cidr": cidr_uid, + } + ) + i = 1 + while i < results["total"] / 100: + try: + # Search Shodan + try: + results = api.search(query=query, page=i) + except shodan.exception.APIError: + time.sleep(2) + results = api.search(query, page=i) + # Show the results + for result in results["matches"]: + # if ":" in result["ip_str"]: + # print("ipv6 found ", result["ip_str"]) + # ip_type = "ipv6" + # else: + # ip_type = "ipv4" + state = state_check(result["org"]) + hash_object = 
hashlib.sha256(str(result["ip_str"]).encode("utf-8")) + ip_hash = hash_object.hexdigest() + if state and org_type == "FEDERAL": + ip_obj.append( + { + "ip_hash": ip_hash, + "ip": result["ip_str"], + "shodan_results": False, + "origin_cidr": cidr_uid, + } + ) + else: + ip_obj.append( + { + "ip_hash": ip_hash, + "ip": result["ip_str"], + "shodan_results": True, + "origin_cidr": cidr_uid, + } + ) + i = i + 1 + except shodan.APIError as e: + logging.error("Error: {}".format(e)) + logging.error(query) + results = {"total": 0} + except shodan.APIError as e: + logging.error("Error: {}".format(e)) + # IF it breaks to here it fails + logging.error(f"Failed on {query}") + return 0 + return results["total"] + + +def dedupe(orgs): + """Check list of IPs, CIDRs, ASNS, and FQDNs in Shodan and output set of IPs.""" + # get username and password from config file + key = "dkx4uJW66PXjihngybpdWuuJjNF7yxrE" + api = shodan.Shodan(key) + # , 'USTDA', 'VA', 'DHS_ST', 'DOC_BEA', 'DOC_BIS', 'DOC_CENSUS', 'DOC_NIST', 'DOC_NOAA', 'DOC_NTIA', 'DOC_OS', 'DOC_USPTO', 'NRC', 'EAC', 'ED', 'GSEC', 'GSA', 'HHS', 'HHS_NIH', 'NASA', 'OPM', 'CFPB', 'NSF', 'SSS', 'DOC_OIG', 'DOL_BLS' + for i, org in orgs.iterrows(): + # if org['cyhy_db_name'] in [ 'DHS_NPPD', 'DHS_CBP', 'DHS_CIS', 'DHS_FEMA', + # 'DHS_FLETC', 'DHS_ICE', 'DHS_TSA', 'DHS_USSS','DOI_OS-OAS', 'DOC','DOI_OSM', + # 'DOL','DOE','DOS','DOJ','DOT','HHS_FDA','HUD','SBA','DOI_OS','DOI_IBC','DOI', + # 'DOI_NPS','DOI_BIA','DOI_BLM', 'DOI_FWS', 'DOI_BSEE-BOEM-ONRR', 'DOI_BOR', 'EOP', + # 'EPA', 'SSA', 'USAID','USDA']: + # continue + logging.info(f"Running on {org['name']}") + conn = connect() + cidrs = query_cidrs(conn, org["organizations_uid"]) + logging.info(f"{len(cidrs)} cidrs found") + if len(cidrs) > 0: + cidr_dedupe(cidrs, api, org["agency_type"]) + conn = connect() + logging.info("Grabbing floating IPs") + ips = query_floating_ips(conn, org["organizations_uid"]) + logging.info("Got Ips") + if len(ips) > 0: + logging.info("Running dedupe on IPs") + ip_dedupe(api, ips, org["agency_type"]) + logging.info("Finished dedupe") + + +def main(): + """Get all organization information from shodan.""" + orgs = get_orgs_df() + orgs = orgs[orgs["report_on"] is True] + print(orgs) + + dedupe(orgs) + + +if __name__ == "__main__": + main() diff --git a/src/pe_reports/__init__.py b/src/pe_reports/__init__.py index c7a0b3ac..f8f032bf 100644 --- a/src/pe_reports/__init__.py +++ b/src/pe_reports/__init__.py @@ -21,6 +21,8 @@ # Stakeholder views from pe_reports.home.views import home_blueprint from pe_reports.stakeholder.views import stakeholder_blueprint +from pe_reports.stakeholder_full.views import stakeholder_full_blueprint +from pe_reports.stakeholder_lite.views import stakeholder_lite_blueprint from ._version import __version__ # noqa: F401 @@ -75,6 +77,8 @@ # Register the flask apps app.register_blueprint(stakeholder_blueprint) +app.register_blueprint(stakeholder_lite_blueprint) +app.register_blueprint(stakeholder_full_blueprint) # TODO: Add login blueprint. 
Issue #207 contains details # app.register_blueprint(manage_login_blueprint) app.register_blueprint(home_blueprint) diff --git a/src/pe_reports/data/db_query.py b/src/pe_reports/data/db_query.py index cc28cc36..05f8d78f 100644 --- a/src/pe_reports/data/db_query.py +++ b/src/pe_reports/data/db_query.py @@ -1,6 +1,7 @@ """Query the PE PostgreSQL database.""" # Standard Python Libraries +import logging import sys # Third-Party Libraries @@ -9,14 +10,12 @@ import psycopg2 from psycopg2 import OperationalError from psycopg2.extensions import AsIs - -# cisagov Libraries -from pe_reports import app +import psycopg2.extras as extras from .config import config # Setup logging to central file -LOGGER = app.config["LOGGER"] +LOGGER = logging.getLogger(__name__) CONN_PARAMS_DIC = config() @@ -48,6 +47,22 @@ def close(conn): return +def execute_values(conn, dataframe, table, except_condition=";"): + """INSERT into table, generic.""" + tpls = [tuple(x) for x in dataframe.to_numpy()] + cols = ",".join(list(dataframe.columns)) + sql = "INSERT INTO {}({}) VALUES %s" + sql = sql + except_condition + cursor = conn.cursor() + try: + extras.execute_values(cursor, sql.format(table, cols), tpls) + conn.commit() + print("Data inserted using execute_values() successfully..") + except (Exception, psycopg2.DatabaseError) as err: + show_psycopg2_exception(err) + cursor.close() + + def get_orgs(conn): """Query organizations table.""" try: @@ -64,6 +79,34 @@ def get_orgs(conn): close(conn) +def get_orgs_df(): + """Query organizations table for new orgs.""" + conn = connect() + try: + sql = """SELECT * FROM organizations""" + pe_orgs_df = pd.read_sql(sql, conn) + return pe_orgs_df + except (Exception, psycopg2.DatabaseError) as error: + logging.error("There was a problem with your database query %s", error) + finally: + if conn is not None: + close(conn) + + +def get_new_orgs(): + """Query organizations table for new orgs.""" + conn = connect() + try: + sql = """SELECT * FROM organizations WHERE report_on='False'""" + pe_orgs_df = pd.read_sql(sql, conn) + return pe_orgs_df + except (Exception, psycopg2.DatabaseError) as error: + logging.error("There was a problem with your database query %s", error) + finally: + if conn is not None: + close(conn) + + def query_creds_view(org_uid, start_date, end_date): """Query credentials view.""" conn = connect() diff --git a/src/pe_reports/stakeholder/templates/stakeholder_UI/home_stakeholder.html b/src/pe_reports/stakeholder/templates/stakeholder_UI/home_stakeholder.html index 900788bf..80fdb293 100644 --- a/src/pe_reports/stakeholder/templates/stakeholder_UI/home_stakeholder.html +++ b/src/pe_reports/stakeholder/templates/stakeholder_UI/home_stakeholder.html @@ -8,71 +8,67 @@ height="200" />
-
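The adhoc scripts added above appear intended to run as a pipeline: fill CIDRs from cyhy assets, enumerate IPs for those CIDRs, enumerate sub-domains from each root domain, link IPs and sub-domains in both directions, and finally run the Shodan dedupe pass. Below is a minimal driver sketch, not part of the diff itself, showing how the pieces might be chained; it assumes src/ is on the import path and uses only functions defined in the files added above.

"""Illustrative driver for the adhoc scripts (sketch only, not part of this changeset)."""
# cisagov Libraries
from pe_reports.data.db_query import get_orgs_df

# adhoc helpers introduced in this changeset (assumes src/ is importable)
from adhoc.enumerate_subs_from_root import enumerate_and_save_subs, query_roots
from adhoc.fill_cidrs_from_cyhy_assets import fill_cidrs
from adhoc.fill_ips_from_cidrs import fill_ips_from_cidrs
from adhoc.link_subs_and_ips_from_ips import connect_subs_from_ips
from adhoc.link_subs_and_ips_from_subs import connect_ips_from_subs
from adhoc.shodan_dedupe import dedupe


def run_adhoc_pipeline():
    """Chain the adhoc steps in one pass over the reportable organizations."""
    orgs = get_orgs_df()
    # Boolean-mask filter: keep only organizations flagged for reporting.
    orgs = orgs[orgs["report_on"]]

    fill_cidrs(orgs)  # CIDRs from cyhy_db_assets
    fill_ips_from_cidrs()  # enumerate every IP inside each stored CIDR

    # Sub-domain enumeration per root domain (WhoisXML), mirroring
    # enumerate_subs_from_root.main() but reusing the filtered orgs frame.
    for _, org in orgs.iterrows():
        roots = query_roots(org["organizations_uid"])
        for _, root in roots.iterrows():
            enumerate_and_save_subs(root["root_domain_uid"], root["root_domain"])

    connect_subs_from_ips(orgs)  # reverse lookups: IP -> sub-domains
    connect_ips_from_subs(orgs)  # forward lookups: sub-domain -> IP
    dedupe(orgs)  # Shodan presence check / dedupe


if __name__ == "__main__":
    run_adhoc_pipeline()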