diff --git a/setup-client.py b/client-setup.py similarity index 98% rename from setup-client.py rename to client-setup.py index fcb8ce7..f86ddcc 100755 --- a/setup-client.py +++ b/client-setup.py @@ -37,6 +37,7 @@ ERROR_DIR = CLIENT_DIR / "error" DIRS_TO_WRITE = [ + WORKSPACE_DIR, CLIENT_DIR, RECEIVED_DIR, CRAWLING_START_DIR, diff --git a/dispatch.py b/dispatch.py index 9ecf9be..585e1ea 100755 --- a/dispatch.py +++ b/dispatch.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 import argparse +import ipaddress import multiprocessing import pathlib import sys +import pgcrawl.consts as PG_CONSTS import pgcrawl.dispatch as PG_DISPATCH from pgcrawl.logging import log @@ -12,31 +14,47 @@ PARSER = argparse.ArgumentParser( prog="pagegraph-tranco-crawl dispatcher", description="Script responsible for dispatching and coordinating calls " - "to child SSH servers.", + "to child SSH server.\n" + "The full set of steps to take to set up a client from " + "scratch are: --test-connection, --install-client-code, " + "--check-client-code, --setup-client-code", formatter_class=argparse.ArgumentDefaultsHelpFormatter) PARSER.add_argument( - "-f", "--filepath", - default="./ips.txt", - help="Path to a file of IP addresses, describing the child servers to " - "use to crawl pages.") + "ip", + help="The IP address of the client server to interact with.") PARSER.add_argument( - "--test-connections", - default=False, - action="store_true", - help="Test connections to child servers, but don't make any changes.") + "--test-connection", + default=False, + action="store_true", + help="Test connection to child server, but don't make any changes.") PARSER.add_argument( - "--install-client-code", - default=False, - action="store_true", - help="Install this crawling code on child servers, and then quit.") + "--delete-client-code", + default=False, + action="store_true", + help="Install code at `--client-code-path` if present.") PARSER.add_argument( - "--client-code-path", - default="~/pagegraph-tranco-crawl, - help="Relative path to where this code should be installed on each client.") + "--install-client-code", + default=False, + action="store_true", + help="Install this crawling code on child server.") PARSER.add_argument( - "-u", "--user", - default="ubuntu", - help="The user to use when SSH'ing to a client server.") + "--check-client-code", + default=False, + action="store_true", + help="Check this crawling code is installed on child server, then quit.") +PARSER.add_argument( + "--setup-client-code", + default=False, + action="store_true", + help="Run the set up script for this code on the child server.") +PARSER.add_argument( + "--client-code-path", + default=PG_CONSTS.DEFAULT_CLIENT_CODE_PATH, + help="Path to where this code should be installed on the client.") +PARSER.add_argument( + "-u", "--user", + default="ubuntu", + help="The user to use when SSH'ing to a client server.") PARSER.add_argument( "--quiet", "-q", default=False, @@ -46,21 +64,39 @@ ARGS = PARSER.parse_args() QUIET = ARGS.quiet USER = ARGS.user +CLIENT_PATH = ARGS.client_code_path + +IP = ipaddress.ip_address(ARGS.ip) + +if ARGS.test_connection: + log(f"Checking connection to {IP}.") + if not PG_DISPATCH.test_connection(USER, IP, QUIET): + sys.exit(1) + sys.exit(0) -FILEPATH = pathlib.Path(ARGS.filepath) +if ARGS.delete_client_code: + log(f"Attempting to delete client code from {IP}:{CLIENT_PATH}.") + if not PG_DISPATCH.delete_client_code(USER, IP, CLIENT_PATH, QUIET): + sys.exit(1) + sys.exit(0) -IP_ADDRESSES = PG_DISPATCH.read_ips(FILEPATH) -if ARGS.test_connections: - log(f"Checking connections to {len(IP_ADDRESSES)} child servers") - for ip in IP_ADDRESSES: - if not PG_DISPATCH.test_connection(USER, ip, QUIET): - sys.exit(1) - sys.exit(0) +if ARGS.check_client_code: + log(f"Checking if client code is installed on {IP}:{CLIENT_PATH}.") + if not PG_DISPATCH.check_client_code(USER, IP, CLIENT_PATH, QUIET): + sys.exit(1) + sys.exit(0) -if ARGS.install_client: - CLIENT_PATH = ARGS.client_code_path - log(f"Installing on {len(IP_ADDRESSES)} servers, at {CLIENT_PATH}") +if ARGS.install_client_code: + log(f"Attempting to install client code on {IP}:{CLIENT_PATH}.") + if not PG_DISPATCH.install_client_code(USER, IP, CLIENT_PATH, QUIET): + sys.exit(1) + sys.exit(0) +if ARGS.setup_client_code: + log(f"Attempting to run setup / init code on {IP}:{CLIENT_PATH}.") + if not PG_DISPATCH.setup_client_code(USER, IP, CLIENT_PATH, QUIET): + sys.exit(1) + sys.exit(0) # def name_to_parts(name): @@ -90,11 +126,12 @@ # workload = itertools.islice(todo_files, index, index + chunk_size) # parsed_workload = [name_to_parts(x.name) for x in workload] # print(parsed_workload) -# commands = [f"python3 /home/ubuntu/crawl.py {domain} {rank}" for rank, domain in parsed_workload] +# commands = [f"python3 /home/ubuntu/crawl.py {domain} {rank}" +# for rank, domain in parsed_workload] # ip_cmds = zip(instance_ips, commands) # with multiprocessing.Pool(chunk_size) as p: # p.map(run_cmd, ip_cmds) # total += len(list(workload)) # index += chunk_size # break -# print(total) \ No newline at end of file +# print(total) diff --git a/setup-dispatcher.py b/dispatcher-setup.py similarity index 100% rename from setup-dispatcher.py rename to dispatcher-setup.py diff --git a/ips.txt b/ips.txt deleted file mode 100644 index b7102aa..0000000 --- a/ips.txt +++ /dev/null @@ -1 +0,0 @@ -# Add the IPs that should be SSH'ed and used to crawl here. \ No newline at end of file diff --git a/lint.sh b/lint.sh index 82bd54e..646e45e 100755 --- a/lint.sh +++ b/lint.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -LOCAL_SCRIPTS="setup-client.py setup-dispatcher.py pgcrawl/*.py" +LOCAL_SCRIPTS="client-setup.py dispatcher-setup.py dispatch.py pgcrawl/*.py" pycodestyle $LOCAL_SCRIPTS mypy --strict $LOCAL_SCRIPTS diff --git a/pgcrawl/consts.py b/pgcrawl/consts.py index a5d7c3e..f3b06c1 100644 --- a/pgcrawl/consts.py +++ b/pgcrawl/consts.py @@ -1,7 +1,7 @@ import pathlib -GIT_URL = "https://github.com/brave-experiments/pagegraph-tranco-crawl" +GIT_URL = "git@github.com:brave-experiments/pagegraph-tranco-crawl.git" PG_CRAWL_GIT_URL = "https://github.com/brave/pagegraph-crawl" BRAVE_INSTALL_SCRIPT = pathlib.Path("./scripts/install-brave-nightly.sh") -DEFAULT_IP_FILE = pathlib.Path("./ips.txt") +DEFAULT_CLIENT_CODE_PATH = "~/pagegraph-tranco-crawl" diff --git a/pgcrawl/dispatch.py b/pgcrawl/dispatch.py index 74fbf50..3738104 100644 --- a/pgcrawl/dispatch.py +++ b/pgcrawl/dispatch.py @@ -1,49 +1,122 @@ +from io import StringIO from ipaddress import ip_address, IPv4Address, IPv6Address import json from pathlib import Path from fabric import Connection -from pgcrawl.consts import DEFAULT_IP_FILE +from pgcrawl.consts import GIT_URL from pgcrawl.logging import log, error -def test_connection(user: str, ip: IPv4Address | IPv6Address, - quiet: bool = False) -> bool: - expected_user_dir = "/home/" + user - test_cmd = f"test -d {expected_user_dir}" +def run(conn: Connection, cmd: str, quiet: bool) -> bool: + log(f"* calling {conn.user}@{conn.host}: {cmd}", quiet) try: - log(f"Testing existence of to {user}@{ip}:{expected_user_dir}") - rs = Connection(host=str(ip), user=user).run(test_cmd) - log(rs.stdout, quiet) + stdout_stream = StringIO() + stderr_stream = StringIO() + rs = conn.run(cmd, warn=True, hide=True, out_stream=stdout_stream, + err_stream=stderr_stream) + log(stdout_stream.getvalue(), quiet) if rs.exited != 0: - error(rs.stderr) + error(stdout_stream.getvalue()) return False - else: - log("...connected successfully!") - return True + return True except Exception as e: error(str(e)) return False +def activate_env_cmd_str(client_path: str) -> str: + project_name = Path(client_path).name + commands = [ + "cd ~/", + f"cd {client_path}", + ". ./bin/activate", + f"cd {project_name}" + ] + activate_env_cmd = " && ".join(commands) + return activate_env_cmd + + +def test_connection(user: str, ip: IPv4Address | IPv6Address, + quiet: bool = False) -> bool: + conn = Connection(host=str(ip), user=user) + expected_user_dir = "/home/" + user + test_cmd = f"test -d {expected_user_dir}" + if run(conn, test_cmd, quiet): + conn.close() + log(" connected successfully!", quiet) + return True + return False + + +def check_client_code(user: str, ip: IPv4Address | IPv6Address, + client_path: str, + quiet: bool = False) -> bool: + conn = Connection(host=str(ip), user=user) + check_install_cmd = f"test -d {client_path}" + + rs = run(conn, check_install_cmd, quiet) + conn.close() + if rs: + log(" Looks already installed!", quiet) + return True + else: + log(" client code does not seem present", quiet) + return False + + +def delete_client_code(user: str, ip: IPv4Address | IPv6Address, + client_path: str, + quiet: bool = False) -> bool: + conn = Connection(host=str(ip), user=user) + delete_install = f"rm -Rf {client_path}" + rs = run(conn, delete_install, quiet) + conn.close() + if rs: + log(" installed deleted!", quiet) + return True + else: + log(" error when deleting", quiet) + return False + + def install_client_code(user: str, ip: IPv4Address | IPv6Address, client_path: str, quiet: bool = False) -> bool: - return True + intended_dest = Path(client_path).name + commands = [ + f"python3 -m venv {intended_dest}", + f"cd {intended_dest}", + f". ./bin/activate", + f"git clone {GIT_URL} {intended_dest}", + f"cd ./{intended_dest}", + f"pip3 install -r requirements.txt" + ] + combined_cmd = " && ".join(commands) + conn = Connection(host=str(ip), user=user) + rs = run(conn, combined_cmd, quiet) + conn.close() + if rs: + log(" installed successfully!", quiet) + return True + else: + log(" some error?", quiet) + return False -def read_ips(path: None | Path = None, - quiet: bool = False) -> list[IPv4Address | IPv6Address]: - if path: - ip_path = path - else: - ip_path = DEFAULT_IP_FILE - lines = ip_path.read_text().split("\n") - ips = [] - for a_line in lines: - try: - ips.append(ip_address(a_line.strip())) - except ValueError: - log(f"Ignoring line: {a_line}", quiet) - return ips +def setup_client_code(user: str, ip: IPv4Address | IPv6Address, + client_path: str, + quiet: bool = False) -> bool: + log(f"- attempting to setup project at {client_path}.", quiet) + conn = Connection(host=str(ip), user=user) + setup_cmd = activate_env_cmd_str(client_path) + setup_cmd += " && ./client-setup.py" + if quiet: + setup_cmd += " --quiet" + rs = run(conn, setup_cmd, quiet) + if not rs: + conn.close() + log("! but an error occurred!", quiet) + return False + return True