Skip to content

Commit

Permalink
complete dispatch setup
Browse files Browse the repository at this point in the history
  • Loading branch information
pes10k committed Jun 24, 2024
1 parent 7adfb37 commit efe2f07
Show file tree
Hide file tree
Showing 7 changed files with 173 additions and 63 deletions.
1 change: 1 addition & 0 deletions setup-client.py → client-setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
ERROR_DIR = CLIENT_DIR / "error"

DIRS_TO_WRITE = [
WORKSPACE_DIR,
CLIENT_DIR,
RECEIVED_DIR,
CRAWLING_START_DIR,
Expand Down
101 changes: 69 additions & 32 deletions dispatch.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,60 @@
#!/usr/bin/env python3

import argparse
import ipaddress
import multiprocessing
import pathlib
import sys

import pgcrawl.consts as PG_CONSTS
import pgcrawl.dispatch as PG_DISPATCH
from pgcrawl.logging import log


# Command-line interface for the dispatcher.  The script targets exactly one
# client server per invocation, identified by the positional `ip` argument;
# the boolean flags below select which action to perform against it.
PARSER = argparse.ArgumentParser(
    prog="pagegraph-tranco-crawl dispatcher",
    description="Script responsible for dispatching and coordinating calls "
                "to child SSH server.\n"
                "The full set of steps to take to set up a client from "
                "scratch are: --test-connection, --install-client-code, "
                "--check-client-code, --setup-client-code",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
PARSER.add_argument(
    "ip",
    help="The IP address of the client server to interact with.")
PARSER.add_argument(
    "--test-connection",
    default=False,
    action="store_true",
    help="Test connection to child server, but don't make any changes.")
PARSER.add_argument(
    "--delete-client-code",
    default=False,
    action="store_true",
    # The help text previously said "Install", which contradicted the flag.
    help="Delete code at `--client-code-path` if present.")
PARSER.add_argument(
    "--install-client-code",
    default=False,
    action="store_true",
    help="Install this crawling code on child server.")
PARSER.add_argument(
    "--check-client-code",
    default=False,
    action="store_true",
    help="Check this crawling code is installed on child server, then quit.")
PARSER.add_argument(
    "--setup-client-code",
    default=False,
    action="store_true",
    help="Run the set up script for this code on the child server.")
PARSER.add_argument(
    "--client-code-path",
    default=PG_CONSTS.DEFAULT_CLIENT_CODE_PATH,
    help="Path to where this code should be installed on the client.")
PARSER.add_argument(
    "-u", "--user",
    default="ubuntu",
    help="The user to use when SSH'ing to a client server.")
PARSER.add_argument(
"--quiet", "-q",
default=False,
Expand All @@ -46,21 +64,39 @@
ARGS = PARSER.parse_args()
QUIET = ARGS.quiet
USER = ARGS.user
CLIENT_PATH = ARGS.client_code_path

# Validate the target address up front so a typo yields a normal usage error
# (exit status 2) instead of an uncaught ValueError traceback.
try:
    IP = ipaddress.ip_address(ARGS.ip)
except ValueError:
    PARSER.error(f"invalid IP address: {ARGS.ip!r}")

# Each action below is handled independently and exits as soon as it has run,
# so when several action flags are passed only the first one in this order is
# performed.  The exit status is 0 when the helper reports success, 1
# otherwise.  If no action flag is given, nothing is done.
if ARGS.test_connection:
    log(f"Checking connection to {IP}.")
    sys.exit(0 if PG_DISPATCH.test_connection(USER, IP, QUIET) else 1)

if ARGS.delete_client_code:
    log(f"Attempting to delete client code from {IP}:{CLIENT_PATH}.")
    deleted = PG_DISPATCH.delete_client_code(USER, IP, CLIENT_PATH, QUIET)
    sys.exit(0 if deleted else 1)

if ARGS.check_client_code:
    log(f"Checking if client code is installed on {IP}:{CLIENT_PATH}.")
    present = PG_DISPATCH.check_client_code(USER, IP, CLIENT_PATH, QUIET)
    sys.exit(0 if present else 1)

if ARGS.install_client_code:
    log(f"Attempting to install client code on {IP}:{CLIENT_PATH}.")
    installed = PG_DISPATCH.install_client_code(USER, IP, CLIENT_PATH, QUIET)
    sys.exit(0 if installed else 1)

if ARGS.setup_client_code:
    log(f"Attempting to run setup / init code on {IP}:{CLIENT_PATH}.")
    configured = PG_DISPATCH.setup_client_code(USER, IP, CLIENT_PATH, QUIET)
    sys.exit(0 if configured else 1)


# def name_to_parts(name):
Expand Down Expand Up @@ -90,11 +126,12 @@
# workload = itertools.islice(todo_files, index, index + chunk_size)
# parsed_workload = [name_to_parts(x.name) for x in workload]
# print(parsed_workload)
# commands = [f"python3 /home/ubuntu/crawl.py {domain} {rank}" for rank, domain in parsed_workload]
# commands = [f"python3 /home/ubuntu/crawl.py {domain} {rank}"
# for rank, domain in parsed_workload]
# ip_cmds = zip(instance_ips, commands)
# with multiprocessing.Pool(chunk_size) as p:
# p.map(run_cmd, ip_cmds)
# total += len(list(workload))
# index += chunk_size
# break
# print(total)
# print(total)
File renamed without changes.
1 change: 0 additions & 1 deletion ips.txt

This file was deleted.

2 changes: 1 addition & 1 deletion lint.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

# Files checked by both linters.  The earlier assignment that listed the
# pre-rename script names was immediately overwritten and has been dropped.
LOCAL_SCRIPTS="client-setup.py dispatcher-setup.py dispatch.py pgcrawl/*.py"

# $LOCAL_SCRIPTS is intentionally left unquoted: the value must be split on
# spaces into separate file arguments and the pgcrawl/*.py glob expanded.
pycodestyle $LOCAL_SCRIPTS
mypy --strict $LOCAL_SCRIPTS
4 changes: 2 additions & 2 deletions pgcrawl/consts.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pathlib


# Remote for this project, in SSH (scp-like) form.  The superseded HTTPS
# assignment that used to precede this line was a dead store and is removed.
GIT_URL = "git@github.com:brave-experiments/pagegraph-tranco-crawl.git"
# Remote for the pagegraph-crawl project (HTTPS form).
PG_CRAWL_GIT_URL = "https://github.com/brave/pagegraph-crawl"
# Relative path to the Brave nightly install script.
BRAVE_INSTALL_SCRIPT = pathlib.Path("./scripts/install-brave-nightly.sh")
# Default location of the file listing client IP addresses.
DEFAULT_IP_FILE = pathlib.Path("./ips.txt")
# Default install location on a client.  Kept as a plain string; the leading
# "~" is not expanded here.
DEFAULT_CLIENT_CODE_PATH = "~/pagegraph-tranco-crawl"
127 changes: 100 additions & 27 deletions pgcrawl/dispatch.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,122 @@
from io import StringIO
from ipaddress import ip_address, IPv4Address, IPv6Address
import json
from pathlib import Path

from fabric import Connection

from pgcrawl.consts import DEFAULT_IP_FILE
from pgcrawl.consts import GIT_URL
from pgcrawl.logging import log, error


def test_connection(user: str, ip: IPv4Address | IPv6Address,
quiet: bool = False) -> bool:
expected_user_dir = "/home/" + user
test_cmd = f"test -d {expected_user_dir}"
def run(conn: Connection, cmd: str, quiet: bool) -> bool:
    """Run `cmd` over `conn` and report whether it exited successfully.

    The command's stdout is always passed to `log`.  On a non-zero exit the
    captured stderr is passed to `error`.  Any exception raised while running
    the command is reported through `error` as well, and treated as failure.

    Args:
        conn: An open (or lazily-opening) connection to the client server.
        cmd: The shell command line to execute remotely.
        quiet: Forwarded to `log` for the informational messages.

    Returns:
        True if the command ran and exited with status 0, False otherwise.
    """
    log(f"* calling {conn.user}@{conn.host}: {cmd}", quiet)
    try:
        stdout_stream = StringIO()
        stderr_stream = StringIO()
        # warn=True asks for a result object on non-zero exit rather than an
        # exception (per Invoke's documented `run` semantics), so failures
        # are detected via `rs.exited` below.
        rs = conn.run(cmd, warn=True, hide=True, out_stream=stdout_stream,
                      err_stream=stderr_stream)
        log(stdout_stream.getvalue(), quiet)
        if rs.exited != 0:
            # Report what the command wrote to stderr.  stdout was already
            # logged above; it is only used here as a fallback so that a
            # failure with empty stderr is not reported as an empty message.
            error(stderr_stream.getvalue() or stdout_stream.getvalue())
            return False
        return True
    except Exception as e:
        # Boundary handler: connection/transport problems are reported and
        # converted into a plain failure result for the caller.
        error(str(e))
        return False


def activate_env_cmd_str(client_path: str) -> str:
    """Build the shell prefix that enters the client's environment.

    The returned command line changes to the home directory, then to
    `client_path`, sources `./bin/activate` there, and finally changes into
    the sub-directory named after the last component of `client_path`.
    Steps are chained with `&&` so a failing step stops the chain.

    Args:
        client_path: Path of the installation on the client.

    Returns:
        A single shell command string; callers append further commands to it.
    """
    checkout_dir = Path(client_path).name
    steps = (
        "cd ~/",
        f"cd {client_path}",
        ". ./bin/activate",
        f"cd {checkout_dir}",
    )
    return " && ".join(steps)


def test_connection(user: str, ip: IPv4Address | IPv6Address,
                    quiet: bool = False) -> bool:
    """Check that the client at `ip` is reachable as `user`.

    The check runs `test -d /home/<user>` on the client; it makes no changes
    there.

    Args:
        user: Account name used for the SSH connection.
        ip: Address of the client server.
        quiet: Forwarded to `log` for the informational messages.

    Returns:
        True if the remote command succeeded, False otherwise.
    """
    conn = Connection(host=str(ip), user=user)
    expected_user_dir = "/home/" + user
    test_cmd = f"test -d {expected_user_dir}"
    try:
        reachable = run(conn, test_cmd, quiet)
    finally:
        # Close on every path; previously the connection was only closed
        # when the check succeeded.
        conn.close()
    if reachable:
        log(" connected successfully!", quiet)
    return reachable


def check_client_code(user: str, ip: IPv4Address | IPv6Address,
                      client_path: str,
                      quiet: bool = False) -> bool:
    """Report whether `client_path` exists as a directory on the client.

    Args:
        user: Account name used for the SSH connection.
        ip: Address of the client server.
        client_path: Directory on the client to look for.
        quiet: Forwarded to `log` for the informational messages.

    Returns:
        True if `test -d <client_path>` succeeded remotely, False otherwise.
    """
    connection = Connection(host=str(ip), user=user)
    is_present = run(connection, f"test -d {client_path}", quiet)
    connection.close()

    if not is_present:
        log(" client code does not seem present", quiet)
        return False
    log(" Looks already installed!", quiet)
    return True


def delete_client_code(user: str, ip: IPv4Address | IPv6Address,
                       client_path: str,
                       quiet: bool = False) -> bool:
    """Recursively remove `client_path` on the client.

    Args:
        user: Account name used for the SSH connection.
        ip: Address of the client server.
        client_path: Path on the client to remove with `rm -Rf`.
        quiet: Forwarded to `log` for the informational messages.

    Returns:
        True if the remote `rm` command succeeded, False otherwise.
    """
    connection = Connection(host=str(ip), user=user)
    was_removed = run(connection, f"rm -Rf {client_path}", quiet)
    connection.close()

    if not was_removed:
        log(" error when deleting", quiet)
        return False
    log(" installed deleted!", quiet)
    return True


def install_client_code(user: str, ip: IPv4Address | IPv6Address,
                        client_path: str,
                        quiet: bool = False) -> bool:
    """Install this project, inside a fresh virtualenv, on the client.

    Runs one chained remote command that creates a virtualenv, activates it,
    clones `GIT_URL` into a same-named sub-directory of the virtualenv, and
    installs the requirements with pip.  The steps are joined with `&&`, so
    the first failing step aborts the rest.

    Only the final component of `client_path` is used, as a relative
    directory name, so the install location depends on the directory the
    remote shell starts in.

    Args:
        user: Account name used for the SSH connection.
        ip: Address of the client server.
        client_path: Install path; its last component names the directories.
        quiet: Forwarded to `log` for the informational messages.

    Returns:
        True if every remote step succeeded, False otherwise.
    """
    # NOTE: a leftover stub `return True` used to sit at the top of this
    # function and made everything below unreachable; it has been removed.
    intended_dest = Path(client_path).name
    commands = [
        f"python3 -m venv {intended_dest}",
        f"cd {intended_dest}",
        ". ./bin/activate",
        f"git clone {GIT_URL} {intended_dest}",
        f"cd ./{intended_dest}",
        "pip3 install -r requirements.txt",
    ]
    combined_cmd = " && ".join(commands)
    conn = Connection(host=str(ip), user=user)
    try:
        installed = run(conn, combined_cmd, quiet)
    finally:
        conn.close()
    if installed:
        log(" installed successfully!", quiet)
    else:
        log(" some error?", quiet)
    return installed


def read_ips(path: None | Path = None,
quiet: bool = False) -> list[IPv4Address | IPv6Address]:
if path:
ip_path = path
else:
ip_path = DEFAULT_IP_FILE
lines = ip_path.read_text().split("\n")
ips = []
for a_line in lines:
try:
ips.append(ip_address(a_line.strip()))
except ValueError:
log(f"Ignoring line: {a_line}", quiet)
return ips
def setup_client_code(user: str, ip: IPv4Address | IPv6Address,
                      client_path: str,
                      quiet: bool = False) -> bool:
    """Run the project's `client-setup.py` script on the client.

    The remote command first enters the installed environment (see
    `activate_env_cmd_str`) and then executes `./client-setup.py`, passing
    `--quiet` through when `quiet` is set.

    Args:
        user: Account name used for the SSH connection.
        ip: Address of the client server.
        client_path: Path of the installation on the client.
        quiet: Forwarded to `log` and to the remote setup script.

    Returns:
        True if the remote setup command succeeded, False otherwise.
    """
    log(f"- attempting to setup project at {client_path}.", quiet)
    conn = Connection(host=str(ip), user=user)
    setup_cmd = activate_env_cmd_str(client_path)
    setup_cmd += " && ./client-setup.py"
    if quiet:
        setup_cmd += " --quiet"
    try:
        succeeded = run(conn, setup_cmd, quiet)
    finally:
        # Close on every path; previously the connection was left open
        # whenever the setup succeeded.
        conn.close()
    if not succeeded:
        log("! but an error occurred!", quiet)
    return succeeded

0 comments on commit efe2f07

Please sign in to comment.