diff --git a/.gitignore b/.gitignore index e2fcd9b..b6420f2 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,7 @@ cython_debug/ # Mac System .DS_Store + +app-*/ +spark-* +/column_name_change/ diff --git a/README.md b/README.md index 6d98db3..4b48f08 100644 --- a/README.md +++ b/README.md @@ -1,57 +1,125 @@ # Distributed Downloader + MPI-based distributed downloading tool for retrieving data from diverse domains. ## Background -This MPI-based distributed downloader was initially designed for the purpose of downloading all images from the monthly [GBIF occurrence snapshot](https://www.gbif.org/occurrence-snapshots). The overall setup is general enough that it could be transformed into a functional tool beyond just our use; it should work on any list of URLs. We chose to build this tool instead of using something like [img2dataset](https://github.com/rom1504/img2dataset) to better avoid overloading source servers (GBIF documents approximately 200M images across 545 servers) and have more control over the final dataset construction and metadata management (e.g., using `HDF5` as discussed in [issue #1](https://github.com/Imageomics/distributed-downloader/issues/1)). - - -## How to Use - -`distributed-downloader` utilizes multiple nodes on a High Performance Computing (HPC) system (specifically, an HPC with `slurm` workload manager) to download a collection of images specified in a given tab-delimited text file. There are three manual steps to get the downloader running as designed; the first two function as a preprocessing step (to be done once with the initial file), and the third initiates the download (this step may be run multiple times for pre-established periods and each will pick up where the last left off). +This MPI-based distributed downloader was initially designed for the purpose of downloading all images from the +monthly [GBIF occurrence snapshot](https://www.gbif.org/occurrence-snapshots). The overall setup is general enough that +it could be transformed into a functional tool beyond just our use; it should work on any list of URLs. We chose to +build this tool instead of using something like [img2dataset](https://github.com/rom1504/img2dataset) to better avoid +overloading source servers (GBIF documents approximately 200M images across 545 servers) and have more control over the +final dataset construction and metadata management (e.g., using `HDF5` as discussed +in [issue #1](https://github.com/Imageomics/distributed-downloader/issues/1)). -1. The first step is to run the file through `src/server_prep.py`. This includes partitioning the dataset by server to generate batches of 10K URLs per server. Servers are determined by the URL in the input file. Additionally, it adds a UUID to each entry in the file to ensure preservation of provenance throughout the download and processing and beyond. This processing is still GBIF occurrence snapshot-specific in that it includes filters on the input file by media type (`StillImage`), checks for missing `gbifID`, and checks that the record indeed contains an image (through the `format` column). - -2. After the partitioning and filtering, `MPI_download_prep.py` must be run to establish the rate limits (by server) for the download. An "average" rate limit is established and then scaled based on the number of batches/simultaneous downloads per server (to avoid overloading a server while running simultaneous downloads). After the download is initialized, manual adjustments can be made based on results. 
Additionally, if a server returns any retry error (`429, 500, 501, 502, 503, 504`), the request rate for that server is reduced.
-
-3. Finally, `submitter.py` is run with the path to the `.env` file for various download settings and paths. This can be run for set periods of time and will restart where it has left off on the next run. Timing for batches is set in the `slurm` scripts passed through the `.env`.
+## Installation Instructions
-If you want the downloader to ignore some of the servers, you can add them to the `ignored_servers.csv` file. Then you need to rerun the `MPI_download_prep.py` script to update the schedules for the changes to take effect.
+### Conda installation
-### Running on other systems
+1. Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
+2. Create a new conda environment:
+   ```commandline
+   conda env create -f environment.yaml --solver=libmamba -y
+   ```
-The parameters for step 3 can all be set in the configuration file. This includes information about your HPC account and paths to various files, as well as distribution of work and download settings; be sure to fill in your information.
-The configuration file (`config/hpc.env`) should be in this location relative to the root of the directory from which these files are being run.
### Pip installation
-Note that the current default is to download images such that the longest side is 720 pixels. The original and resized sizes are recorded in the metadata; the aspect ratio is preserved when resizing images.
+1. Install Python 3.10 or higher
+2. Install MPI. Any MPI implementation should work; tested with OpenMPI and IntelMPI. Installation instructions can be found on the official websites:
+   - [OpenMPI](https://docs.open-mpi.org/en/v5.0.x/installing-open-mpi/quickstart.html)
+   - [IntelMPI](https://www.intel.com/content/www/us/en/docs/mpi-library/developer-guide-linux/2021-6/installation.html)
+3. Install the required package:
+   - For general use:
+     ```commandline
+     pip install git+https://github.com/Imageomics/distributed-downloader
+     ```
+   - For development:
+     ```commandline
+     pip install -e .[dev]
+     ```
-The provided `slurm` scripts for running steps 1 and 2 (`scripts/server_downloading_prep.slurm` and `scripts/server_profiling.slurm`) must have the account info changed at the top of their files (`#SBATCH --account=`). These are each only run once at the start of the project
+## How to Use
+`distributed-downloader` utilizes multiple nodes on a High Performance Computing (HPC) system (specifically, an HPC with a `slurm` workload manager) to download a collection of images specified in a given tab-delimited text file.
-## Note on files
+### Main script
-`resize_mpi` (`py` and `slurm`) and `resizer_scheduler.py` are scripts intended to resize the images after download. For instance, in the case that the initial download size is set higher than intended, these can be used to adjust the size within the given structure and repackage it. They have not been generalized to fit in with the remaining package infrastructure and are simply extra tools that we used; they may be generalized in the future.
+There is one manual step to get the downloader running as designed: call the function `download_images` from the `distributed_downloader` package with the `config_path` as an argument. This will initialize the file structure in the output folder, partition the input file, profile the servers for their possible download speed, and start downloading images. If downloading didn't finish, you can call the same function with the same `config_path` argument to continue downloading.
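For reference, a minimal driver sketch is shown below. It assumes only what is stated above: `download_images` is importable from the `distributed_downloader` package and takes the config path as its argument; the config file path used here is a hypothetical example.

```python
# Minimal sketch of a driver script (assumed usage based on the description
# above): download_images takes the path to the YAML config as its argument.
from distributed_downloader import download_images

# Hypothetical path to a filled-in copy of config/example_config.yaml
config_path = "config/my_config.yaml"

# Initializes the output structure, partitions the input file, profiles the
# servers, and starts downloading; re-running with the same config_path
# resumes an unfinished download.
download_images(config_path)
```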
The downloader has two logging profiles:
-- "INFO" - logs only the most important information, for example when a batch is started and finished. It also logs out any error that occurred during download, image decoding, or writing batch to the filesystem
-- "DEBUG" - logs all information, for example logging start and finish of each downloaded image.
-## Installation Instructions
-1. Install Python 3.10 or higher
-2. Install MPI, any MPI should work, tested with OpenMPI and IntelMPI.
-3. Install Parallel HDF5, tested with version 1.12.2
-4. Install/Update pip, setuptools, and wheel
-   ```
-   pip install -U wheel setuptools pip Cython
-   ```
-5. Install h5py:
-   ```
-   export CC=/path/to/mpicc
-   export HDF5_MPI="ON"
-   export HDF5_DIR=/path/to/hdf5
-   pip install --no-cache-dir --no-binary=h5py h5py
-   ```
-6. Install required packages:
-   ```
-   pip install -r requirements.txt
-   ```
+- `INFO` - logs only the most important information, for example when a batch is started and finished. It also logs any error that occurred during download, image decoding, or writing a batch to the filesystem.
+- `DEBUG` - logs all information, for example the start and finish of each downloaded image.
+
+### Tools script
+
+After downloading is finished, you can use the `tools` package to perform various operations on the downloaded images. To do this, call the function `apply_tools` from the `distributed_downloader` package with the `config_path` and `tool_name` as arguments. The following tools are available:
+
+- `resize` - resizes images to a new size
+- `image_verification` - verifies images by checking whether they are corrupted
+- `duplication_based` - removes duplicate images
+- `size_based` - removes images that are too small
+
+You can also add your own tool; the instructions are in the section below.
+
+### Creating a new tool
+
+You can add your own tool by creating three classes and registering them with the respective decorators.
+
+- Each tool's output will be saved in a separate folder in `{config.output_structure.tools_folder}/{tool_name}`.
+- There are three steps in the tool pipeline: `filter`, `scheduler`, and `runner`:
+  - `filter` - selects the images that should be processed by the tool and writes them to CSV files
+  - `scheduler` - creates a schedule for processing the images with MPI
+  - `runner` - processes the images using MPI
+- Each step should be implemented in a separate class.
+- The tool name should be the same across all classes.
+- Each tool class should inherit from the `ToolsBase` class.
+- Each tool class should have a `run` method that will be called by the main script.
+- Each class should be registered with the decorator from its respective package (`FilterRegister` from `filters`, etc.).
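To make the checklist above concrete, here is a minimal, hypothetical skeleton of a custom tool. Only `ToolsBase`, `FilterRegister`, and the `run` method are named in this README; the other decorator names (`SchedulerRegister`, `RunnerRegister`), the import paths, and the decorator signature are assumptions made for illustration, so check the `distributed_downloader.tools` source for the actual names.

```python
# Hypothetical sketch of a custom "blur_based" tool, under the assumptions
# stated above: one class per pipeline step, all registered under the same
# tool name and inheriting from ToolsBase.
from distributed_downloader.tools import ToolsBase                     # assumed import path
from distributed_downloader.tools.filters import FilterRegister        # assumed import path
from distributed_downloader.tools.schedulers import SchedulerRegister  # assumed name/path
from distributed_downloader.tools.runners import RunnerRegister        # assumed name/path

TOOL_NAME = "blur_based"  # hypothetical tool name, shared by all three classes


@FilterRegister(TOOL_NAME)
class BlurFilter(ToolsBase):
    def run(self) -> None:
        # Select the images this tool should process and write them to CSV files
        ...


@SchedulerRegister(TOOL_NAME)
class BlurScheduler(ToolsBase):
    def run(self) -> None:
        # Turn the filter's CSV output into a schedule for the MPI workers
        ...


@RunnerRegister(TOOL_NAME)
class BlurRunner(ToolsBase):
    def run(self) -> None:
        # Process the scheduled images under MPI and write results to the tool's folder
        ...
```

Once registered, such a tool would be invoked like the built-in ones, e.g. `apply_tools(config_path, "blur_based")`.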
+
+## Rules for scripts
+
+All scripts can expect the following custom environment variables to be set; tool-specific variables are only initialized when the respective tool is called:
+
+- General parameters
+  - `CONFIG_PATH`
+  - `ACCOUNT`
+  - `PATH_TO_INPUT`
+  - `PATH_TO_OUTPUT`
+  - `OUTPUT_URLS_FOLDER`
+  - `OUTPUT_LOGS_FOLDER`
+  - `OUTPUT_IMAGES_FOLDER`
+  - `OUTPUT_SCHEDULES_FOLDER`
+  - `OUTPUT_PROFILES_TABLE`
+  - `OUTPUT_IGNORED_TABLE`
+  - `OUTPUT_INNER_CHECKPOINT_FILE`
+  - `OUTPUT_TOOLS_FOLDER`
+- Specific to the downloader
+  - `DOWNLOADER_NUM_DOWNLOADS`
+  - `DOWNLOADER_MAX_NODES`
+  - `DOWNLOADER_WORKERS_PER_NODE`
+  - `DOWNLOADER_CPU_PER_WORKER`
+  - `DOWNLOADER_HEADER`
+  - `DOWNLOADER_IMAGE_SIZE`
+  - `DOWNLOADER_LOGGER_LEVEL`
+  - `DOWNLOADER_BATCH_SIZE`
+  - `DOWNLOADER_RATE_MULTIPLIER`
+  - `DOWNLOADER_DEFAULT_RATE_LIMIT`
+- Specific to tools
+  - `TOOLS_NUM_WORKERS`
+  - `TOOLS_MAX_NODES`
+  - `TOOLS_WORKERS_PER_NODE`
+  - `TOOLS_CPU_PER_WORKER`
+  - `TOOLS_THRESHOLD_SIZE`
+  - `TOOLS_NEW_RESIZE_SIZE`
diff --git a/config/example_config.yaml b/config/example_config.yaml
new file mode 100644
index 0000000..d5ba7ca
--- /dev/null
+++ b/config/example_config.yaml
@@ -0,0 +1,64 @@
+account: "account_name" # Account name for the cluster
+path_to_input: "path_to_input_file" # Path to the input file with the list of servers
+path_to_output_folder: "path_to_output_folder" # Path to the output folder
+
+scripts:
+  # Wrapper scripts to submit jobs to the cluster
+  general_submitter: "path_to_general_submitter_script.sh"
+  tools_submitter: "path_to_tools_submitter_script.sh"
+  mpi_submitter: "path_to_mpi_submitter_script.sh"
+  schedule_creator_submitter: "path_to_schedule_creator_submitter_script.sh"
+  # Cluster job's scripts
+  initialization_script: "path_to_initialization_script.slurm"
+  profiling_script: "path_to_profiling_script.slurm"
+  schedule_creation_script: "path_to_schedule_creation_script.slurm"
+  verify_script: "path_to_verify_script.slurm"
+  download_script: "path_to_download_script.slurm"
+  # tools scripts
+  tools_filter_script: "path_to_tools_filter_script.slurm"
+  tools_scheduling_script: "path_to_tools_scheduling_script.slurm"
+  tools_worker_script: "path_to_tools_worker_script.slurm"
+  tools_verification_script: "path_to_tools_verification_script.slurm"
+
+# Rules for the schedule creation
+# They determine how many simultaneous downloader instances can be run on the same server
+# Rules are based on the number of batches required to be downloaded from the server
+# Rule is: key - number of batches, value - number of instances; if a server has more than key batches, value instances can be run
+# A server with 0 batches is considered downloaded and is ignored
+# Default value is 1
+# Order of the rules does not matter
+schedule_rules:
+  1: 1
+
+# Structure of the output folder that will be created automatically
+output_structure:
+  urls_folder: "servers_batched" # Folder where the servers will be split into batches
+  logs_folder: "logs" # Folder for the logs
+  images_folder: "downloaded_images" # Folder for the downloaded images
+  schedules_folder: "schedules" # Folder for the schedules
+  profiles_table: "servers_profiles.csv" # Table with the server profiles
+  ignored_table: "ignored_servers.csv" # Table with the servers to be ignored; you can find an example in examples/ignored_servers.csv
+  inner_checkpoint_file: "inner_checkpoint.yaml" # Inner checkpoint file
+  tools_folder: "tools" # Folder for the tools
+
+# Parameters for the downloader
+suppress_unchanged_error: False # Suppress the error if two consecutive downloads do not change the number of batches completed
+downloader_parameters:
+  num_downloads: 0 # Number of downloads to be performed
+  max_nodes: 0 # Maximum number of nodes to be used
+  workers_per_node: 0 # Number of workers per node
+  cpu_per_worker: 0 # Number of CPUs per worker
+  header: "" # Header for the requests
+  image_size: 0 # Size of the image to be downloaded
+  logger_level: "INFO" # Logger level
+  batch_size: 0 # Batch size, default is 10000
+  rate_multiplier: 1 # Rate multiplier for the rate limit
+  default_rate_limit: 10 # Default rate limit for the profiler
+
+tools_parameters:
+  num_workers: 0
+  max_nodes: 0
+  workers_per_node: 0
+  cpu_per_worker: 0
+  threshold_size: 224 # Threshold size for the images; images smaller than this value will be filtered out
+  new_resize_size: 720 # New size for the images in the resize tool
diff --git a/config/hpc.env b/config/hpc.env
deleted file mode 100644
index b3bc9c7..0000000
--- a/config/hpc.env
+++ /dev/null
@@ -1,37 +0,0 @@
-# If you use a different High Performance Computing (HPC) system, update the environment variables below accordingly.
-
-# root project information
-ACCOUNT=pas2136
-GBIF_CACHE_ROOT=/fs/scratch/PAS2136/gbif/data
-PROCESSED_DATA_ROOT_LOCAL=/users/PAS2119/andreykopanev/gbif/processed
-PROCESSED_DATA_ROOT=/fs/scratch/PAS2136/gbif/processed
-TIME_STAMP=2024-05-01
-
-# slurm and python-slurm coordination scripts
-MPI_SUBMITTER_SCRIPT=/users/PAS2119/andreykopanev/distributed-downloader/scripts/submit_mpi_download.sh
-DOWNLOADING_SCRIPT=/users/PAS2119/andreykopanev/distributed-downloader/scripts/server_downloading.slurm
-VERIFYING_SCRIPT=/users/PAS2119/andreykopanev/distributed-downloader/scripts/server_verifying.slurm
-
-# download and log locations
-DOWNLOAD_DIR=dataset
-DOWNLOADER_URLS_FOLDER=servers_batched
-DOWNLOADER_LOGS_FOLDER=logs
-DOWNLOADER_IMAGES_FOLDER=downloaded_images
-DOWNLOADER_SCHEDULES_FOLDER=schedules
-DOWNLOADER_PROFILES_PATH=servers_profiles.csv
-DOWNLOADER_IGNORED_PATH=ignored_servers.csv
-
-# distribution of work settings
-DOWNLOADER_MAX_NODES=7
-DOWNLOADER_WORKERS_PER_NODE=6
-DOWNLOADER_CPU_PER_TASK=6
-
-# download settings
-# make sure to leave the space in the HEADER string after the colon (": ")
-HEADER="User-Agent: Imageomics Institute (https://imageomics.org; imageomics-it@osu.edu)"
-
-# IMAGE_SIZE is the max side-length of an image in pixels
-IMAGE_SIZE=720
-
-# LOGGER_LEVEL is the logging level for the downloader
-LOGGER_LEVEL=INFO
diff --git a/environment.yaml b/environment.yaml
new file mode 100644
index 0000000..e9c9913
--- /dev/null
+++ b/environment.yaml
@@ -0,0 +1,53 @@
+name: distributed-downloader
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - openmpi
+  - python
+  - uv
+  - opencv
+  - pyspark
+  - attrs
+  - brotli
+  - certifi
+  - charset-normalizer
+  - cramjam
+  - cython
+  - exceptiongroup
+  - fsspec
+  - hatchling
+  - idna
+  - inflate64
+  - iniconfig
+  - mpi4py
+  - multivolumefile
+  - numpy
+  - packaging
+  - pandas
+  - pathspec
+  - pillow
+  - pip
+  - pluggy
+  - psutil
+  - py4j
+  - pyarrow
+  - pybcj
+  - pycryptodomex
+  - pyppmd
+  - pytest
+  - python-dateutil
+  - python-dotenv
+  - pytz
+  - pyyaml
+  - pyzstd
+  - requests
+  - setuptools
+  - six
+  - texttable
+  - tomli
+  - trove-classifiers
+  - typing-extensions
+  - tzdata
+  - urllib3
+  - wheel
diff --git a/examples/ignored_servers.csv b/examples/ignored_servers.csv
new file mode 100644
index 0000000..fa9b8d3
--- /dev/null
+++ b/examples/ignored_servers.csv
@@ -0,0 +1,3 @@
+ServerName
+server_name_1 +server_name_2 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b56cad2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,48 @@ +[build-system] +requires = ["hatchling", "hatch-requirements-txt"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/distributed_downloader"] + +[project] +name = "distributed_downloader" +dynamic = ["dependencies", "version"] +authors = [ + { name = "Andrey Kopanev", email = "kopanev.1@osu.edu" }, + { name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" }, + { name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" }, +] +description = "A tool for downloading files from a list of URLs in parallel." +readme = "README.md" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] + +[tool.hatch.metadata.hooks.requirements_txt] +files = ["requirements.txt"] + +[project.optional-dependencies] +dev = ["pytest"] + +keywords = [ + "parallel", + "distributed", + "download", + "url", +] + +[project.urls] +Homepage = "https://github.com/Imageomics/distributed-downloader" +Repository = "https://github.com/Imageomics/distributed-downloader.git" +"Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues" + +[project.scripts] +distributed_downloader = "distributed_downloader.main:main" +distributed_downloader_tools = "distributed_downloader.tools:main" + +[tool.hatch.version] +path = "src/distributed_downloader/core/__about__.py" diff --git a/requirements.txt b/requirements.txt index 7fab27d..5c32703 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,30 +1,85 @@ -Brotli==1.1.0 -certifi==2024.6.2 +# This file was autogenerated by uv via the following command: +# uv pip compile - -o requirements.txt +attrs==24.1.0 +brotli==1.1.0 +certifi==2024.7.4 + # via + # requests charset-normalizer==3.3.2 -Cython==3.0.10 -h5py==3.11.0 -hdf5plugin==4.4.0 + # via + # requests +cramjam==2.8.3 +cython==3.0.11 +exceptiongroup==1.2.2 + # via pytest +fsspec==2024.6.1 +hatchling==1.25.0 idna==3.7 + # via + # requests inflate64==1.0.0 -mpi4py==3.1.6 +iniconfig==2.0.0 + # via pytest +mpi4py==4.0.0 multivolumefile==0.2.3 -numpy==1.26.4 -opencv-python==4.10.0.82 +numpy==2.0.1 + # via + # opencv-python + # pandas + # pyarrow +opencv-python==4.10.0.84 +packaging==24.1 + # via + # hatchling + # pytest pandas==2.2.2 -pillow==10.3.0 -psutil==5.9.8 +pathspec==0.12.1 + # via + # hatchling +pillow==10.4.0 +pip==24.2 +pluggy==1.5.0 + # via + # hatchling + # pytest +psutil==6.0.0 py4j==0.10.9.7 -py7zr==0.21.0 + # via + # pyspark +pyarrow==17.0.0 pybcj==1.0.2 pycryptodomex==3.20.0 pyppmd==1.1.0 pyspark==3.5.1 +pytest==8.3.2 python-dateutil==2.9.0.post0 + # via + # pandas python-dotenv==1.0.1 pytz==2024.1 -pyzstd==0.16.0 + # via + # pandas +pyyaml==6.0.1 +pyzstd==0.16.1 requests==2.32.3 +setuptools==72.1.0 six==1.16.0 + # via + # python-dateutil texttable==1.7.0 +tomli==2.0.1 + # via + # hatchling + # pytest +trove-classifiers==2024.7.2 + # via + # hatchling +typing-extensions==4.12.2 tzdata==2024.1 + # via + # pandas urllib3==2.2.2 + # via + # requests +wheel==0.44.0 +hatch-requirements-txt==0.4.1 diff --git a/scripts/general_submit.sh b/scripts/general_submit.sh new file mode 100755 index 0000000..6e27bc2 --- /dev/null +++ b/scripts/general_submit.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e + +SCRIPTS_DIR=$(dirname "$(realpath "$0")") +REPO_ROOT=$(dirname "$(realpath 
"${SCRIPTS_DIR}")") +export REPO_ROOT +logs_dir="${OUTPUT_LOGS_FOLDER}" +mkdir -p "${logs_dir}" + +# Check if any arguments were passed +if [ "$#" -eq 0 ]; then + echo "Usage: $0 script1 [dependencies...]" + exit 1 +fi + +script=$1 +if [ ! -f "$script" ]; then + echo "Error: File '$script' not found" +fi + +filename=$(basename "$script") +ext="${filename##*.}" +base_filename=$(basename "${filename}" ."${ext}") +dependencies=$(IFS=,; echo "${*:2}") + +# Submit the script to Slurm +if [ -z "${dependencies}" ]; then + sbatch \ + --output="${logs_dir}/${base_filename}.out" \ + --error="${logs_dir}/${base_filename}.err" \ + --account="${ACCOUNT}" \ + "${script}" + exit 0 +fi + +sbatch \ + --output="${logs_dir}/${base_filename}.out" \ + --error="${logs_dir}/${base_filename}.err" \ + --dependency=afterok:"${dependencies}" \ + --account="${ACCOUNT}" \ + "${script}" diff --git a/scripts/initialization.slurm b/scripts/initialization.slurm new file mode 100644 index 0000000..f416f1d --- /dev/null +++ b/scripts/initialization.slurm @@ -0,0 +1,17 @@ +#!/bin/bash +#SBATCH --job-name initialization +#SBATCH --nodes=4 +#SBATCH --time=00:30:00 + +set -e + +driver_memory="110G" +executor_memory="64G" + +module load spark/3.4.1 + +pbs-spark-submit \ + --driver-memory $driver_memory \ + --executor-memory $executor_memory \ + "${REPO_ROOT}/src/distributed_downloader/core/initialization.py" \ + > "${OUTPUT_LOGS_FOLDER}/initialization.log" diff --git a/scripts/profiling.slurm b/scripts/profiling.slurm new file mode 100644 index 0000000..23be541 --- /dev/null +++ b/scripts/profiling.slurm @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name profiling +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=3 +#SBATCH --time=00:05:00 + +set -e + +module load miniconda3/23.3.1-py310 +source "${REPO_ROOT}/.venv/bin/activate" +export PYARROW_IGNORE_TIMEZONE=1 + +python \ + "${REPO_ROOT}/src/distributed_downloader/core/fake_profiler.py" \ + > "${OUTPUT_LOGS_FOLDER}/profiler.log" diff --git a/scripts/resize_mpi.slurm b/scripts/resize_mpi.slurm deleted file mode 100644 index e181aa4..0000000 --- a/scripts/resize_mpi.slurm +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -#SBATCH --job-name resize_mpi -#SBATCH --nodes=10 -#SBATCH --ntasks-per-node=3 -#SBATCH --cpus-per-task=13 -#SBATCH --time=02:00:00 -#SBATCH --mem=0 -#SBATCH --account=pas2136 - -source "${REPO_ROOT}/config/hpc.env" - -module load intel/2021.10.0 -module load intelmpi/2021.10 -module load miniconda3/23.3.1-py310 -source "${REPO_ROOT}/.venv/bin/activate" -export PYARROW_IGNORE_TIMEZONE=1 - -export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0 - - -srun \ - --mpi=pmi2 \ - --nodes=10 \ - --ntasks-per-node=3 \ - --cpus-per-task=13 \ - --mem=0 \ - --output="${REPO_ROOT}/logs/resize/resize-%2t.log" \ - python \ - "${REPO_ROOT}/src/resize_mpi.py" diff --git a/scripts/schedule_creation.slurm b/scripts/schedule_creation.slurm new file mode 100644 index 0000000..7af2c39 --- /dev/null +++ b/scripts/schedule_creation.slurm @@ -0,0 +1,20 @@ +#!/bin/bash +#SBATCH --job-name schedule_creation +#SBATCH --time=00:05:00 + +set -e + +logs_dir="${OUTPUT_LOGS_FOLDER}" +mkdir -p "${logs_dir}" + +module load miniconda3/23.3.1-py310 +source "${REPO_ROOT}/.venv/bin/activate" +export PYARROW_IGNORE_TIMEZONE=1 + +python \ + "${REPO_ROOT}/src/distributed_downloader/core/MPI_download_prep.py" \ + > "${logs_dir}/MPI_downloader_prep.log" + +mv "${logs_dir}/${LOGS_BASE_FILENAME}.out" "${logs_dir}/current" +mv "${logs_dir}/${LOGS_BASE_FILENAME}.err" "${logs_dir}/current" +mv "${logs_dir}/MPI_downloader_prep.log" 
"${logs_dir}/current" diff --git a/scripts/server_downloading.slurm b/scripts/server_downloading.slurm index e656169..86742bc 100644 --- a/scripts/server_downloading.slurm +++ b/scripts/server_downloading.slurm @@ -1,9 +1,7 @@ #!/bin/bash #SBATCH --job-name mpi_downloader #SBATCH --mem=0 -#SBATCH --time=02:00:00 - -# ============================ PARAMETERS ====================================== +#SBATCH --time=03:00:00 if [ "$#" -eq 0 ]; then echo "Usage: $0 schedule# iteration_number" @@ -13,13 +11,8 @@ fi schedule=$1 iteration_number=$2 -# ================== DO NOT TOUCH BELOW THIS LINE ============================== -source "${REPO_ROOT}/config/hpc.env" - -input_path="${PROCESSED_DATA_ROOT}/${TIME_STAMP}/${DOWNLOAD_DIR}" - -schedule_path="${input_path}/${DOWNLOADER_SCHEDULES_FOLDER}/${schedule}" -logs_dir="${input_path}/${DOWNLOADER_LOGS_FOLDER}/${schedule}/${iteration_number}" +schedule_path="${OUTPUT_SCHEDULES_FOLDER}/current/${schedule}" +logs_dir="${OUTPUT_LOGS_FOLDER}/current/${schedule}/${iteration_number}" mkdir -p "${logs_dir}" module load intel/2021.10.0 @@ -27,30 +20,38 @@ module load intelmpi/2021.10 module load miniconda3/23.3.1-py310 source "${REPO_ROOT}/.venv/bin/activate" export PYARROW_IGNORE_TIMEZONE=1 - export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0 # memory limit per node: 177G + { - srun --mpi=pmi2 --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --output="${logs_dir}/MPI_multimedia_downloader.log" \ + srun \ + --mpi=pmi2 \ + --nodes=1 \ + --ntasks-per-node=1 \ + --cpus-per-task=1 \ + --output="${logs_dir}/MPI_downloader_verifier.log" \ + python \ + "${REPO_ROOT}/src/distributed_downloader/core/MPI_downloader_verifier.py" \ + "$schedule_path" +} && { + srun \ + --mpi=pmi2 \ + --nodes=1 \ + --ntasks-per-node=1 \ + --cpus-per-task=1 \ + --output="${logs_dir}/MPI_multimedia_downloader.log" \ python \ - "${REPO_ROOT}/src/MPI_multimedia_downloader_controller.py" \ - "$input_path" \ - "$schedule_path" \ - "$DOWNLOADER_MAX_NODES" \ - "$DOWNLOADER_WORKERS_PER_NODE" + "${REPO_ROOT}/src/distributed_downloader/core/MPI_multimedia_downloader_controller.py" \ + "$schedule_path" } && { srun \ --mpi=pmi2 \ --nodes="$DOWNLOADER_MAX_NODES" \ --ntasks-per-node="$DOWNLOADER_WORKERS_PER_NODE" \ - --cpus-per-task="$DOWNLOADER_CPU_PER_TASK" \ + --cpus-per-task="$DOWNLOADER_CPU_PER_WORKER" \ --mem=0 \ --output="${logs_dir}/MPI_multimedia_downloader-%2t.log" \ - python "${REPO_ROOT}/src/MPI_multimedia_downloader.py" \ - "$input_path" \ - "$schedule_path" \ - --header "$HEADER" \ - --img-size "$IMAGE_SIZE" \ - --logging-level "$LOGGER_LEVEL" + python "${REPO_ROOT}/src/distributed_downloader/core/MPI_multimedia_downloader.py" \ + "$schedule_path" } diff --git a/scripts/server_downloading_prep.slurm b/scripts/server_downloading_prep.slurm deleted file mode 100644 index e69cfa3..0000000 --- a/scripts/server_downloading_prep.slurm +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -#SBATCH --job-name download_prep -#SBATCH --time=00:05:00 -#SBATCH --account=pas2136 - -source "${REPO_ROOT}/config/hpc.env" - -input_path="${PROCESSED_DATA_ROOT}/${TIME_STAMP}/${DOWNLOAD_DIR}" - -logs_dir="${input_path}/${DOWNLOADER_LOGS_FOLDER}" -mkdir -p "${logs_dir}" - -module load intel/2021.10.0 -module load intelmpi/2021.10 -module load miniconda3/23.3.1-py310 -source "${REPO_ROOT}/.venv/bin/activate" -export PYARROW_IGNORE_TIMEZONE=1 - -# memory limit per node: 177G -srun \ - --mpi=pmi2 \ - --nodes=1 \ - --ntasks-per-node=1 \ - --cpus-per-task=1 \ - --output="${logs_dir}/MPI_downloader_prep.log" \ - python \ - 
"${REPO_ROOT}/src/MPI_download_prep.py" \ - "$input_path" \ - "$DOWNLOADER_MAX_NODES" \ - "$DOWNLOADER_WORKERS_PER_NODE" diff --git a/scripts/server_profiling.slurm b/scripts/server_profiling.slurm deleted file mode 100644 index bf37d05..0000000 --- a/scripts/server_profiling.slurm +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -#SBATCH --job-name test -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=3 -#SBATCH --time=00:05:00 -#SBATCH --account=pas2136 - -# ============================ PARAMETERS ====================================== -set -e - -batch_size=10000 -max_nodes=10 -max_worker_per_node=4 - -# ================== DO NOT TOUCH BELOW THIS LINE ============================== -source "${REPO_ROOT}/config/hpc.env" - -input_path="${PROCESSED_DATA_ROOT}/${TIME_STAMP}/${DOWNLOAD_DIR}" - -module load intel/2021.10.0 -module load intelmpi/2021.10 -module load hdf5/1.12.2 -module load miniconda3/23.3.1-py310 -source "${REPO_ROOT}/.venv/bin/activate" -export PYARROW_IGNORE_TIMEZONE=1 - -# export I_MPI_DEBUG=100 -# unset I_MPI_PMI_LIBRARY -# unset I_MPI_HYDRA_BOOTSTRAP -export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0 - -# memory limit per node: 177G -srun --mpi=pmi2 --nodes=1 --ntasks-per-node=1 --cpus-per-task=3 \ - python \ - "${REPO_ROOT}/src/MPI_server_profiler_controller.py" \ - "$input_path" \ - "$max_nodes" \ - "$max_worker_per_node" \ - "$batch_size" \ - > "${input_path}/${DOWNLOADER_LOGS_FOLDER}/MPI_server_profiler_controller.log" - -# srun \ -# --mpi=pmi2 \ -# --nodes=$max_nodes \ -# --ntasks-per-node=$max_worker_per_node \ -# --cpus-per-task=10 \ -# --output="${input_path}/${DOWNLOADER_LOGS_FOLDER}/MPI_server_profiler-%2t.log" \ -# python src/MPI_server_profiler.py \ -# "$input_path" \ -# "$batch_size" \ -# --header "$HEADER" \ -# --img-size "$IMAGE_SIZE" diff --git a/scripts/server_verifying.slurm b/scripts/server_verifying.slurm index 5006a79..d09e178 100644 --- a/scripts/server_verifying.slurm +++ b/scripts/server_verifying.slurm @@ -9,14 +9,9 @@ fi schedule=$1 iteration_number=$2 -recheck_flag=$3 -source "${REPO_ROOT}/config/hpc.env" - -input_path="${PROCESSED_DATA_ROOT}/${TIME_STAMP}/${DOWNLOAD_DIR}" - -schedule_path="${input_path}/${DOWNLOADER_SCHEDULES_FOLDER}/${schedule}" -logs_dir="${input_path}/${DOWNLOADER_LOGS_FOLDER}/${schedule}/${iteration_number}" +schedule_path="${OUTPUT_SCHEDULES_FOLDER}/current/${schedule}" +logs_dir="${OUTPUT_LOGS_FOLDER}/current/${schedule}/${iteration_number}" mkdir -p "${logs_dir}" module load intel/2021.10.0 @@ -24,31 +19,14 @@ module load intelmpi/2021.10 module load miniconda3/23.3.1-py310 source "${REPO_ROOT}/.venv/bin/activate" export PYARROW_IGNORE_TIMEZONE=1 - export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0 -if [ "$recheck_flag" = "--recheck" ]; then -# memory limit per node: 177G - srun \ - --mpi=pmi2 \ - --nodes=1 \ - --ntasks-per-node=1 \ - --cpus-per-task=1 \ - --output="${logs_dir}/MPI_downloader_verifier.log" \ - python \ - "${REPO_ROOT}/src/MPI_downloader_verifier.py" \ - "$input_path" \ - "$schedule_path" \ - "$recheck_flag" -else - srun \ - --mpi=pmi2 \ - --nodes=1 \ - --ntasks-per-node=1 \ - --cpus-per-task=1 \ - --output="${logs_dir}/MPI_downloader_verifier.log" \ - python \ - "${REPO_ROOT}/src/MPI_downloader_verifier.py" \ - "$input_path" \ - "$schedule_path" -fi +srun \ + --mpi=pmi2 \ + --nodes=1 \ + --ntasks-per-node=1 \ + --cpus-per-task=1 \ + --output="${logs_dir}/MPI_downloader_verifier.log" \ + python \ + "${REPO_ROOT}/src/distributed_downloader/core/MPI_downloader_verifier.py" \ + "$schedule_path" diff --git 
a/scripts/submit_mpi_download.sh b/scripts/submit_mpi_download.sh index 4b3159f..091f15e 100755 --- a/scripts/submit_mpi_download.sh +++ b/scripts/submit_mpi_download.sh @@ -2,7 +2,6 @@ SCRIPTS_DIR=$(dirname "$(realpath "$0")") REPO_ROOT=$(dirname "$(realpath "${SCRIPTS_DIR}")") -source "${REPO_ROOT}/config/hpc.env" export REPO_ROOT if [ "$#" -eq 0 ]; then @@ -14,14 +13,15 @@ script=$1 schedule=$2 iteration_number=$3 -logs_dir="${REPO_ROOT}/${DOWNLOADER_LOGS_FOLDER}/${schedule}/${iteration_number}" +logs_dir="${OUTPUT_LOGS_FOLDER}/current/${schedule}/${iteration_number}" mkdir -p "${logs_dir}" +filename=$(basename "$script") +ext="${filename##*.}" +base_filename=$(basename "${filename}" ."${ext}") + if [ "$4" != "" ] && [ "$4" != "--recheck" ]; then dependency=$4 - filename=$(basename "$script") - ext="${filename##*.}" - base_filename=$(basename "${filename}" ."${ext}") # Submit the script to Slurm sbatch \ @@ -30,22 +30,18 @@ if [ "$4" != "" ] && [ "$4" != "--recheck" ]; then --dependency=afterany:"${dependency}" \ --nodes="${DOWNLOADER_MAX_NODES}" \ --ntasks-per-node="${DOWNLOADER_WORKERS_PER_NODE}" \ - --cpus-per-task="${DOWNLOADER_CPU_PER_TASK}" \ + --cpus-per-task="${DOWNLOADER_CPU_PER_WORKER}" \ --account="${ACCOUNT}" \ "${script}" "${schedule}" "${iteration_number}" "$5" exit 0 else - filename=$(basename "$script") - ext="${filename##*.}" - base_filename=$(basename "${filename}" ."${ext}") - # Submit the script to Slurm sbatch \ --output="${logs_dir}/${base_filename}.out" \ --error="${logs_dir}/${base_filename}.err" \ --nodes="${DOWNLOADER_MAX_NODES}" \ --ntasks-per-node="${DOWNLOADER_WORKERS_PER_NODE}" \ - --cpus-per-task="${DOWNLOADER_CPU_PER_TASK}" \ + --cpus-per-task="${DOWNLOADER_CPU_PER_WORKER}" \ --account="${ACCOUNT}" \ "${script}" "${schedule}" "${iteration_number}" "$4" exit 0 diff --git a/scripts/submit_schedule_creator.sh b/scripts/submit_schedule_creator.sh new file mode 100755 index 0000000..5fd09b5 --- /dev/null +++ b/scripts/submit_schedule_creator.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +set -e + +SCRIPTS_DIR=$(dirname "$(realpath "$0")") +REPO_ROOT=$(dirname "$(realpath "${SCRIPTS_DIR}")") +export REPO_ROOT +logs_dir="${OUTPUT_LOGS_FOLDER}" +mkdir -p "${logs_dir}" + +if [ "$#" -eq 0 ]; then + echo "Usage: $0 script1 [dependencies...]" + exit 1 +fi + +script=$1 +if [ ! 
-f "$script" ]; then + echo "Error: File '$script' not found" +fi + +filename=$(basename "$script") +ext="${filename##*.}" +base_filename=$(basename "${filename}" ."${ext}") +export LOGS_BASE_FILENAME=$base_filename +dependencies=$(IFS=,; echo "${*:2}") + +# Submit the script to Slurm +if [ -z "${dependencies}" ]; then + sbatch \ + --output="${logs_dir}/${base_filename}.out" \ + --error="${logs_dir}/${base_filename}.err" \ + --nodes="${DOWNLOADER_MAX_NODES}" \ + --ntasks-per-node="${DOWNLOADER_WORKERS_PER_NODE}" \ + --cpus-per-task="${DOWNLOADER_CPU_PER_WORKER}" \ + --account="${ACCOUNT}" \ + "${script}" + exit 0 +fi + +sbatch \ + --output="${logs_dir}/${base_filename}.out" \ + --error="${logs_dir}/${base_filename}.err" \ + --dependency=afterany:"${dependencies}" \ + --nodes="${DOWNLOADER_MAX_NODES}" \ + --ntasks-per-node="${DOWNLOADER_WORKERS_PER_NODE}" \ + --cpus-per-task="${DOWNLOADER_CPU_PER_WORKER}" \ + --account="${ACCOUNT}" \ + "${script}" diff --git a/scripts/submit_slurm.sh b/scripts/submit_slurm.sh deleted file mode 100755 index 59d64b5..0000000 --- a/scripts/submit_slurm.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -SCRIPTS_DIR=$(dirname "$(realpath "$0")") -REPO_ROOT=$(dirname "$(realpath "${SCRIPTS_DIR}")") -source "${REPO_ROOT}/config/hpc.env" -export REPO_ROOT -logs_dir="${REPO_ROOT}/${DOWNLOADER_LOGS_FOLDER}" -mkdir -p "${logs_dir}" - -# Check if any arguments were passed -if [ "$#" -eq 0 ]; then - echo "Usage: $0 script1 [script2 script3 ...]" - exit 1 -fi - -# Loop over all arguments -for script in "$@"; do - # Check if the file exists - if [ ! -f "$script" ]; then - echo "Error: File '$script' not found" - continue - fi - filename=$(basename "$script") - ext="${filename##*.}" - base_filename=$(basename "${filename}" ."${ext}") - - # Submit the script to Slurm - sbatch --output="${logs_dir}/${base_filename}.out" --error="${logs_dir}/${base_filename}.err" "${script}" -done diff --git a/scripts/tools_filter.slurm b/scripts/tools_filter.slurm new file mode 100644 index 0000000..a3be2c1 --- /dev/null +++ b/scripts/tools_filter.slurm @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --job-name tool_filter +#SBATCH --mem=0 + +if [ "$#" -eq 0 ]; then + echo "Usage: $0 tool_name" + exit 1 +fi + +tool_name=$1 + +logs_dir="${OUTPUT_TOOLS_LOGS_FOLDER}" +mkdir -p "$logs_dir" + +driver_memory="110G" +executor_memory="64G" + +module load spark/3.4.1 +module load miniconda3/23.3.1-py310 +source "${REPO_ROOT}/.venv/bin/activate" +export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader" + +pbs-spark-submit \ + --driver-memory $driver_memory \ + --executor-memory $executor_memory \ + "${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \ + "${tool_name}" \ + > "${logs_dir}/tool_filter.log" diff --git a/scripts/tools_scheduler.slurm b/scripts/tools_scheduler.slurm new file mode 100644 index 0000000..e4fb6a2 --- /dev/null +++ b/scripts/tools_scheduler.slurm @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --job-name tool_scheduler +#SBATCH --mem=0 +#SBATCH --time=00:05:00 + +if [ "$#" -eq 0 ]; then + echo "Usage: $0 tool_name" + exit 1 +fi + +tool_name=$1 + +logs_dir="${OUTPUT_TOOLS_LOGS_FOLDER}" +mkdir -p "$logs_dir" + +module load intel/2021.10.0 +module load intelmpi/2021.10 +module load miniconda3/23.3.1-py310 +source "${REPO_ROOT}/.venv/bin/activate" +export PYARROW_IGNORE_TIMEZONE=1 +export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0 +export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader" + +srun \ + --mpi=pmi2 \ + --nodes=1 \ + 
--ntasks-per-node=1 \ + --cpus-per-task=1 \ + --mem=0 \ + --output="${logs_dir}/tool_scheduler.log" \ + python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}" diff --git a/scripts/tools_submit.sh b/scripts/tools_submit.sh new file mode 100755 index 0000000..adc3d99 --- /dev/null +++ b/scripts/tools_submit.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +set -e + +SCRIPTS_DIR=$(dirname "$(realpath "$0")") +REPO_ROOT=$(dirname "$(realpath "${SCRIPTS_DIR}")") +export REPO_ROOT +logs_dir="${OUTPUT_TOOLS_LOGS_FOLDER}" +mkdir -p "${logs_dir}" + +# Check if any arguments were passed +if [ "$#" -eq 0 ]; then + echo "Usage: $0 script1 tool_name [dependency] [--spark]" + exit 1 +fi + +script=$1 +if [ ! -f "$script" ]; then + echo "Error: File '$script' not found" + exit 1 +fi + +filename=$(basename "$script") +ext="${filename##*.}" +base_filename=$(basename "${filename}" ."${ext}") +tool_name=$2 +dependency="" +spark_flag="" + +if [ "$3" == "--spark" ]; then + spark_flag="--spark" + dependency="$4" +else + dependency="$3" + if [ "$4" == "--spark" ]; then + spark_flag="--spark" + fi +fi + +sbatch_cmd="sbatch --output=\"${logs_dir}/${base_filename}.out\" --error=\"${logs_dir}/${base_filename}.err\" --nodes=${TOOLS_MAX_NODES}" + +if [ -n "$dependency" ]; then + sbatch_cmd+=" --dependency=afterany:${dependency}" +fi + +if [ -z "$spark_flag" ]; then + sbatch_cmd+=" --ntasks-per-node=${TOOLS_WORKERS_PER_NODE} --cpus-per-task=${TOOLS_CPU_PER_WORKER}" +fi + +sbatch_cmd+=" --account=${ACCOUNT} ${script} ${tool_name}" + +eval "$sbatch_cmd" diff --git a/scripts/tools_verifier.slurm b/scripts/tools_verifier.slurm new file mode 100644 index 0000000..98ca024 --- /dev/null +++ b/scripts/tools_verifier.slurm @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --job-name tool_verifier +#SBATCH --mem=0 +#SBATCH --time=00:05:00 + +if [ "$#" -eq 0 ]; then + echo "Usage: $0 tool_name" + exit 1 +fi + +tool_name=$1 + +logs_dir="${OUTPUT_TOOLS_LOGS_FOLDER}" +mkdir -p "$logs_dir" + +module load intel/2021.10.0 +module load intelmpi/2021.10 +module load miniconda3/23.3.1-py310 +source "${REPO_ROOT}/.venv/bin/activate" +export PYARROW_IGNORE_TIMEZONE=1 +export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0 +export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader" + +srun \ + --mpi=pmi2 \ + --nodes=1 \ + --ntasks-per-node=1 \ + --cpus-per-task=1 \ + --mem=0 \ + --output="${logs_dir}/tool_verifier.log" \ + python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}" diff --git a/scripts/tools_worker.slurm b/scripts/tools_worker.slurm new file mode 100644 index 0000000..2ee2662 --- /dev/null +++ b/scripts/tools_worker.slurm @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --job-name tool_worker +#SBATCH --mem=0 +#SBATCH --time=03:00:00 + +if [ "$#" -eq 0 ]; then + echo "Usage: $0 tool_name" + exit 1 +fi + +tool_name=$1 + +logs_dir="${OUTPUT_TOOLS_LOGS_FOLDER}" +mkdir -p "$logs_dir" + +module load intel/2021.10.0 +module load intelmpi/2021.10 +module load miniconda3/23.3.1-py310 +source "${REPO_ROOT}/.venv/bin/activate" +export PYARROW_IGNORE_TIMEZONE=1 +export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0 +export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader" + +srun \ + --mpi=pmi2 \ + --nodes="$TOOLS_MAX_NODES" \ + --ntasks-per-node="$TOOLS_WORKERS_PER_NODE" \ + --cpus-per-task="$TOOLS_CPU_PER_WORKER" \ + --mem=0 \ + --output="${logs_dir}/tool_worker-%2t.log" \ + python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}" diff --git 
a/src/MPI_download_prep.py b/src/MPI_download_prep.py deleted file mode 100644 index a5c4d62..0000000 --- a/src/MPI_download_prep.py +++ /dev/null @@ -1,106 +0,0 @@ -import argparse -import os -import shutil - -import pandas as pd -from pandas._libs.missing import NAType - -from mpi_downloader.dataclasses import profile_dtype -from mpi_downloader.utils import verify_downloaded_batches, verify_batches_for_prep -from utils.utils import ensure_created, create_schedule_configs - -_DEFAULT_RATE_LIMIT = 10 -_CREATE_PROFILES = False -_DOWNLOADER_URLS_FOLDER = os.getenv("DOWNLOADER_URLS_FOLDER", "servers_batched") -_DOWNLOADER_LOGS_FOLDER = os.getenv("DOWNLOADER_LOGS_FOLDER", "logs") -_DOWNLOADER_IMAGES_FOLDER = os.getenv("DOWNLOADER_IMAGES_FOLDER", "downloaded_images") -_DOWNLOADER_SCHEDULES_FOLDER = os.getenv("DOWNLOADER_SCHEDULES_FOLDER", "schedules") -_DOWNLOADER_PROFILES_PATH = os.getenv("DOWNLOADER_PROFILES_PATH", "servers_profiles.csv") -_DOWNLOADER_IGNORED_PATH = os.getenv("DOWNLOADER_IGNORED_PATH", "ignored_servers.csv") - - -def small_rule(total_batches: int) -> int | NAType: - if total_batches >= 5000: - return 40 - elif total_batches >= 1000: - return 20 - elif total_batches >= 500: - return 10 - elif total_batches >= 200: - return 8 - elif total_batches >= 100: - return 4 - elif total_batches >= 50: - return 2 - elif total_batches >= 1: - return 1 - - return pd.NA - - -parser = argparse.ArgumentParser(description='Server downloader prep') - -parser.add_argument('input_path', metavar='input_path', type=str, help='the path to folder with download components (e.g., image folder, server profiles, and schedule)') -parser.add_argument('max_nodes', metavar='max_nodes', type=int, help='the max number of nodes to use for download') -parser.add_argument('max_workers_per_nodes', metavar='max_workers_per_nodes', type=int, - help='the max number of workers per node to use for download') - -# parse the arguments -_args = parser.parse_args() -Input_path: str = _args.input_path -Number_of_workers: int = _args.max_nodes * _args.max_workers_per_nodes - -Server_urls_batched = f"{Input_path}/{_DOWNLOADER_URLS_FOLDER}" -Server_profiler_csv = f"{Input_path}/{_DOWNLOADER_PROFILES_PATH}" -Server_ignored_csv = f"{Input_path}/{_DOWNLOADER_IGNORED_PATH}" -Server_schedules_path = f"{Input_path}/{_DOWNLOADER_SCHEDULES_FOLDER}" - -ensure_created([ - Server_urls_batched, - f"{Input_path}/{_DOWNLOADER_LOGS_FOLDER}", - f"{Input_path}/{_DOWNLOADER_IMAGES_FOLDER}", - Server_schedules_path, -]) - -server_list = os.listdir(Server_urls_batched) -server_count = len(server_list) - -profile_csv = [] -for i, server in enumerate(server_list): - if not os.path.isdir(f"{Server_urls_batched}/{server}"): - continue - - server_name = server.split("=")[1] - server_total_partitions = len(os.listdir(f"{Server_urls_batched}/{server}")) - profile_csv.append([server_name, server_total_partitions, 0, 0, _DEFAULT_RATE_LIMIT]) - -profiles_df = pd.DataFrame(profile_csv, columns=profile_dtype.names) -if _CREATE_PROFILES: - profiles_df.to_csv(Server_profiler_csv, index=False, header=True) - -if os.path.exists(Server_ignored_csv): - ignored_servers_df = pd.read_csv(Server_ignored_csv) -else: - ignored_servers_df = pd.DataFrame(columns=["ServerName"]) - -if len(os.listdir(f"{Input_path}/{_DOWNLOADER_IMAGES_FOLDER}")) > 0: - downloaded_batches: pd.DataFrame = verify_batches_for_prep(profiles_df, f"{Input_path}/{_DOWNLOADER_IMAGES_FOLDER}") - downloaded_batches = downloaded_batches.groupby("ServerName").count().reset_index().dropna() - 
downloaded_batches = downloaded_batches.rename(columns={"ServerName": "server_name", "Status": "already_downloaded"}) - profiles_df = profiles_df.merge(downloaded_batches, on="server_name", how="left").fillna(0) - profiles_df["left_to_download"] = profiles_df["total_batches"] - profiles_df["already_downloaded"] -else: - profiles_df["left_to_download"] = profiles_df["total_batches"] - -profiles_df["Nodes"] = profiles_df["left_to_download"].apply(small_rule) -profiles_df["ProcessPerNode"] = 1 -profiles_df = profiles_df.rename(columns={"total_batches": "TotalBatches"}).dropna().reset_index(drop=True) -profiles_df = profiles_df[["ServerName", "TotalBatches", "ProcessPerNode", "Nodes"]] -profiles_df = profiles_df[~profiles_df["ServerName"].isin(ignored_servers_df["ServerName"])] - -shutil.rmtree(Server_schedules_path, ignore_errors=True) -os.makedirs(Server_schedules_path, exist_ok=True) - -create_schedule_configs(profiles_df, Number_of_workers, Server_schedules_path) - -print("Done") diff --git a/src/MPI_downloader_verifier.py b/src/MPI_downloader_verifier.py deleted file mode 100644 index 4b9df77..0000000 --- a/src/MPI_downloader_verifier.py +++ /dev/null @@ -1,96 +0,0 @@ -import argparse -import os - -import pandas as pd - -from mpi_downloader.utils import get_latest_schedule, verify_downloaded_batches - -parser = argparse.ArgumentParser(description='Server downloader verifier') - -parser.add_argument('input_path', metavar='input_path', type=str, help='the path to folder with download components (e.g., image folder and server profiles)') -parser.add_argument("schedule_path", metavar="schedule_path", type=str, help="the path to the schedule") - -parser.add_argument('--recheck', action='store_true', help='recheck the downloaded batches') - -# parse the arguments -_args = parser.parse_args() -Input_path: str = _args.input_path -Server_schedule: str = _args.schedule_path -config_file: str = f"{Server_schedule}/_config.csv" -verification_file: str = f"{Server_schedule}/_verification.csv" -Recheck = _args.recheck - -Server_urls_downloaded = f"{Input_path}/downloaded_images" - -if not os.path.exists(config_file): - raise ValueError(f"Config file {config_file} not found") - -if Recheck: - Verification_df = pd.DataFrame(columns=["ServerName", "PartitionId", "Status"]) -else: - if os.path.exists(verification_file): - Verification_df = pd.read_csv(verification_file) - else: - Verification_df = pd.DataFrame(columns=["ServerName", "PartitionId", "Status"]) - -Verification_original_df = Verification_df.copy() - -Server_profiler_df = pd.read_csv(f"{Input_path}/servers_profiles.csv") - -latest_schedule = get_latest_schedule(Server_schedule) -Server_config_df = pd.read_csv(config_file) -Server_config_df["StartIndex"] = 0 -Server_config_df["EndIndex"] = 0 -server_config_columns = Server_config_df.columns.to_list() -Server_config_df = Server_config_df.merge(Server_profiler_df, - left_on="ServerName", - right_on="server_name", - how="left", - validate="1:1") - -Server_config_df["EndIndex"] = Server_config_df["total_batches"] - 1 - -Server_config_df = Server_config_df[server_config_columns] - -Latest_schedule = get_latest_schedule(Server_schedule) -if Latest_schedule is not None and len(Latest_schedule) > 0: - Latest_schedule_aggr = Latest_schedule.groupby("ServerName").agg( - {"PartitionIdFrom": "min", "PartitionIdTo": "max"}).reset_index() - Server_config_df = Server_config_df.merge(Latest_schedule_aggr, on="ServerName", how="left") - Server_config_df["StartIndex"] = Server_config_df["PartitionIdFrom"] - 
Server_config_df = Server_config_df[server_config_columns] - -for idx, row in Server_config_df.iterrows(): - new_verification_df = verify_downloaded_batches(row, Server_urls_downloaded) - Verification_df = pd.concat([Verification_df, pd.DataFrame(new_verification_df)], ignore_index=True).drop_duplicates() - -Verification_df.to_csv(verification_file, index=False, header=True) - -# verification_unchanged = Verification_df.equals(Verification_original_df) -# unchanged_warning = os.path.exists(f"{Server_schedule}/_UNCHANGED") -# -# if unchanged_warning: -# if verification_unchanged: -# os.remove(f"{Server_schedule}/_UNCHANGED") -# raise ValueError("Infinite Loop") -# else: -# os.remove(f"{Server_schedule}/_UNCHANGED") -# else: -# if verification_unchanged: -# open(f"{Server_schedule}/_UNCHANGED", "w").close() - - -downloaded_count = Verification_df.groupby("ServerName").agg({"Status": "count"}).reset_index() -downloaded_count = downloaded_count.rename(columns={"Status": "Downloaded"}) -downloaded_count = downloaded_count.merge(Server_config_df, on="ServerName", how="outer") -downloaded_count["Downloaded"] = downloaded_count["Downloaded"].fillna(0) -downloaded_count = downloaded_count[["ServerName", "Downloaded"]] -downloaded_count = downloaded_count.merge(Server_profiler_df, left_on="ServerName", right_on="server_name", how="left") -downloaded_count = downloaded_count[["ServerName", "total_batches", "Downloaded"]] -downloaded_count = downloaded_count[downloaded_count["Downloaded"] < downloaded_count["total_batches"]] - -if len(downloaded_count) > 0: - print("Need more jobs") -else: - print("All servers have downloaded all the batches") - open(f"{Server_schedule}/_DONE", "w").close() diff --git a/src/MPI_multimedia_downloader.py b/src/MPI_multimedia_downloader.py deleted file mode 100644 index e34e324..0000000 --- a/src/MPI_multimedia_downloader.py +++ /dev/null @@ -1,121 +0,0 @@ -import argparse -import logging -import os -import time -from typing import Dict, Tuple - -import mpi4py.MPI as MPI - -from mpi_downloader import DirectWriter, CompletedBatch -from mpi_downloader.dataclasses import CompletedBatch -from mpi_downloader.Downloader import Downloader -from mpi_downloader.PreLoader import load_one_batch -from mpi_downloader.utils import get_latest_schedule, \ - get_or_init_downloader, is_enough_time - - -def download_batch( - _downloader: Downloader, - _input_path: str, - _batch_id: int, -) -> Tuple[CompletedBatch, float]: - batch = load_one_batch(_input_path) - - _completed_batch, _finish_rate = _downloader.get_images(batch) - - return _completed_batch, _finish_rate - - -parser = argparse.ArgumentParser(description='Server downloader') - -parser.add_argument('input_path', metavar='input_path', type=str, help='the path to with download components (e.g., image folder and server batches)') -parser.add_argument("schedule_path", metavar="schedule_path", type=str, help="the path to the schedule") -parser.add_argument("--header", required=True, type=str, help="the requests header") -parser.add_argument("--img-size", required=True, type=int, help="the max side-length of an image in pixels") -parser.add_argument("--rate-multiplier", required=False, type=float, help="the rate multiplier", default=0.5) -parser.add_argument("--logging-level", required=False, type=str, help="the logging level", default="INFO") - -# parse the arguments -_args = parser.parse_args() -header_str = _args.header -header = {header_str.split(": ")[0]: header_str.split(": ")[1]} -img_size = _args.img_size -Input_path: str = 
_args.input_path -Server_urls_batched = f"{Input_path}/servers_batched" -Server_downloader_output = f"{Input_path}/downloaded_images" -Server_schedule: str = _args.schedule_path -_RATE_MULTIPLIER: float = _args.rate_multiplier -logging_level: str = _args.logging_level - -logging.basicConfig(level=logging.getLevelName(logging_level), format="%(asctime)s - %(levelname)s - %(process)d - %(message)s") -logger = logging.getLogger(__name__) - -comm = MPI.COMM_WORLD -rank = comm.rank -mem = MPI.Alloc_mem(1) -window = MPI.Win.Create(mem, comm=comm) -comm.Barrier() - -try: - logger.info(f"Rank {rank} started, getting latest schedule") - Latest_schedule = get_latest_schedule(Server_schedule, rank) - - if Latest_schedule is None or len(Latest_schedule) < 1: - raise ValueError(f"Rank {rank} not found in the scheduler") - - Latest_schedule = Latest_schedule.to_dict("records") - job_end_time: int = int(os.getenv("SLURM_JOB_END_TIME", 0)) - - downloader_schedule: Dict[str, Tuple] = {} - - downloading_time = 0 - writing_time = 0 - - logger.info(f"Rank {rank} started downloading") - - for schedule_dict in Latest_schedule: - downloader, _, rate_limit = get_or_init_downloader(header, - img_size, - schedule_dict, - downloader_schedule, - _RATE_MULTIPLIER, - job_end_time, - logger) - - for batch_id in range(schedule_dict["PartitionIdFrom"], schedule_dict["PartitionIdTo"]): - window.Lock(schedule_dict["MainRank"], MPI.LOCK_EXCLUSIVE) - try: - if not is_enough_time(rate_limit, job_end_time=job_end_time): - raise TimeoutError("Not enough time to download batch") - - logger.info(f"Rank {rank} started downloading batch {batch_id} of {schedule_dict['ServerName']}") - - t0 = time.perf_counter() - - input_path = f"{Server_urls_batched}/ServerName={schedule_dict['ServerName']}/partition_id={batch_id}" - output_path = f"{Server_downloader_output}/ServerName={schedule_dict['ServerName']}/partition_id={batch_id}" - completed_batch, finish_rate = download_batch(downloader, input_path, batch_id) - rate_limit.change_rate(finish_rate) - - downloading_time += time.perf_counter() - t0 - - logger.info(f"Rank {rank} finished downloading batch {batch_id} of {schedule_dict['ServerName']}") - except Exception as e: - window.Unlock(schedule_dict["MainRank"]) - raise e - else: - window.Unlock(schedule_dict["MainRank"]) - - t0 = time.perf_counter() - DirectWriter.write_batch(completed_batch, output_path, job_end_time, logger=logger) - logger.info(f"Rank {rank} finished writing batch {batch_id} of {schedule_dict['ServerName']}") - - writing_time += time.perf_counter() - t0 - - logger.info(f"Rank {rank} spent {downloading_time} seconds downloading and {writing_time} seconds writing") -except Exception as e: - logger.error(f"Rank {rank} failed with error: {e}") -finally: - # comm.Barrier() - window.Free() - mem.release() diff --git a/src/MPI_multimedia_downloader_controller.py b/src/MPI_multimedia_downloader_controller.py deleted file mode 100644 index 20fd95f..0000000 --- a/src/MPI_multimedia_downloader_controller.py +++ /dev/null @@ -1,103 +0,0 @@ -import argparse -from collections import deque -from typing import Any, Dict, List, Deque - -import pandas as pd - -from mpi_downloader.utils import get_latest_schedule, generate_ids_to_download, separate_to_blocks, \ - get_largest_nonempty_bucket, get_schedule_count - -parser = argparse.ArgumentParser(description='Server downloader controller') - -parser.add_argument('input_path', metavar='input_path', type=str, help='the path to folder with download components (e.g., image folder and server 
profiles)') -parser.add_argument("schedule_path", metavar="schedule_path", type=str, help="the path to the schedule") -parser.add_argument('max_nodes', metavar='max_nodes', type=int, help='max number of nodes') -parser.add_argument('max_workers_per_nodes', metavar='max_workers_per_nodes', type=int, - help='max number of workers per node') - -# parse the arguments -_args = parser.parse_args() -Input_path: str = _args.input_path -Server_schedule: str = _args.schedule_path -Number_of_workers: int = _args.max_nodes * _args.max_workers_per_nodes - -Server_urls_downloaded = f"{Input_path}/downloaded_images" - -Server_profiler_df = pd.read_csv(f"{Input_path}/servers_profiles.csv") -Server_verifier_df = pd.read_csv(f"{Server_schedule}/_verification.csv") -Server_config_df = pd.read_csv(f"{Server_schedule}/_config.csv") - -Server_config_df["StartIndex"] = 0 -Server_config_df["EndIndex"] = 0 -server_config_columns = Server_config_df.columns.to_list() -Server_config_df = Server_config_df.merge(Server_profiler_df, left_on="ServerName", right_on="server_name", how="left") -Server_config_df["EndIndex"] = Server_config_df["total_batches"] - 1 -Server_config_df = Server_config_df[server_config_columns] - -Latest_schedule = get_latest_schedule(Server_schedule) -if Latest_schedule is not None and len(Latest_schedule) > 0: - Latest_schedule_aggr = Latest_schedule.groupby("ServerName").agg( - {"PartitionIdFrom": "min", "PartitionIdTo": "max"}).reset_index() - Server_config_df = Server_config_df.merge(Latest_schedule_aggr, on="ServerName", how="left") - Server_config_df = Server_config_df.fillna(0) - Server_config_df["StartIndex"] = Server_config_df["PartitionIdFrom"].astype(int) - Server_config_df = Server_config_df[server_config_columns] - -batches_to_download: pd.DataFrame = Server_config_df.apply(generate_ids_to_download, axis=1, - args=(Server_verifier_df,)) -batches_to_download = batches_to_download.merge(Server_config_df, on="ServerName", how="left").drop( - columns=["StartIndex", "EndIndex"]) -batches_to_download["Batches"] = batches_to_download.apply(separate_to_blocks, axis=1) - -batches_to_download.sort_values(by=["ProcessPerNode", "Nodes"], inplace=True, ascending=False) - -ids_to_schedule_in_buckets: Dict[int, Deque[Dict[str, Any]]] = {} -process_per_nodes = batches_to_download["ProcessPerNode"].unique() -for process_per_node in process_per_nodes: - ids_to_schedule_in_buckets[process_per_node] = deque( - batches_to_download[batches_to_download["ProcessPerNode"] == process_per_node].to_dict("records")) - -print(ids_to_schedule_in_buckets) - -schedule_list: List[Dict[str, Any]] = [] -worker_id = 0 - -while len(ids_to_schedule_in_buckets) != 0: - worker_id = worker_id % Number_of_workers - largest_key = get_largest_nonempty_bucket( - ids_to_schedule_in_buckets, - Number_of_workers - worker_id - ) - - if largest_key == 0: - worker_id = 0 - continue - - current_server = ids_to_schedule_in_buckets[largest_key].popleft() - current_server["Nodes"] -= 1 - server_rate_limit = Server_profiler_df[Server_profiler_df["server_name"] == current_server["ServerName"]][ - "rate_limit"].array[0] - - if len(current_server["Batches"]) > 0: - batches_to_schedule = [current_server["Batches"].pop(0) for _ in range(current_server["ProcessPerNode"])] - main_worker_id = worker_id - for batches in batches_to_schedule: - for batch in batches: - schedule_list.append({ - "Rank": worker_id, - "ServerName": current_server["ServerName"], - "PartitionIdFrom": batch[0], - "PartitionIdTo": batch[1], - "MainRank": main_worker_id, - 
"RateLimit": server_rate_limit, - }) - worker_id += 1 - - if current_server["Nodes"] > 0: - ids_to_schedule_in_buckets[largest_key].append(current_server) - - if len(ids_to_schedule_in_buckets[largest_key]) == 0: - del ids_to_schedule_in_buckets[largest_key] - -schedule_number = get_schedule_count(Server_schedule) -pd.DataFrame(schedule_list).to_csv(f"{Server_schedule}/{schedule_number:0=4}.csv", index=False, header=True) diff --git a/src/MPI_server_profiler.py b/src/MPI_server_profiler.py deleted file mode 100644 index 9a41d4e..0000000 --- a/src/MPI_server_profiler.py +++ /dev/null @@ -1,102 +0,0 @@ -import argparse -import re -from queue import Queue -from typing import Dict, Tuple - -import h5py -import mpi4py.MPI as MPI -import pandas as pd - -from mpi_downloader import CompletedBatch, ProfilerWriter -from mpi_downloader.Downloader import Downloader -from mpi_downloader.PreLoader import load_one_batch -from mpi_downloader.dataclasses import RateLimit -from mpi_downloader.utils import create_new_session - -Initial_rate = 20 -Rate_multiplier = 10 -Time_to_profile = 2 - -parser = argparse.ArgumentParser(description='Server profiler') - -parser.add_argument('input_path', metavar='input_path', type=str, help='the path to folder with download components (e.g., server batches, profiles, and errors)') -parser.add_argument('batch_size', metavar='batch_size', type=int, help='size of the batch to download') -parser.add_argument("--header", required=True, type=str, help="the requests header") -parser.add_argument("--img-size", required = True, type=int, help="the max side-length of an image in pixels") - -# parse the arguments -_args = parser.parse_args() -header_str = _args.header -header = {header_str.split(": ")[0]: header_str.split(": ")[1]} -img_size = _args.img_size -Input_path: str = _args.input_path -Server_urls_batched = f"{Input_path}/servers_batched" -Server_profiler_hdf = f"{Input_path}/servers_profiles.hdf5" -Server_errors_hdf = f"{Input_path}/servers_errors.hdf5" -Server_profile_spec = f"{Input_path}/profile_spec.csv" -Batch_size: int = _args.batch_size - -rank = MPI.COMM_WORLD.rank - -scheduler_df = pd.read_csv(Server_profile_spec) -scheduler_dicts = scheduler_df[scheduler_df["Rank"] == rank].to_dict("records") - -if len(scheduler_dicts) < 1: - raise ValueError(f"Rank {rank} not found in the scheduler") - -downloader_schedule: Dict[str, Tuple] = {} -profiles_hdf = h5py.File(Server_profiler_hdf, 'r+', driver='mpio', comm=MPI.COMM_WORLD) -errors_hdf = h5py.File(Server_errors_hdf, 'r+', driver='mpio', comm=MPI.COMM_WORLD) - - -def download_batch( - _downloader: Downloader, - _rate_limit: RateLimit, - _input_path: str, -) -> Tuple[CompletedBatch, float]: - batch = load_one_batch(_input_path) - - _completed_batch, finish_rate = _downloader.get_images(batch, _rate_limit) - return _completed_batch, finish_rate - - -for idx, schedule_dict in enumerate(scheduler_dicts): - if schedule_dict["ServerName"] not in downloader_schedule.keys(): - server_name = re.sub(':', '%3A', schedule_dict["ServerName"]) - rate_limit = RateLimit(Initial_rate, Rate_multiplier) - session = create_new_session(server_name, rate_limit.upper_bound) - downloader = Downloader(header, session, rate_limit, img_size, False) - downloader_schedule[schedule_dict["ServerName"]] = (downloader, rate_limit) - - downloader, rate_limit = downloader_schedule[schedule_dict["ServerName"]] - completed_batch_final: CompletedBatch = CompletedBatch(Queue(), Queue()) - new_rate_limit_final: float = 0 - - for _ in range(Time_to_profile): - 
completed_batch_final = CompletedBatch(Queue(), Queue()) - print(f"Rank {rank} starting batch 0|{rate_limit}|{schedule_dict['ServerName']}") - - input_path = f"{Server_urls_batched}/ServerName={schedule_dict['ServerName']}/partition_id=0" - completed_batch, new_rate_limit = download_batch(downloader, rate_limit, input_path) - rate_limit.change_rate(new_rate_limit) - - completed_batch_final = completed_batch - new_rate_limit_final = new_rate_limit - - print(f"Rank {rank} finished batch 0|{schedule_dict['ServerName']}") - - ProfilerWriter.write_batch( - profiles_hdf["profiles"], - errors_hdf["errors"], - completed_batch_final, - new_rate_limit_final, - rank, - schedule_dict['Offset'], - Batch_size, - schedule_dict['ServerName'], - schedule_dict["BatchesCount"], - Input_path - ) - -profiles_hdf.close() -errors_hdf.close() diff --git a/src/MPI_server_profiler_controller.py b/src/MPI_server_profiler_controller.py deleted file mode 100644 index 9449bf1..0000000 --- a/src/MPI_server_profiler_controller.py +++ /dev/null @@ -1,69 +0,0 @@ -import argparse -import os -import shutil - -import h5py -import mpi4py.MPI as MPI -import pandas as pd - -from mpi_downloader.dataclasses import error_dtype, profile_dtype - -_DEFAULT_RATE_LIMIT = 10 - -parser = argparse.ArgumentParser(description='Server profiler') - -parser.add_argument('input_path', metavar='input_path', type=str, help='the path to folder with download components (e.g., server batches, profiles, and errors)') -parser.add_argument('max_nodes', metavar='max_nodes', type=int, help='the max number of nodes to use for download') -parser.add_argument('max_workers_per_nodes', metavar='max_workers_per_nodes', type=int, - help='the max number of workers per node') -parser.add_argument('batch_size', metavar='batch_size', type=int, help='size of the batch to download') - -# parse the arguments -_args = parser.parse_args() -Input_path: str = _args.input_path -Server_urls_batched = f"{Input_path}/servers_batched" -Server_profiler_hdf = f"{Input_path}/servers_profiles.hdf5" -Server_errors_hdf = f"{Input_path}/servers_errors.hdf5" -Server_profile_spec = f"{Input_path}/profile_spec.csv" -Server_profiler_csv = f"{Input_path}/servers_profiles.csv" -Server_samples = f"{Input_path}/samples" -Number_of_workers: int = _args.max_nodes * _args.max_workers_per_nodes -Batch_size: int = _args.batch_size - -if os.path.exists(Server_samples) or os.path.isdir(Server_samples): - shutil.rmtree(Server_samples) -os.makedirs(Server_samples) - -server_list = os.listdir(Server_urls_batched) -server_count = len(server_list) -print("Counted all servers") - -with h5py.File(Server_errors_hdf, 'w', driver='mpio', comm=MPI.COMM_WORLD) as errors_hdf: - errors = errors_hdf.create_dataset("errors", - (server_count * Batch_size,), - chunks=(Batch_size,), - dtype=error_dtype, - ) -with h5py.File(Server_profiler_hdf, 'w', driver='mpio', comm=MPI.COMM_WORLD) as profiles_hdf: - profiles = profiles_hdf.create_dataset("profiles", - (server_count,), - dtype=profile_dtype, - ) - profile_spec = [] - profile_csv = [] - for i, server in enumerate(server_list): - if not os.path.isdir(f"{Server_urls_batched}/{server}"): - continue - - server_name = server.split("=")[1] - server_total_partitions = len(os.listdir(f"{Server_urls_batched}/{server}")) - profile_spec.append([i % Number_of_workers, server_name, server_total_partitions]) - profile_csv.append([server_name, server_total_partitions, 0, 0, _DEFAULT_RATE_LIMIT]) - profiles[i] = (server_name, server_total_partitions, 0, 0, _DEFAULT_RATE_LIMIT) - 
-print("created df") - -profile_spec_df = pd.DataFrame(profile_spec, columns=["Rank", "ServerName", "BatchesCount"]) -profile_spec_df.to_csv(Server_profile_spec, index=True, index_label="Offset", header=True) - -pd.DataFrame(profile_csv, columns=profile_dtype.names).to_csv(Server_profiler_csv, index=False, header=True) diff --git a/src/distributed_downloader/__init__.py b/src/distributed_downloader/__init__.py new file mode 100644 index 0000000..9582b33 --- /dev/null +++ b/src/distributed_downloader/__init__.py @@ -0,0 +1,5 @@ +from .core import download_images +from .tools import apply_tools + + +__all__ = ["download_images", "apply_tools"] diff --git a/src/distributed_downloader/core/MPI_download_prep.py b/src/distributed_downloader/core/MPI_download_prep.py new file mode 100644 index 0000000..a05cc56 --- /dev/null +++ b/src/distributed_downloader/core/MPI_download_prep.py @@ -0,0 +1,186 @@ +import os +from logging import Logger +from typing import Dict, List, Tuple + +import pandas as pd +from pandas._libs.missing import NAType + +from distributed_downloader.tools.checkpoint import Checkpoint +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.utils import submit_job, init_logger, preprocess_dep_ids +from distributed_downloader.core.utils import create_schedule_configs, verify_batches_for_prep + + +def schedule_rule(total_batches: int, rule: List[Tuple[int, int]]) -> int | NAType: + for min_batches, nodes in rule: + if total_batches >= min_batches: + return nodes + return pd.NA + + +def init_new_current_folder(old_folder: str) -> None: + if os.path.exists(f"{old_folder}/current"): + number_of_folders = len( + [folder for folder in os.listdir(old_folder) if os.path.isdir(f"{old_folder}/{folder}")]) + new_name = str(number_of_folders).zfill(4) + os.rename(f"{old_folder}/current", f"{old_folder}/{new_name}") + os.mkdir(f"{old_folder}/current") + + +def fix_rule(rule: Dict[str, int]) -> List[Tuple[int, int]]: + fixed_rule = [] + for key, value in rule.items(): + fixed_rule.append((int(key), value)) + fixed_rule.sort(key=lambda x: x[0], reverse=True) + return fixed_rule + + +def submit_downloader(_schedule: str, + iteration_id: int, + dep_id: int, + mpi_submitter_script: str, + downloading_script: str) -> int: + iteration = str(iteration_id).zfill(4) + + idx = submit_job(mpi_submitter_script, + downloading_script, + _schedule, + iteration, + *preprocess_dep_ids([dep_id])) + + return idx + + +def submit_verifier(_schedule: str, + iteration_id: int, + mpi_submitter_script: str, + verifying_script: str, + dep_id: int = None) -> int: + iteration = str(iteration_id).zfill(4) + + idx = submit_job(mpi_submitter_script, + verifying_script, + _schedule, + iteration, + *preprocess_dep_ids([dep_id])) + + return idx + + +def create_schedules(config: Config, logger: Logger) -> None: + logger.info("Creating schedules") + # Get parameters from config + server_ignored_csv: str = config.get_folder("ignored_table") + schedules_path: str = os.path.join(config.get_folder("schedules_folder"), + "current") + server_profiler_csv: str = config.get_folder("profiles_table") + downloaded_images_path: str = config.get_folder("images_folder") + number_of_workers: int = (config['downloader_parameters']['max_nodes'] + * config['downloader_parameters']['workers_per_node']) + schedule_rule_dict: List[Tuple[int, int]] = fix_rule(config['schedule_rules']) + + # Get list to download + profiles_df = pd.read_csv(server_profiler_csv) + + if os.path.exists(server_ignored_csv) and 
os.stat(server_ignored_csv).st_size != 0: + ignored_servers_df = pd.read_csv(server_ignored_csv) + else: + ignored_servers_df = pd.DataFrame(columns=["server_name"]) + + if os.path.exists(schedules_path) and len(os.listdir(schedules_path)) > 0: + downloaded_batches: pd.DataFrame = verify_batches_for_prep(profiles_df, downloaded_images_path) + downloaded_batches = downloaded_batches.groupby("server_name").count().reset_index().dropna() + downloaded_batches = downloaded_batches.rename( + columns={"status": "already_downloaded"}) + profiles_df = profiles_df.merge(downloaded_batches, on="server_name", how="left", validate="1:1").fillna(0) + profiles_df["left_to_download"] = profiles_df["total_batches"] - profiles_df["already_downloaded"] + else: + profiles_df["left_to_download"] = profiles_df["total_batches"] + + profiles_df["nodes"] = profiles_df["left_to_download"].apply(lambda x: schedule_rule(x, schedule_rule_dict)) + profiles_df["process_per_node"] = 1 + profiles_df = (profiles_df + .dropna() + .reset_index(drop=True)) + profiles_df = profiles_df[["server_name", "total_batches", "process_per_node", "nodes"]] + profiles_df = profiles_df.loc[:, ~profiles_df.columns.duplicated()].copy() + profiles_df = profiles_df[~profiles_df["server_name"].isin(ignored_servers_df["server_name"])] + + # Rename old schedule and logs + init_new_current_folder(config.get_folder("schedules_folder")) + init_new_current_folder(config.get_folder("logs_folder")) + + # Create schedules + create_schedule_configs(profiles_df, number_of_workers, schedules_path) + logger.info("Schedules created") + + +def submit_downloaders(config: Config, logger: Logger) -> None: + logger.info("Submitting downloaders") + # Get parameters from config + schedules_path: str = os.path.join(config.get_folder("schedules_folder"), + "current") + mpi_submitter_script: str = config.get_script("mpi_submitter") + downloading_script: str = config.get_script('download_script') + verifying_script: str = config.get_script('verify_script') + + # Schedule downloaders + for schedule in os.listdir(schedules_path): + submission_records = [] + offset = 0 + download_id = None + + for _ in range(config["downloader_parameters"]["num_downloads"]): + download_id = submit_downloader(schedule, + offset, + download_id, + mpi_submitter_script, + downloading_script) + submission_records.append({ + "job_id": download_id, + "is_verification": False + }) + logger.info(f"Submitted downloader {download_id} for {schedule}") + + offset += 1 + + verifier_id = submit_verifier(schedule, + offset, + mpi_submitter_script, + verifying_script, + download_id) + submission_records.append({ + "job_id": verifier_id, + "is_verification": True + }) + logger.info(f"Submitted verifier {verifier_id} for {schedule}") + offset += 1 + + pd.DataFrame(submission_records).to_csv(os.path.join(schedules_path, schedule, "_jobs_ids.csv"), + index=False, + header=True) + + logger.info("All downloading scripts submitted") + + +def main(): + config_path = os.environ.get("CONFIG_PATH") + if config_path is None: + raise ValueError("CONFIG_PATH not set") + + config = Config.from_path(config_path, "downloader") + logger = init_logger(__name__) + + inner_checkpoint_path: str = config.get_folder("inner_checkpoint_file") + if not os.path.exists(inner_checkpoint_path): + raise FileNotFoundError(f"Inner checkpoint file {inner_checkpoint_path} not found") + inner_checkpoint = Checkpoint.from_path(inner_checkpoint_path, {"schedule_creation_scheduled": False}) + + create_schedules(config, logger) + 
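# NOTE: create_schedules() above rewrites <schedules_folder>/current; submit_downloaders() below reads those freshly written schedule configs, so the call order matters. + 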
submit_downloaders(config, logger) + + inner_checkpoint["schedule_creation_scheduled"] = False + + +if __name__ == "__main__": + main() diff --git a/src/distributed_downloader/core/MPI_downloader_verifier.py b/src/distributed_downloader/core/MPI_downloader_verifier.py new file mode 100644 index 0000000..ff5f98a --- /dev/null +++ b/src/distributed_downloader/core/MPI_downloader_verifier.py @@ -0,0 +1,110 @@ +import argparse +import os +from logging import Logger + +import pandas as pd + +from distributed_downloader.core.mpi_downloader.utils import get_latest_schedule +from distributed_downloader.core.utils import verify_downloaded_batches +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.utils import init_logger + + +def verify_batches(config: Config, + server_schedule: str, + logger: Logger) -> None: + logger.info(f"Verifying batches for {server_schedule}") + + server_urls_downloaded = config.get_folder("images_folder") + server_profiler_path = config.get_folder("profiles_table") + + config_file: str = f"{server_schedule}/_config.csv" + verification_file: str = f"{server_schedule}/_verification.csv" + + if not os.path.exists(config_file): + raise ValueError(f"Schedule config file {config_file} not found") + + if os.path.exists(verification_file): + verification_df = pd.read_csv(verification_file) + else: + verification_df = pd.DataFrame(columns=["server_name", "partition_id", "status"]) + + verification_original_df = verification_df.copy() + + server_profiler_df = pd.read_csv(server_profiler_path) + + latest_schedule = get_latest_schedule(server_schedule) + server_config_df = pd.read_csv(config_file) + server_config_df["start_index"] = 0 + server_config_df["end_index"] = 0 + server_config_columns = server_config_df.columns.to_list() + server_config_df = server_config_df.merge(server_profiler_df, + on="server_name", + how="left", + validate="1:1", + suffixes=("", "_y")) + + server_config_df["end_index"] = server_config_df["total_batches"] - 1 + server_config_df = server_config_df[server_config_columns] + + if latest_schedule is not None and len(latest_schedule) > 0: + latest_schedule_aggr = latest_schedule.groupby("server_name").agg( + {"partition_id_from": "min", "partition_id_to": "max"}).reset_index() + server_config_df = server_config_df.merge(latest_schedule_aggr, on="server_name", how="left") + server_config_df["start_index"] = server_config_df["partition_id_from"] + server_config_df = server_config_df[server_config_columns] + + for idx, row in server_config_df.iterrows(): + new_verification_df = verify_downloaded_batches(row, server_urls_downloaded) + verification_df = pd.concat([verification_df, pd.DataFrame(new_verification_df)], + ignore_index=True).drop_duplicates() + + verification_df.to_csv(verification_file, index=False, header=True) + + logger.info(f"Verification done for {server_schedule}") + + if (verification_df.equals(verification_original_df) + and len(verification_df) > 0 + and not os.path.exists(f"{server_schedule}/_DONE")): + logger.debug(f"Verification unchanged for {server_schedule}") + open(f"{server_schedule}/_UNCHANGED", "w").close() + + downloaded_count = verification_df.groupby("server_name").agg({"status": "count"}).reset_index() + downloaded_count = downloaded_count.rename(columns={"status": "downloaded"}) + downloaded_count = downloaded_count.merge(server_config_df, on="server_name", how="outer") + downloaded_count["downloaded"] = downloaded_count["downloaded"].fillna(0) + downloaded_count = 
downloaded_count[["server_name", "downloaded"]] + downloaded_count = downloaded_count.merge(server_profiler_df, + on="server_name", + how="left") + downloaded_count = downloaded_count[["server_name", "total_batches", "downloaded"]] + downloaded_count = downloaded_count[downloaded_count["downloaded"] < downloaded_count["total_batches"]] + + if len(downloaded_count) > 0: + logger.info(f"Still {len(downloaded_count)} servers have not downloaded all the batches") + else: + logger.info("All servers have downloaded all the batches") + open(f"{server_schedule}/_DONE", "w").close() + + +def main(): + config_path = os.environ.get("CONFIG_PATH") + if config_path is None: + raise ValueError("CONFIG_PATH not set") + + config = Config.from_path(config_path, "downloader") + logger = init_logger(__name__) + + parser = argparse.ArgumentParser(description='Server downloader verifier') + parser.add_argument("schedule_path", metavar="schedule_path", type=str, help="the path to the schedule") + _args = parser.parse_args() + + verify_batches( + config, + _args.schedule_path, + logger + ) + + +if __name__ == "__main__": + main() diff --git a/src/distributed_downloader/core/MPI_multimedia_downloader.py b/src/distributed_downloader/core/MPI_multimedia_downloader.py new file mode 100644 index 0000000..5c11c13 --- /dev/null +++ b/src/distributed_downloader/core/MPI_multimedia_downloader.py @@ -0,0 +1,138 @@ +import argparse +import logging +import os +import time +from typing import Dict, Tuple + +import mpi4py.MPI as MPI + +from distributed_downloader.core.mpi_downloader import DirectWriter +from distributed_downloader.core.mpi_downloader.dataclasses import CompletedBatch +from distributed_downloader.core.mpi_downloader.Downloader import Downloader +from distributed_downloader.core.mpi_downloader.PreLoader import load_one_batch +from distributed_downloader.core.mpi_downloader.utils import get_latest_schedule, \ + get_or_init_downloader, is_enough_time +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.utils import init_logger + + +def download_batch( + _downloader: Downloader, + _input_path: str, + _batch_id: int, +) -> Tuple[CompletedBatch, float]: + batch = load_one_batch(_input_path) + + _completed_batch, _finish_rate = _downloader.get_images(batch) + + return _completed_batch, _finish_rate + + +def download_schedule( + config: Config, + server_schedule: str, + logger: logging.Logger, +): + header_str = config["downloader_parameters"]["header"] + header = {header_str.split(": ")[0]: header_str.split(": ")[1]} + img_size = config["downloader_parameters"]["image_size"] + server_urls_batched = config.get_folder("urls_folder") + server_downloader_output = config.get_folder("images_folder") + _RATE_MULTIPLIER: float = config["downloader_parameters"]["rate_multiplier"] + + if os.path.exists(f"{server_schedule}/_DONE"): + logger.info(f"Schedule {server_schedule} already done") + return + + comm = MPI.COMM_WORLD + rank = comm.rank + mem = MPI.Alloc_mem(1) + window = MPI.Win.Create(mem, comm=comm) + comm.Barrier() + + try: + logger.info(f"Rank {rank} started, getting latest schedule") + latest_schedule = get_latest_schedule(server_schedule, rank) + + if latest_schedule is None or len(latest_schedule) < 1: + raise ValueError(f"Rank {rank} not found in the scheduler") + + latest_schedule = latest_schedule.to_dict("records") + job_end_time: int = int(os.getenv("SLURM_JOB_END_TIME", 0)) + + downloader_schedule: Dict[str, Tuple] = {} + + downloading_time = 0 + writing_time = 0 + + 
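# NOTE: the exclusive lock on each server's main_rank window (below) appears to keep ranks that share a server from downloading at the same time, so the server's rate limit is respected collectively; the lock is released before the batch is written to disk. + 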
logger.info(f"Rank {rank} started downloading") + + for schedule_dict in latest_schedule: + downloader, _, rate_limit = get_or_init_downloader(header, + img_size, + schedule_dict, + downloader_schedule, + _RATE_MULTIPLIER, + job_end_time, + logger) + + for batch_id in range(schedule_dict["partition_id_from"], schedule_dict["partition_id_to"]): + window.Lock(schedule_dict["main_rank"], MPI.LOCK_EXCLUSIVE) + try: + if not is_enough_time(rate_limit, job_end_time=job_end_time): + raise TimeoutError("Not enough time to download batch") + + logger.info(f"Rank {rank} started downloading batch {batch_id} of {schedule_dict['server_name']}") + + t0 = time.perf_counter() + + input_path = f"{server_urls_batched}/server_name={schedule_dict['server_name']}/partition_id={batch_id}" # TODO: Make "ServerName" and "partition_id" changeable column from config + output_path = f"{server_downloader_output}/server_name={schedule_dict['server_name']}/partition_id={batch_id}" + completed_batch, finish_rate = download_batch(downloader, input_path, batch_id) + rate_limit.change_rate(finish_rate) + + downloading_time += time.perf_counter() - t0 + + logger.info(f"Rank {rank} finished downloading batch {batch_id} of {schedule_dict['server_name']}") + except Exception as e: + window.Unlock(schedule_dict["main_rank"]) + raise e + else: + window.Unlock(schedule_dict["main_rank"]) + + t0 = time.perf_counter() + DirectWriter.write_batch(completed_batch, output_path, job_end_time, logger=logger) + logger.info(f"Rank {rank} finished writing batch {batch_id} of {schedule_dict['server_name']}") + + writing_time += time.perf_counter() - t0 + + logger.info(f"Rank {rank} spent {downloading_time} seconds downloading and {writing_time} seconds writing") + except Exception as e: + logger.error(f"Rank {rank} failed with error: {e}") + finally: + # comm.Barrier() + window.Free() + mem.release() + + +def main(): + config_path = os.environ.get("CONFIG_PATH") + if config_path is None: + raise ValueError("CONFIG_PATH not set") + + config = Config.from_path(config_path, "downloader") + logger = init_logger(__name__) + + parser = argparse.ArgumentParser(description='Server downloader') + parser.add_argument("schedule_path", metavar="schedule_path", type=str, help="the path to the schedule") + _args = parser.parse_args() + + download_schedule( + config, + _args.schedule_path, + logger + ) + + +if __name__ == "__main__": + main() diff --git a/src/distributed_downloader/core/MPI_multimedia_downloader_controller.py b/src/distributed_downloader/core/MPI_multimedia_downloader_controller.py new file mode 100644 index 0000000..3979586 --- /dev/null +++ b/src/distributed_downloader/core/MPI_multimedia_downloader_controller.py @@ -0,0 +1,136 @@ +import argparse +import os +from collections import deque +from logging import Logger +from typing import Any, Dict, List, Deque + +import pandas as pd + +from distributed_downloader.core.mpi_downloader.utils import get_latest_schedule, generate_ids_to_download, \ + separate_to_blocks, \ + get_largest_nonempty_bucket, get_schedule_count +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.utils import init_logger + + +def create_new_schedule(config: Config, + server_schedule: str, + logger: Logger) -> None: + logger.info(f"Creating new schedule for {server_schedule}") + + number_of_workers: int = (config["downloader_parameters"]["max_nodes"] + * config["downloader_parameters"]["workers_per_node"]) + server_profiler_path = config.get_folder("profiles_table") + + 
server_profiler_df = pd.read_csv(server_profiler_path) + server_config_df = pd.read_csv(f"{server_schedule}/_config.csv") + server_verifier_df = pd.read_csv(f"{server_schedule}/_verification.csv") + + if os.path.exists(f"{server_schedule}/_DONE"): + logger.info(f"Schedule {server_schedule} already done") + return + + server_config_df["start_index"] = 0 + server_config_df["end_index"] = 0 + server_config_columns = server_config_df.columns.to_list() + server_config_df = server_config_df.merge(server_profiler_df, + on="server_name", + how="left", + validate="1:1", + suffixes=("", "_y")) + server_config_df["end_index"] = server_config_df["total_batches"] - 1 + server_config_df = server_config_df[server_config_columns] + + latest_schedule = get_latest_schedule(server_schedule) + if latest_schedule is not None and len(latest_schedule) > 0: + latest_schedule_aggr = latest_schedule.groupby("server_name").agg( + {"partition_id_from": "min", "partition_id_to": "max"}).reset_index() + server_config_df = server_config_df.merge(latest_schedule_aggr, on="server_name", how="left") + server_config_df = server_config_df.fillna(0) + server_config_df["start_index"] = server_config_df["partition_id_from"].astype(int) + server_config_df = server_config_df[server_config_columns] + + batches_to_download: pd.DataFrame = server_config_df.apply(generate_ids_to_download, axis=1, + args=(server_verifier_df,)) + batches_to_download = batches_to_download.merge(server_config_df, on="server_name", how="left").drop( + columns=["start_index", "end_index"]) + batches_to_download["batches"] = batches_to_download.apply(separate_to_blocks, axis=1) + + batches_to_download.sort_values(by=["process_per_node", "nodes"], inplace=True, ascending=False) + + ids_to_schedule_in_buckets: Dict[int, Deque[Dict[str, Any]]] = {} + process_per_nodes = batches_to_download["process_per_node"].unique() + for process_per_node in process_per_nodes: + ids_to_schedule_in_buckets[process_per_node] = deque( + batches_to_download[batches_to_download["process_per_node"] == process_per_node].to_dict("records")) + + logger.info("Filtered out already downloaded batches, creating schedule...") + logger.debug(ids_to_schedule_in_buckets) + + schedule_list: List[Dict[str, Any]] = [] + worker_id = 0 + + while len(ids_to_schedule_in_buckets) != 0: + worker_id = worker_id % number_of_workers + largest_key = get_largest_nonempty_bucket( + ids_to_schedule_in_buckets, + number_of_workers - worker_id + ) + + if largest_key == 0: + worker_id = 0 + continue + + current_server = ids_to_schedule_in_buckets[largest_key].popleft() + current_server["nodes"] -= 1 + server_rate_limit = server_profiler_df[server_profiler_df["server_name"] == current_server["server_name"]][ + "rate_limit"].array[0] + + if len(current_server["batches"]) > 0: + batches_to_schedule = [current_server["batches"].pop(0) for _ in range(current_server["process_per_node"])] + main_worker_id = worker_id + for batches in batches_to_schedule: + for batch in batches: + schedule_list.append({ + "rank": worker_id, + "server_name": current_server["server_name"], + "partition_id_from": batch[0], + "partition_id_to": batch[1], + "main_rank": main_worker_id, + "rate_limit": server_rate_limit, + }) + worker_id += 1 + + if current_server["nodes"] > 0: + ids_to_schedule_in_buckets[largest_key].append(current_server) + + if len(ids_to_schedule_in_buckets[largest_key]) == 0: + del ids_to_schedule_in_buckets[largest_key] + + schedule_number = get_schedule_count(server_schedule) + 
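# Each run writes a new zero-padded schedule file (0000.csv, 0001.csv, ...) so that get_latest_schedule() can pick up the most recent one. + 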
pd.DataFrame(schedule_list).to_csv(f"{server_schedule}/{schedule_number:0=4}.csv", index=False, header=True) + + logger.info(f"Schedule created for {server_schedule}") + + +def main(): + config_path = os.environ.get("CONFIG_PATH") + if config_path is None: + raise ValueError("CONFIG_PATH not set") + + config = Config.from_path(config_path, "downloader") + logger = init_logger(__name__) + + parser = argparse.ArgumentParser(description='Server downloader controller') + parser.add_argument("schedule_path", metavar="schedule_path", type=str, help="the path to the schedule") + _args = parser.parse_args() + + create_new_schedule( + config, + _args.schedule_path, + logger + ) + + +if __name__ == "__main__": + main() diff --git a/src/distributed_downloader/core/__about__.py b/src/distributed_downloader/core/__about__.py new file mode 100644 index 0000000..89e9150 --- /dev/null +++ b/src/distributed_downloader/core/__about__.py @@ -0,0 +1 @@ +__version__ = "0.1.0-alpha" diff --git a/src/distributed_downloader/core/__init__.py b/src/distributed_downloader/core/__init__.py new file mode 100644 index 0000000..f17fbf8 --- /dev/null +++ b/src/distributed_downloader/core/__init__.py @@ -0,0 +1,24 @@ +from logging import Logger +from typing import Optional + +from .main import DistributedDownloader + + +def download_images(config_path: str, logger: Optional[Logger] = None) -> None: + """ + Initiates the download of images based on a given configuration. + + This function creates an instance of `DistributedDownloader` using a configuration file path, + optionally sets a logger for the downloader, and then starts the image downloading process. + + Parameters: + - config_path (str): The file path to the configuration file required to initialize the downloader. + - logger (Logger, optional): An instance of a logger to be used by the downloader. Defaults to None. 
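+ + Example (illustrative sketch; replace the path with your own config file): + >>> from distributed_downloader import download_images + >>> download_images("path/to/config.yaml")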
+ + Returns: + - None + """ + dd = DistributedDownloader.from_path(config_path) + if logger is not None: + dd.logger = logger + dd.download_images() diff --git a/src/distributed_downloader/core/fake_profiler.py b/src/distributed_downloader/core/fake_profiler.py new file mode 100644 index 0000000..e299ded --- /dev/null +++ b/src/distributed_downloader/core/fake_profiler.py @@ -0,0 +1,38 @@ +import os + +import pandas as pd + +from distributed_downloader.core.mpi_downloader.dataclasses import profile_dtype +from distributed_downloader.tools.config import Config + + +def main(): + config_path = os.environ.get("CONFIG_PATH") + if config_path is None: + raise ValueError("CONFIG_PATH not set") + + config = Config.from_path(config_path, "downloader") + + # Get parameters from config + _DEFAULT_RATE_LIMIT: int = config["downloader_parameters"]["default_rate_limit"] + server_urls_batched: str = config.get_folder("urls_folder") + server_profiler_csv: str = config.get_folder("profiles_table") + + # Perform profiling + server_list = os.listdir(server_urls_batched) + + profile_csv = [] + for i, server in enumerate(server_list): + if not os.path.isdir(f"{server_urls_batched}/{server}"): + continue + + server_name = server.split("=")[1] + server_total_partitions = len(os.listdir(f"{server_urls_batched}/{server}")) + profile_csv.append([server_name, server_total_partitions, 0, 0, _DEFAULT_RATE_LIMIT]) + + profiles_df = pd.DataFrame(profile_csv, columns=profile_dtype.names) + profiles_df.to_csv(server_profiler_csv, index=False, header=True) + + +if __name__ == "__main__": + main() diff --git a/src/distributed_downloader/core/initialization.py b/src/distributed_downloader/core/initialization.py new file mode 100644 index 0000000..8e7de9b --- /dev/null +++ b/src/distributed_downloader/core/initialization.py @@ -0,0 +1,101 @@ +import os.path +import uuid +from typing import Dict +from urllib.parse import urlparse + +import pyspark.sql.functions as func +from pyspark.sql import SparkSession, Window +from pyspark.sql.functions import udf +from pyspark.sql.types import StringType + +from distributed_downloader.core.schemes import multimedia_scheme +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.utils import load_dataframe, truncate_paths, init_logger + + +@udf(returnType=StringType()) +def get_server_name(url: str): + return urlparse(url).netloc + + +@udf(returnType=StringType()) +def get_uuid(): + return str(uuid.uuid4()) + + +def init_filestructure(file_structure: Dict[str, str]) -> None: + filtered_fs = [value for key, value in file_structure.items() if key not in ["inner_checkpoint_file", "ignored_table"]] + truncate_paths(filtered_fs) + + +if __name__ == "__main__": + config_path = os.environ.get("CONFIG_PATH") + if config_path is None: + raise ValueError("CONFIG_PATH not set") + + config = Config.from_path(config_path, "downloader") + + # Initialize parameters + input_path = config["path_to_input"] + # init_filestructure(config) + output_path = config.get_folder("urls_folder") + logger = init_logger(__name__) + + # Initialize SparkSession + spark = SparkSession.builder.appName("Multimedia prep").getOrCreate() + spark.conf.set("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED") + spark.conf.set("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED") + + multimedia_df = load_dataframe(spark, input_path, multimedia_scheme.schema) + + multimedia_df_prep = (multimedia_df + .filter((multimedia_df["gbifID"].isNotNull()) + & 
(multimedia_df["identifier"].isNotNull()) + & ( + (multimedia_df["type"] == "StillImage") + | ( + (multimedia_df["type"].isNull()) + & (multimedia_df["format"].contains("image")) + ) + )) + .repartition(20)) + + multimedia_df_prep = multimedia_df_prep.withColumn("server_name", + get_server_name(multimedia_df_prep.identifier)) + multimedia_df_prep = multimedia_df_prep.withColumn("UUID", get_uuid()) + + columns = multimedia_df_prep.columns + + logger.info("Starting batching") + + servers_grouped = (multimedia_df_prep + .select("server_name") + .groupBy("server_name") + .count() + .withColumn("batch_count", + func.floor(func.col("count") / config["downloader_parameters"]["batch_size"]))) + + window_part = Window.partitionBy("server_name").orderBy("server_name") + master_df_filtered = (multimedia_df_prep + .withColumn("row_number", func.row_number().over(window_part)) + .join(servers_grouped, ["server_name"]) + .withColumn("partition_id", func.col("row_number") % func.col("batch_count")) + .withColumn("partition_id", + (func + .when(func.col("partition_id").isNull(), 0) + .otherwise(func.col("partition_id")))) + .select(*columns, "partition_id")) + + logger.info("Writing to parquet") + + (master_df_filtered + .repartition("server_name", "partition_id") + .write + .partitionBy("server_name", "partition_id") + .mode("overwrite") + .format("parquet") + .save(output_path)) + + logger.info("Finished batching") + + spark.stop() diff --git a/src/distributed_downloader/core/main.py b/src/distributed_downloader/core/main.py new file mode 100644 index 0000000..4f48be5 --- /dev/null +++ b/src/distributed_downloader/core/main.py @@ -0,0 +1,186 @@ +import argparse +import csv +import os.path +from logging import Logger + +from distributed_downloader.tools.checkpoint import Checkpoint +from distributed_downloader.tools.config import Config + +from typing import Optional, Dict + +try: + from typing import LiteralString +except ImportError: + from typing_extensions import LiteralString + +from attr import define, field, Factory + +from distributed_downloader.core.initialization import init_filestructure +from distributed_downloader.tools.utils import submit_job, preprocess_dep_ids, init_logger + + +@define +class DistributedDownloader: + config: Config + + logger: Logger = field(default=Factory(lambda: init_logger(__name__))) + + urls_path: str = None + inner_checkpoint_path: str = None + profiles_path: str = None + schedules_folder: str = None + + inner_checkpoint: Checkpoint = None + _checkpoint_override: Optional[Dict[str, bool]] = None + default_checkpoint_structure = { + "batched": False, + "profiled": False, + "schedule_creation_scheduled": False, + } + + @classmethod + def from_path(cls, path: str, + checkpoint_override: Optional[Dict[str, bool]] = None) -> "DistributedDownloader": + return cls(config=Config.from_path(path, "downloader"), + checkpoint_override=checkpoint_override) + + def __attrs_post_init__(self): + self.urls_path = self.config.get_folder("urls_folder") + self.inner_checkpoint_path = self.config.get_folder("inner_checkpoint_file") + self.profiles_path = self.config.get_folder("profiles_table") + self.schedules_folder = os.path.join(self.config.get_folder("schedules_folder"), "current") + + self.inner_checkpoint = Checkpoint.from_path(self.inner_checkpoint_path, self.default_checkpoint_structure) + if self._checkpoint_override is not None: + for key, value in self._checkpoint_override.items(): + if key not in self.default_checkpoint_structure.keys(): + raise KeyError("Unknown key for 
override in checkpoint") + + self.inner_checkpoint[key] = value + + def __init_environment(self) -> None: + os.environ["CONFIG_PATH"] = self.config.config_path + + os.environ["ACCOUNT"] = self.config["account"] + os.environ["PATH_TO_INPUT"] = self.config["path_to_input"] + + os.environ["PATH_TO_OUTPUT"] = self.config["path_to_output_folder"] + for output_folder, output_path in self.config.folder_structure.items(): + os.environ["OUTPUT_" + output_folder.upper()] = output_path + + for downloader_var, downloader_value in self.config["downloader_parameters"].items(): + os.environ["DOWNLOADER_" + downloader_var.upper()] = str(downloader_value) + + self.logger.info("Environment initialized") + + def __schedule_initialization(self) -> int: + self.logger.info("Scheduling initialization script") + + init_filestructure(self.config.folder_structure) + + idx = submit_job(self.config.get_script("general_submitter"), + self.config.get_script("initialization_script")) + + self.logger.info(f"Submitted initialization script {idx}") + self.inner_checkpoint["batched"] = True + return idx + + def __schedule_profiling(self, prev_job_id: int = None) -> int: + self.logger.info("Scheduling profiling script") + idx = submit_job(self.config.get_script("general_submitter"), + self.config.get_script("profiling_script"), + *preprocess_dep_ids([prev_job_id])) + self.logger.info(f"Submitted profiling script {idx}") + self.inner_checkpoint["profiled"] = True + return idx + + def __schedule_downloading(self, prev_job_id: int = None) -> None: + self.logger.info("Scheduling downloading scripts") + + if self.__check_downloading(): + self.logger.info("All images already downloaded") + return + + all_prev_ids = [prev_job_id] + + if os.path.exists(self.schedules_folder): + for schedule in os.listdir(self.schedules_folder): + if not os.path.exists(os.path.join(self.schedules_folder, schedule, "_jobs_ids.csv")): + continue + with open(os.path.join(self.schedules_folder, schedule, "_jobs_ids.csv"), "r") as file: + all_prev_ids.append(int(list(csv.DictReader(file))[-1]["job_id"])) + + schedule_creation_id = submit_job(self.config.get_script("schedule_creator_submitter"), + self.config.get_script("schedule_creation_script"), + *preprocess_dep_ids(all_prev_ids)) + self.logger.info(f"Submitted schedule creation script {schedule_creation_id}") + self.inner_checkpoint["schedule_creation_scheduled"] = True + + def __check_downloading(self) -> bool: + if not os.path.exists(self.schedules_folder): + return False + + done = True + for schedule in os.listdir(self.schedules_folder): + schedule_path = os.path.join(self.schedules_folder, schedule) + if os.path.exists(f"{schedule_path}/_UNCHANGED"): + self.logger.warning(f"Schedule {schedule} is unchanged") + if not self.config.get("suppress_unchanged_error", False): + raise ValueError(f"Schedule {schedule} is unchanged, which can lead to infinite loop") + done = done and os.path.exists(f"{schedule_path}/_DONE") + + return done + + def download_images(self) -> None: + self.__init_environment() + + initialization_job_id = None + if ( + not os.path.exists(self.urls_path) + or not self.inner_checkpoint["batched"] + ): + initialization_job_id = self.__schedule_initialization() + else: + self.logger.info("Skipping initialization script: already batched") + + profiling_job_id = None + if ( + not os.path.exists(self.profiles_path) + or not self.inner_checkpoint["profiled"] + ): + profiling_job_id = self.__schedule_profiling(initialization_job_id) + else: + self.logger.info("Skipping profiling script: 
already profiled") + + if not self.inner_checkpoint["schedule_creation_scheduled"]: + self.__schedule_downloading(profiling_job_id) + else: + self.logger.error("Schedule creation already scheduled") + + +def main() -> None: + parser = argparse.ArgumentParser(description='Distributed downloader') + parser.add_argument("config_path", metavar="config_path", type=str, + help="the name of the tool that is intended to be used") + parser.add_argument("--reset_batched", action="store_true", help="Will reset filtering and scheduling steps") + parser.add_argument("--reset_profiled", action="store_true", help="Will reset scheduling step") + _args = parser.parse_args() + + config_path = _args.config_path + state_override = None + if _args.reset_filtering: + state_override = { + "batched": False, + "profiled": False + } + elif _args.reset_scheduling: + state_override = { + "profiled": False + } + + dd = DistributedDownloader.from_path(config_path, state_override) + dd.download_images() + + +if __name__ == "__main__": + main() diff --git a/src/mpi_downloader/DirectWriter.py b/src/distributed_downloader/core/mpi_downloader/DirectWriter.py similarity index 73% rename from src/mpi_downloader/DirectWriter.py rename to src/distributed_downloader/core/mpi_downloader/DirectWriter.py index 5f850a3..b03beb6 100644 --- a/src/mpi_downloader/DirectWriter.py +++ b/src/distributed_downloader/core/mpi_downloader/DirectWriter.py @@ -5,8 +5,7 @@ import pandas as pd -from mpi_downloader import CompletedBatch -from mpi_downloader.dataclasses import error_entry, success_entry, success_dtype, error_dtype +from .dataclasses import ErrorEntry, SuccessEntry, CompletedBatch def write_batch( @@ -29,11 +28,8 @@ def write_batch( try: for _ in range(completed_batch.success_queue.qsize()): - # if job_end_time - time.time() < 0: - # raise TimeoutError("Not enough time") - success = completed_batch.success_queue.get() - success_entity = success_entry.to_list_download(success) + success_entity = SuccessEntry.to_list_download(success) successes_list.append(success_entity) completed_batch.success_queue.task_done() @@ -41,7 +37,7 @@ def write_batch( for _ in range(completed_batch.error_queue.qsize()): error = completed_batch.error_queue.get() - error_entity = error_entry.to_list_download(error) + error_entity = ErrorEntry.to_list_download(error) errors_list.append(error_entity) completed_batch.error_queue.task_done() @@ -49,9 +45,10 @@ def write_batch( logger.info(f"Completed collecting entries for {output_path}") - pd.DataFrame(successes_list, columns=success_dtype(720).names).to_parquet(f"{output_path}/successes.parquet", + pd.DataFrame(successes_list, columns=SuccessEntry.get_names()).to_parquet(f"{output_path}/successes.parquet", index=False) - pd.DataFrame(errors_list, columns=error_dtype.names).to_parquet(f"{output_path}/errors.parquet", index=False) + pd.DataFrame(errors_list, columns=ErrorEntry.get_names()).to_parquet(f"{output_path}/errors.parquet", + index=False) logger.info(f"Completed writing to {output_path}") diff --git a/src/mpi_downloader/Downloader.py b/src/distributed_downloader/core/mpi_downloader/Downloader.py similarity index 99% rename from src/mpi_downloader/Downloader.py rename to src/distributed_downloader/core/mpi_downloader/Downloader.py index 28dcc1e..6bc06ad 100644 --- a/src/mpi_downloader/Downloader.py +++ b/src/distributed_downloader/core/mpi_downloader/Downloader.py @@ -10,7 +10,7 @@ import numpy as np import requests -from mpi_downloader.dataclasses import DownloadedImage, CompletedBatch, RateLimit +from 
.dataclasses import DownloadedImage, CompletedBatch, RateLimit _MAX_RETRIES = 5 _TIMEOUT = 5 diff --git a/src/mpi_downloader/PreLoader.py b/src/distributed_downloader/core/mpi_downloader/PreLoader.py similarity index 81% rename from src/mpi_downloader/PreLoader.py rename to src/distributed_downloader/core/mpi_downloader/PreLoader.py index 94ad528..7c635c4 100644 --- a/src/mpi_downloader/PreLoader.py +++ b/src/distributed_downloader/core/mpi_downloader/PreLoader.py @@ -11,7 +11,7 @@ def load_batch( ) -> Iterator[List[Dict[str, Any]]]: for batch_id in batches_to_download: server_df = pd.read_parquet( - f"{path_to_parquet}/ServerName={re.sub(':', '%3A', server_name)}/partition_id={batch_id}") + f"{path_to_parquet}/ServerName={server_name.replace(':', '%3A')}/partition_id={batch_id}") yield server_df.to_dict("records") diff --git a/src/__init__.py b/src/distributed_downloader/core/mpi_downloader/__init__.py similarity index 100% rename from src/__init__.py rename to src/distributed_downloader/core/mpi_downloader/__init__.py diff --git a/src/mpi_downloader/dataclasses.py b/src/distributed_downloader/core/mpi_downloader/dataclasses.py similarity index 70% rename from src/mpi_downloader/dataclasses.py rename to src/distributed_downloader/core/mpi_downloader/dataclasses.py index 30b37c0..ef0c159 100644 --- a/src/mpi_downloader/dataclasses.py +++ b/src/distributed_downloader/core/mpi_downloader/dataclasses.py @@ -21,7 +21,7 @@ class DownloadedImage: error_msg: str unique_name: str - gbifID: int + source_id: int identifier: str is_license_full: bool license: str @@ -44,8 +44,8 @@ def from_row(cls, row: Dict[str, Any]) -> DownloadedImage: retry_count=0, error_code=0, error_msg="", - unique_name=row.get("UUID", uuid.uuid4().hex), - gbifID=row.get("gbifID", 0), + unique_name=row.get("uuid", uuid.uuid4().hex), + source_id=row.get("source_id", 0), identifier=row.get("identifier", ""), is_license_full=all([row.get("license", None), row.get("source", None), row.get("title", None)]), license=row.get("license", _NOT_PROVIDED) or _NOT_PROVIDED, @@ -60,7 +60,7 @@ def init_downloaded_image_entry(image_entry: np.ndarray, row: Dict[str, Any]) -> image_entry["error_code"] = 0 image_entry["error_msg"] = "" image_entry["uuid"] = row.get("UUID", uuid.uuid4().hex) - image_entry["gbif_id"] = row.get("gbifID", 0) + image_entry["source_id"] = row.get("source_id", 0) image_entry["identifier"] = row.get("identifier", "") image_entry["is_license_full"] = all([row.get("license", None), row.get("source", None), row.get("title", None)]) image_entry["license"] = row.get("license", _NOT_PROVIDED) or _NOT_PROVIDED @@ -71,9 +71,9 @@ def init_downloaded_image_entry(image_entry: np.ndarray, row: Dict[str, Any]) -> @define -class success_entry: +class SuccessEntry: uuid: str - gbif_id: int + source_id: int identifier: str is_license_full: bool license: str @@ -85,13 +85,52 @@ class success_entry: resized_size: np.ndarray[np.uint32] image: bytes - # image: np.ndarray + def __success_dtype(self, img_size: int): + return np.dtype([ + ("uuid", "S32"), + ("source_id", "i4"), + ("identifier", "S256"), + ("is_license_full", "bool"), + ("license", "S256"), + ("source", "S256"), + ("title", "S256"), + ("original_size", "(2,)u4"), + ("resized_size", "(2,)u4"), + ("hashsum_original", "S32"), + ("hashsum_resized", "S32"), + ("image", f"({img_size},{img_size},3)uint8") + ]) + + @staticmethod + def get_success_spark_scheme(): + from pyspark.sql.types import StructType + from pyspark.sql.types import StringType + from pyspark.sql.types import 
LongType + from pyspark.sql.types import StructField + from pyspark.sql.types import BooleanType + from pyspark.sql.types import ArrayType + from pyspark.sql.types import BinaryType + + return StructType([ + StructField("uuid", StringType(), False), + StructField("source_id", LongType(), False), + StructField("identifier", StringType(), False), + StructField("is_license_full", BooleanType(), False), + StructField("license", StringType(), True), + StructField("source", StringType(), True), + StructField("title", StringType(), True), + StructField("original_size", ArrayType(LongType(), False), False), + StructField("resized_size", ArrayType(LongType(), False), False), + StructField("hashsum_original", StringType(), False), + StructField("hashsum_resized", StringType(), False), + StructField("image", BinaryType(), False) + ]) @classmethod - def from_downloaded(cls, downloaded: DownloadedImage) -> success_entry: + def from_downloaded(cls, downloaded: DownloadedImage) -> SuccessEntry: return cls( uuid=downloaded.unique_name, - gbif_id=downloaded.gbifID, + source_id=downloaded.source_id, identifier=downloaded.identifier, is_license_full=downloaded.is_license_full, license=downloaded.license, @@ -108,7 +147,7 @@ def from_downloaded(cls, downloaded: DownloadedImage) -> success_entry: def to_list_download(downloaded: DownloadedImage) -> List: return [ downloaded.unique_name, - downloaded.gbifID, + downloaded.source_id, downloaded.identifier, downloaded.is_license_full, downloaded.license, @@ -121,10 +160,27 @@ def to_list_download(downloaded: DownloadedImage) -> List: downloaded.image ] + @staticmethod + def get_names() -> List[str]: + return [ + "uuid", + "source_id", + "identifier", + "is_license_full", + "license", + "source", + "title", + "original_size", + "resized_size", + "hashsum_original", + "hashsum_resized", + "image" + ] + def to_list(self) -> List: return [ self.uuid, - self.gbif_id, + self.source_id, self.identifier, self.is_license_full, self.license, @@ -141,7 +197,7 @@ def to_np(self) -> np.ndarray: np_structure = np.array( [ (self.uuid, - self.gbif_id, + self.source_id, self.identifier, self.is_license_full, self.license, @@ -153,21 +209,29 @@ def to_np(self) -> np.ndarray: self.hashsum_resized, self.image) ], - dtype=success_dtype(np.max(self.resized_size))) + dtype=self.__success_dtype(np.max(self.resized_size))) return np_structure @define -class error_entry: +class ErrorEntry: uuid: str identifier: str retry_count: int error_code: int error_msg: str + _error_dtype = np.dtype([ + ("uuid", "S32"), + ("identifier", "S256"), + ("retry_count", "i4"), + ("error_code", "i4"), + ("error_msg", "S256") + ]) + @classmethod - def from_downloaded(cls, downloaded: DownloadedImage) -> error_entry: + def from_downloaded(cls, downloaded: DownloadedImage) -> ErrorEntry: return cls( uuid=downloaded.unique_name, identifier=downloaded.identifier, @@ -204,10 +268,20 @@ def to_np(self) -> np.ndarray: self.error_code, self.error_msg) ], - dtype=error_dtype) + dtype=self._error_dtype) return np_structure + @staticmethod + def get_names() -> List[str]: + return [ + "uuid", + "identifier", + "retry_count", + "error_code", + "error_msg" + ] + @define class ImageBatchesByServerToRequest: @@ -272,48 +346,6 @@ def change_rate(self, new_rate: float): self.upper_bound = self.initial_rate * (1 + self._multiplier) -success_dtype = lambda img_size: np.dtype([ - ("uuid", "S32"), - ("gbif_id", "i4"), - ("identifier", "S256"), - ("is_license_full", "bool"), - ("license", "S256"), - ("source", "S256"), - ("title", 
"S256"), - ("original_size", "(2,)u4"), - ("resized_size", "(2,)u4"), - ("hashsum_original", "S32"), - ("hashsum_resized", "S32"), - ("image", f"({img_size},{img_size},3)uint8") -]) - -error_dtype = np.dtype([ - ("uuid", "S32"), - ("identifier", "S256"), - ("retry_count", "i4"), - ("error_code", "i4"), - ("error_msg", "S256") -]) - -download_dtype = lambda img_size: np.dtype([ - ("is_downloaded", "bool"), - ("retry_count", "i4"), - ("error_code", "i4"), - ("error_msg", "S256"), - ("uuid", "S32"), - ("gbif_id", "i4"), - ("identifier", "S256"), - ("is_license_full", "bool"), - ("license", "S256"), - ("source", "S256"), - ("title", "S256"), - ("original_size", "(2,)u4"), - ("resized_size", "(2,)u4"), - ("hashsum_original", "S32"), - ("hashsum_resized", "S32"), - ("image", f"({img_size},{img_size},3)uint8") -]) - profile_dtype = np.dtype([ ("server_name", "S256"), ("total_batches", "i4"), diff --git a/src/mpi_downloader/utils.py b/src/distributed_downloader/core/mpi_downloader/utils.py similarity index 53% rename from src/mpi_downloader/utils.py rename to src/distributed_downloader/core/mpi_downloader/utils.py index 5a7368c..b7cc3a8 100644 --- a/src/mpi_downloader/utils.py +++ b/src/distributed_downloader/core/mpi_downloader/utils.py @@ -2,7 +2,6 @@ import logging import os -import re import shutil import time from typing import Dict, Tuple, Union, List, Any, Deque, Set @@ -12,8 +11,8 @@ from requests.adapters import HTTPAdapter from urllib3 import Retry -from mpi_downloader.Downloader import Downloader -from mpi_downloader.dataclasses import RateLimit +from .Downloader import Downloader +from .dataclasses import RateLimit def create_new_session(url: str, max_rate: int) -> requests.Session: @@ -26,17 +25,6 @@ def create_new_session(url: str, max_rate: int) -> requests.Session: return session -def truncate_folder(path: str): - shutil.rmtree(path, ignore_errors=True) - os.makedirs(path, exist_ok=True) - - -def truncate_server_folders(path: str) -> None: - for file in os.listdir(path): - if os.path.isdir(f"{path}/{file}") and "ServerName" in file: - shutil.rmtree(f"{path}/{file}", ignore_errors=True) - - def get_latest_schedule(path_to_dir: str, rank: int = None) -> Union[pd.DataFrame, None]: if not os.path.exists(path_to_dir) or not os.path.isdir(path_to_dir): return None @@ -51,7 +39,7 @@ def get_latest_schedule(path_to_dir: str, rank: int = None) -> Union[pd.DataFram latest_schedule_df = pd.read_csv(f"{path_to_dir}/{latest_schedule_file}") if rank is not None: - return latest_schedule_df[latest_schedule_df["Rank"] == rank] + return latest_schedule_df[latest_schedule_df["rank"] == rank] return latest_schedule_df @@ -62,81 +50,38 @@ def get_or_init_downloader(header: dict, rate_multiplier: float, job_end_time: int, logger: logging.Logger) -> Tuple[Downloader, requests.Session, RateLimit]: - if schedule_dict["ServerName"] not in downloader_schedule.keys(): - server_name = schedule_dict["ServerName"].replace("%3A", ":") - rate_limit = RateLimit(schedule_dict["RateLimit"], rate_multiplier) + if schedule_dict["server_name"] not in downloader_schedule.keys(): + server_name = schedule_dict["server_name"].replace("%3A", ":") + rate_limit = RateLimit(schedule_dict["rate_limit"], rate_multiplier) session = create_new_session(server_name, rate_limit.upper_bound) downloader = Downloader(header, session, rate_limit, img_size, job_end_time=job_end_time, logger=logger) - downloader_schedule[schedule_dict["ServerName"]] = (downloader, session, rate_limit) + downloader_schedule[schedule_dict["server_name"]] = 
(downloader, session, rate_limit) - downloader, session, rate_limit = downloader_schedule[schedule_dict["ServerName"]] + downloader, session, rate_limit = downloader_schedule[schedule_dict["server_name"]] return downloader, session, rate_limit def generate_ids_to_download(schedule_row: pd.Series, verifier_df: pd.DataFrame) -> pd.Series: - server_name = schedule_row["ServerName"] - server_start_idx = schedule_row["StartIndex"] - server_end_idx = schedule_row["EndIndex"] + server_name = schedule_row["server_name"] + server_start_idx = schedule_row["start_index"] + server_end_idx = schedule_row["end_index"] server_batches: Set[int] = set(range(server_start_idx, server_end_idx + 1)) - # max_batch_idx = 0 verifier_df = verifier_df[ - (verifier_df["ServerName"] == server_name) & (verifier_df["PartitionId"] >= server_start_idx) & ( - verifier_df["PartitionId"] <= server_end_idx)] - verifier_set = set(verifier_df["PartitionId"]) + (verifier_df["server_name"] == server_name) & (verifier_df["partition_id"] >= server_start_idx) & ( + verifier_df["partition_id"] <= server_end_idx)] + verifier_set = set(verifier_df["partition_id"]) server_batches = server_batches - verifier_set # server_batches.extend(range(max_batch_idx, server_end_idx + 1)) - return pd.Series([server_name, list(server_batches)], index=["ServerName", "Batches"]) - - -def verify_batches_for_prep(schedule_row: pd.DataFrame, input_path: str) -> pd.DataFrame: - schedule_row["ServerName"] = schedule_row["server_name"] - schedule_row["StartIndex"] = 0 - schedule_row["EndIndex"] = schedule_row["total_batches"] - - verification_df = pd.DataFrame(columns=["ServerName", "PartitionId", "Status"]) - - for idx, row in schedule_row.iterrows(): - new_verification_df = verify_downloaded_batches(row, input_path) - verification_df = pd.concat([verification_df, pd.DataFrame(new_verification_df)], - ignore_index=True).drop_duplicates() - - return verification_df - - -def verify_downloaded_batches(schedule_row: pd.Series, input_path: str) -> List[Dict[str, Any]]: - server_name = schedule_row["ServerName"] - server_start_idx = schedule_row["StartIndex"] - server_end_idx = schedule_row["EndIndex"] - verified_batches: List[Dict[str, Any]] = [] - - if os.path.exists(f"{input_path}/ServerName={server_name}"): - server_batches_names = os.listdir(f"{input_path}/ServerName={server_name}") - for batch_name in server_batches_names: - if not os.path.isdir(f"{input_path}/ServerName={server_name}/{batch_name}"): - continue - - batch_idx = int(batch_name.split("=")[1]) - if server_start_idx > batch_idx or server_end_idx < batch_idx: - continue - - if os.path.exists(f"{input_path}/ServerName={server_name}/{batch_name}/completed"): - verified_batches.append({"ServerName": server_name, "PartitionId": batch_idx, "Status": "Completed"}) - elif os.path.exists(f"{input_path}/ServerName={server_name}/{batch_name}/failed"): - verified_batches.append({"ServerName": server_name, "PartitionId": batch_idx, "Status": "Failed"}) - - return verified_batches - # if len(verified_batches) == 0: - # return pd.DataFrame(columns=["ServerName", "PartitionId", "Status"]) - # return pd.DataFrame(verified_batches) + return pd.Series([server_name, list(server_batches)], index=["server_name", "batches"]) def separate_to_blocks(data_row: pd.Series) -> List[List[Tuple[int, int]]]: - batches: List[int] = data_row["Batches"] - num_of_blocks: int = data_row["ProcessPerNode"] * data_row["Nodes"] + batches: List[int] = data_row["batches"] + num_of_blocks: int = data_row["process_per_node"] * 
data_row["nodes"] blocks: List[List[Tuple[int, int]]] = [] if len(batches) < 1: @@ -183,11 +128,9 @@ def get_largest_nonempty_bucket(buckets: Dict[int, Deque[Dict[str, Any]]], avail return largest_bucket -def is_enough_time(rate_limit: RateLimit, batch_size: int = 10000, avg_write_time: int = 600, job_end_time: int = int(os.getenv("SLURM_JOB_END_TIME", 0))) -> bool: +def is_enough_time(rate_limit: RateLimit, batch_size: int = 10000, avg_write_time: int = 600, + job_end_time: int = int(os.getenv("SLURM_JOB_END_TIME", 0))) -> bool: current_time = time.time() - - # print(f"{current_time}|{job_end_time}") - time_left = job_end_time - current_time - avg_write_time return rate_limit.initial_rate * time_left >= batch_size diff --git a/src/schemes/__init__.py b/src/distributed_downloader/core/schemes/__init__.py similarity index 100% rename from src/schemes/__init__.py rename to src/distributed_downloader/core/schemes/__init__.py diff --git a/src/schemes/multimedia_scheme.py b/src/distributed_downloader/core/schemes/multimedia_scheme.py similarity index 100% rename from src/schemes/multimedia_scheme.py rename to src/distributed_downloader/core/schemes/multimedia_scheme.py diff --git a/src/distributed_downloader/core/utils.py b/src/distributed_downloader/core/utils.py new file mode 100644 index 0000000..284c7ee --- /dev/null +++ b/src/distributed_downloader/core/utils.py @@ -0,0 +1,108 @@ +import os +from collections import deque +from typing import List, Deque, Dict, Any + +try: + from typing import LiteralString +except ImportError: + from typing_extensions import LiteralString + +import pandas as pd + + +def verify_downloaded_batches(schedule_row: pd.Series, input_path: str) -> List[Dict[str, Any]]: + server_name = schedule_row["server_name"] + server_start_idx = schedule_row["start_index"] + server_end_idx = schedule_row["end_index"] + verified_batches: List[Dict[str, Any]] = [] + + if os.path.exists( + f"{input_path}/server_name={server_name}"): # TODO: Make "server_name" changeable column from config + server_batches_names = os.listdir(f"{input_path}/server_name={server_name}") + for batch_name in server_batches_names: + if not os.path.isdir(f"{input_path}/server_name={server_name}/{batch_name}"): + continue + + batch_idx = int(batch_name.split("=")[1]) + if server_start_idx > batch_idx or server_end_idx < batch_idx: + continue + + if os.path.exists(f"{input_path}/server_name={server_name}/{batch_name}/completed"): + verified_batches.append({"server_name": server_name, "partition_id": batch_idx, "status": "Completed"}) + elif os.path.exists(f"{input_path}/server_name={server_name}/{batch_name}/failed"): + verified_batches.append({"server_name": server_name, "partition_id": batch_idx, "status": "Failed"}) + + return verified_batches + + +def verify_batches_for_prep(schedule_row: pd.DataFrame, input_path: str) -> pd.DataFrame: + schedule_row["start_index"] = 0 + schedule_row["end_index"] = schedule_row["total_batches"] + + verification_df = pd.DataFrame(columns=["server_name", "partition_id", "status"]) + + for idx, row in schedule_row.iterrows(): + new_verification_df = verify_downloaded_batches(row, input_path) + verification_df = pd.concat([verification_df, pd.DataFrame(new_verification_df)], + ignore_index=True).drop_duplicates() + + return verification_df + + +def split_dataframe(df: pd.DataFrame, by_column: str = "nodes", chunk_size=20) -> List[pd.DataFrame]: + chunks: List[pd.DataFrame] = [] + + row_list = df.to_dict("records") + + if len(row_list) == 0: + raise ValueError("Empty list") 
+ + chunks.append(pd.DataFrame(row_list[0], index=[0])) + del row_list[0] + + while len(row_list) > 0: + i = 0 + + chunk = chunks[-1] + + while len(row_list) > 0 and i < len(row_list): + new_chunk = row_list[i] + column_value = chunk[by_column].sum() + new_chunk[by_column] + + if column_value <= chunk_size: + chunks[-1] = pd.concat([chunk, pd.DataFrame(new_chunk, index=[0])], ignore_index=True) + del row_list[i] + break + + i += 1 + else: + if len(row_list) == 0: + break + + chunks.append(pd.DataFrame(row_list[0], index=[0])) + del row_list[0] + + return chunks + + +def create_schedule_configs(group: pd.DataFrame, number_of_workers: int, schedule_path: str, + by_column: str = "nodes") -> None: + group = group.sort_values(by=[by_column], ascending=False).reset_index() + + chunked_group: Deque[pd.DataFrame] = deque(split_dataframe(group, by_column, number_of_workers)) + all_schedules = [int(folder) for folder in os.listdir(schedule_path) if os.path.isdir(f"{schedule_path}/{folder}")] + number_of_schedules = 0 + if len(all_schedules) > 0: + number_of_schedules: int = sorted(all_schedules, reverse=True)[0] + 1 + + while len(chunked_group) > 0: + chunk = chunked_group.popleft() + + while len(chunked_group) > 0 and chunk["total_batches"].sum() < number_of_workers * 50: + chunk = pd.concat([chunk, chunked_group.popleft()], ignore_index=True) + + chunk_folder = f"{schedule_path}/{number_of_schedules:0=4}" + os.mkdir(chunk_folder) + chunk.to_csv(f"{chunk_folder}/_config.csv", index=False, header=True) + + number_of_schedules += 1 diff --git a/src/distributed_downloader/tools/__init__.py b/src/distributed_downloader/tools/__init__.py new file mode 100644 index 0000000..6d21e4c --- /dev/null +++ b/src/distributed_downloader/tools/__init__.py @@ -0,0 +1,40 @@ +from logging import Logger +from typing import Optional + +from .checkpoint import Checkpoint +from .config import Config +from .filters import * +from .main import Tools +from .registry import ToolsRegistryBase +from .runners import * +from .schedulers import * +from .utils import ( + ensure_created, + get_id, + init_logger, + load_dataframe, + preprocess_dep_ids, + submit_job, + truncate_paths, +) + + +def apply_tools(config_path: str, tool_name: str, logger: Optional[Logger] = None) -> None: + """ + Applies a tool to the images downloaded by the DistributedDownloader. + + This function creates an instance of `DistributedDownloader` using a configuration file path, + optionally sets a logger for the downloader, and then applies the specified tool to the downloaded images. + + Parameters: + - config_path (str): The file path to the configuration file required to initialize the downloader. + - tool_name (str): The name of the tool to be applied to the downloaded images. + - logger (Logger, optional): An instance of a logger to be used by the downloader. Defaults to None. 
+ + Returns: + - None + """ + dd_tools = Tools.from_path(config_path, tool_name) + if logger is not None: + dd_tools.logger = logger + dd_tools.apply_tool() diff --git a/src/distributed_downloader/tools/checkpoint.py b/src/distributed_downloader/tools/checkpoint.py new file mode 100644 index 0000000..799d241 --- /dev/null +++ b/src/distributed_downloader/tools/checkpoint.py @@ -0,0 +1,40 @@ +from typing import Dict + +import yaml +from attr import define + + +@define +class Checkpoint: + inner_checkpoint: Dict[str, bool] + inner_checkpoint_path: str + + @classmethod + def from_path(cls, path: str, default_structure: Dict[str, bool]) -> "Checkpoint": + return cls(inner_checkpoint_path=path, inner_checkpoint=cls.__load_checkpoint(path, default_structure)) + + @staticmethod + def __load_checkpoint(path: str, default_structure: Dict[str, bool]) -> Dict[str, bool]: + try: + with open(path, "r") as file: + checkpoint = yaml.full_load(file) + for key, value in default_structure.items(): + if key not in checkpoint: + checkpoint[key] = value + return checkpoint + except FileNotFoundError: + return default_structure + + def __save_checkpoint(self) -> None: + with open(self.inner_checkpoint_path, "w") as file: + yaml.dump(self.inner_checkpoint, file) + + def __getitem__(self, item): + return self.inner_checkpoint[item] + + def get(self, item, default=None): + return self.inner_checkpoint.get(item, default) + + def __setitem__(self, key, value): + self.inner_checkpoint[key] = value + self.__save_checkpoint() diff --git a/src/distributed_downloader/tools/config.py b/src/distributed_downloader/tools/config.py new file mode 100644 index 0000000..1716f99 --- /dev/null +++ b/src/distributed_downloader/tools/config.py @@ -0,0 +1,77 @@ +import os.path +from pathlib import Path +from typing import Any, Dict, Literal + +import yaml +from attr import define + +TEMPLATE_PATH = { + "downloader": "config_templates/downloader.yaml", + "tools": "config_templates/tools.yaml" +} + + +def is_subset(template: Dict, config: Dict) -> bool: + is_subset_result = set(template.keys()).issubset(config.keys()) + for key, value in template.items(): + if isinstance(value, dict): + is_subset_result &= is_subset(value, config[key]) + return is_subset_result + + +@define +class Config: + config_path: str + _cfg_dict: Dict[str, str | int | bool | Dict[str, Any]] + scripts: Dict[str, str] + folder_structure: Dict[str, str] + + @classmethod + def from_path(cls, path: str, config_type: Literal["downloader", "tools"]) -> "Config": + cfg = cls.__load_config(path) + + assert cls.__check_config(cfg, config_type), "Config is not valid" + + # print(cfg) + return cls(config_path=path, + cfg_dict=cfg, + scripts=cls.__load_scripts(cfg), + folder_structure=cls.__load_folder_structure(cfg)) + + @staticmethod + def __check_config(cfg: Dict[str, str | int | bool | Dict[str, Any]], + config_type: Literal["downloader", "tools"]) -> bool: + config_template_path = os.path.join(Path(__file__).parent.absolute(), TEMPLATE_PATH[config_type]) + if not os.path.exists(config_template_path): + raise FileNotFoundError(f"Config template not found, can't check it {config_template_path}") + with open(config_template_path, "r") as f: + config_template = yaml.full_load(f) + return is_subset(config_template, cfg) + + @staticmethod + def __load_config(path: str) -> Dict[str, str | int | bool | Dict[str, Any]]: + with open(path, "r") as file: + return yaml.full_load(file) + + @staticmethod + def __load_scripts(cfg: Dict[str, str | int | bool | Dict[str, Any]]) -> 
Dict[str, str]: + return cfg["scripts"] + + @staticmethod + def __load_folder_structure(cfg: Dict[str, str | int | bool | Dict[str, Any]]) -> Dict[str, str]: + file_structure: Dict[str, str] = {"path_to_output_folder": cfg["path_to_output_folder"]} + file_structure.update( + {key: os.path.join(cfg["path_to_output_folder"], value) for key, value in cfg["output_structure"].items()}) + return file_structure + + def get_script(self, script_name: str) -> str: + return self.scripts[script_name] + + def get_folder(self, folder_name: str) -> str: + return self.folder_structure[folder_name] + + def get(self, key: str, default: Any = None) -> str | int | bool | Dict[str, Any]: + return self._cfg_dict.get(key, default) + + def __getitem__(self, key: str) -> str | int | bool | Dict[str, Any]: + return self._cfg_dict[key] diff --git a/src/distributed_downloader/tools/config_templates/downloader.yaml b/src/distributed_downloader/tools/config_templates/downloader.yaml new file mode 100644 index 0000000..2473371 --- /dev/null +++ b/src/distributed_downloader/tools/config_templates/downloader.yaml @@ -0,0 +1,50 @@ +account: "" +path_to_input: "" +path_to_output_folder: "" + +scripts: + # Wrapper scripts to submit jobs to the cluster + general_submitter: "" + mpi_submitter: "" + schedule_creator_submitter: "" + # Cluster job's scripts + initialization_script: "" + profiling_script: "" + schedule_creation_script: "" + verify_script: "" + download_script: "" + +# Rules for the schedule creation +# They determine how many simultaneous downloader instances can be run on the same server +# Rules are based on the number of batches required to be downloaded from the server +# Rule is: key - number of batches, value - number of instances; if server has more than key batches, value instances can be run +# Server with 0 batches is considered to be downloaded and are ignored +# Default value is 1 +# Order of the rules does not matter +schedule_rules: + 1: 1 + +# Structure of the output folder that will be created automatically +output_structure: + urls_folder: "" + logs_folder: "" + images_folder: "" + schedules_folder: "" + profiles_table: "" + ignored_table: "" + inner_checkpoint_file: "" + tools_folder: "" + +# Parameters for the downloader +suppress_unchanged_error: False +downloader_parameters: + num_downloads: 0 + max_nodes: 0 + workers_per_node: 0 + cpu_per_worker: 0 + header: "" + image_size: 0 + logger_level: "" + batch_size: 0 + rate_multiplier: 0 + default_rate_limit: 0 diff --git a/src/distributed_downloader/tools/config_templates/tools.yaml b/src/distributed_downloader/tools/config_templates/tools.yaml new file mode 100644 index 0000000..47b486b --- /dev/null +++ b/src/distributed_downloader/tools/config_templates/tools.yaml @@ -0,0 +1,41 @@ +account: "" +path_to_input: "" +path_to_output_folder: "" + +scripts: + # Wrapper scripts to submit jobs to the cluster + tools_submitter: "" + # tools scripts + tools_filter_script: "" + tools_scheduling_script: "" + tools_worker_script: "" + tools_verification_script: "" + +# Rules for the schedule creation +# They determine how many simultaneous downloader instances can be run on the same server +# Rules are based on the number of batches required to be downloaded from the server +# Rule is: key - number of batches, value - number of instances; if server has more than key batches, value instances can be run +# Server with 0 batches is considered to be downloaded and are ignored +# Default value is 1 +# Order of the rules does not matter +schedule_rules: + 1: 1 + 
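+# Illustrative example (hypothetical values, not defaults): with the rules below,
+# a server with more than 100 pending batches could run 2 simultaneous instances,
+# and one with more than 1000 batches could run 4:
+#   100: 2
+#   1000: 4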
+# Structure of the output folder that will be created automatically +output_structure: + urls_folder: "" + logs_folder: "" + images_folder: "" + schedules_folder: "" + profiles_table: "" + ignored_table: "" + inner_checkpoint_file: "" + tools_folder: "" + +tools_parameters: + num_workers: 0 + max_nodes: 0 + workers_per_node: 0 + cpu_per_worker: 0 + threshold_size: 0 + new_resize_size: 0 diff --git a/src/distributed_downloader/tools/filter.py b/src/distributed_downloader/tools/filter.py new file mode 100644 index 0000000..2e6fc2c --- /dev/null +++ b/src/distributed_downloader/tools/filter.py @@ -0,0 +1,31 @@ +import argparse +import os + +from distributed_downloader.tools.utils import init_logger +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.registry import ToolsRegistryBase + +if __name__ == "__main__": + config_path = os.environ.get("CONFIG_PATH") + if config_path is None: + raise ValueError("CONFIG_PATH not set") + + config = Config.from_path(config_path, "tools") + logger = init_logger(__name__) + + parser = argparse.ArgumentParser(description='Filtering step of the Tool') + parser.add_argument("filter_name", metavar="filter_name", type=str, + help="the name of the tool that is intended to be used") + _args = parser.parse_args() + tool_name = _args.filter_name + + assert tool_name in ToolsRegistryBase.TOOLS_REGISTRY.keys(), ValueError("unknown filter") + + tool_filter = ToolsRegistryBase.TOOLS_REGISTRY[tool_name]["filter"](config) + + logger.info("Starting filter") + tool_filter.run() + + logger.info("completed filtering") + + tool_filter = None diff --git a/src/distributed_downloader/tools/filters.py b/src/distributed_downloader/tools/filters.py new file mode 100644 index 0000000..7fd86b5 --- /dev/null +++ b/src/distributed_downloader/tools/filters.py @@ -0,0 +1,180 @@ +import os.path +from functools import partial + +import pandas as pd +import pyspark.sql as ps +import pyspark.sql.functions as func +from pyspark.sql import SparkSession + +from distributed_downloader.core.mpi_downloader.dataclasses import SuccessEntry +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.registry import ToolsBase +from distributed_downloader.tools.registry import ToolsRegistryBase + +FilterRegister = partial(ToolsRegistryBase.register, "filter") +__all__ = ["FilterRegister", + "SizeBasedFiltering", + "DuplicatesBasedFiltering", + "ResizeToolFilter", + "ImageVerificationToolFilter"] + + +class FilterToolBase(ToolsBase): + def __init__(self, cfg: Config): + super().__init__(cfg) + + self.filter_family = "filter" + + +class SparkFilterToolBase(FilterToolBase): + success_scheme = SuccessEntry.get_success_spark_scheme() + + def __init__(self, cfg: Config, spark: SparkSession = None): + super().__init__(cfg) + self.spark: SparkSession = spark if spark is not None else SparkSession.builder.appName( + "Filtering").getOrCreate() + self.spark.conf.set("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED") + self.spark.conf.set("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED") + + def run(self): + raise NotImplementedError() + + def load_data_parquet(self): + return (self.spark + .read + .schema(self.success_scheme) + .option("basePath", self.downloaded_images_path) + .parquet(self.downloaded_images_path + "/server_name=*/partition_id=*/successes.parquet")) + + def save_filter(self, df: ps.DataFrame): + if self.filter_name is None: + raise ValueError("filter name was not defined") + (df + .repartition(10) + .write + 
.csv(os.path.join(self.tools_path, self.filter_name, "filter_table"), + header=True, + mode="overwrite")) + + def __del__(self): + if self.spark is not None: + self.spark.stop() + + +@FilterRegister("size_based") +class SizeBasedFiltering(SparkFilterToolBase): + + def __init__(self, cfg: Config, spark: SparkSession = None): + super().__init__(cfg, spark) + self.filter_name: str = "size_based" + + assert "threshold_size" in self.config["tools_parameters"], ( + ValueError("threshold_size have to be defined")) + assert isinstance(self.config["tools_parameters"]["threshold_size"], int), ( + ValueError("threshold_size have to be Integer")) + + self.threshold_size = self.config["tools_parameters"]["threshold_size"] + + def run(self): + successes_df: ps.DataFrame = self.load_data_parquet() + + successes_df = (successes_df + .withColumn("is_big", + func.array_min(func.col("original_size")) >= + self.threshold_size)) + + too_small_images = successes_df.filter(~successes_df["is_big"]).select("uuid", + "gbif_id", + "server_name", + "partition_id") + + self.save_filter(too_small_images) + + self.logger.info(f"Too small images number: {too_small_images.count()}") + + +@FilterRegister("duplication_based") +class DuplicatesBasedFiltering(SparkFilterToolBase): + + def __init__(self, cfg: Config, spark: SparkSession = None): + super().__init__(cfg, spark) + self.filter_name: str = "duplication_based" + + def run(self): + successes_df: ps.DataFrame = self.load_data_parquet() + + not_duplicate_records = (successes_df + .groupBy("hashsum_original") + .count() + .where('count = 1') + .drop('count')) + + duplicate_records = (successes_df + .join(not_duplicate_records, on="hashsum_original", how='left_anti') + .select("uuid", "gbif_id", "server_name", "partition_id", "hashsum_original")) + + window = ps.Window.partitionBy("hashsum_original").orderBy("partition_id", "server_name") + + duplicate_records_top = (duplicate_records + .withColumn("rn", func.row_number().over(window)) + .where("rn == 1") + .drop("rn")) + + duplicate_records_top = duplicate_records_top.withColumnsRenamed( + {"uuid": "uuid_main", + "gbif_id": "gbif_id_main", + "server_name": "server_name_main", + "partition_id": "partition_id_main"}) + + duplicate_records = (duplicate_records + .join(duplicate_records_top, on="hashsum_original", how="left") + .where("uuid != uuid_main") + .drop("hashsum_original") + ) + + self.save_filter(duplicate_records) + + self.logger.info(f"duplicated number: {duplicate_records.count()}") + + +class PythonFilterToolBase(FilterToolBase): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + def get_all_paths_to_merge(self) -> pd.DataFrame: + all_schedules = [] + path = self.downloaded_images_path + for folder in os.listdir(path): + server_name = folder.split("=")[1] + for partition in os.listdir(f"{path}/{folder}"): + partition_path = f"{path}/{folder}/{partition}" + if (not os.path.exists(f"{partition_path}/successes.parquet") or + not os.path.exists(f"{partition_path}/completed")): + continue + all_schedules.append([server_name, partition.split("=")[1]]) + return pd.DataFrame(all_schedules, columns=["server_name", "partition_id"]) + + def run(self): + filter_table = self.get_all_paths_to_merge() + + filter_table_folder = os.path.join(self.tools_path, self.filter_name, "filter_table") + os.makedirs(filter_table_folder, exist_ok=True) + + filter_table.to_csv(filter_table_folder + "/table.csv", header=True, index=False) + + +@FilterRegister("resize") +class ResizeToolFilter(PythonFilterToolBase): + + def 
__init__(self, cfg: Config): + super().__init__(cfg) + self.filter_name = "resize" + + +@FilterRegister("image_verification") +class ImageVerificationToolFilter(PythonFilterToolBase): + + def __init__(self, cfg: Config): + super().__init__(cfg) + self.filter_name = "image_verification" diff --git a/src/distributed_downloader/tools/main.py b/src/distributed_downloader/tools/main.py new file mode 100644 index 0000000..e23ed8b --- /dev/null +++ b/src/distributed_downloader/tools/main.py @@ -0,0 +1,216 @@ +import argparse +import os +from logging import Logger +from typing import Dict, List, Optional, TextIO, Tuple + +import pandas as pd +from attr import Factory, define, field + +from distributed_downloader.tools.checkpoint import Checkpoint +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.registry import ToolsRegistryBase +from distributed_downloader.tools.utils import ( + ensure_created, + init_logger, + preprocess_dep_ids, + submit_job, + truncate_paths, +) + + +@define +class Tools: + config: Config + tool_name: str + + logger: Logger = field(default=Factory(lambda: init_logger(__name__))) + + tool_folder: str = None + tool_job_history_path: str = None + tool_checkpoint_path: str = None + checkpoint_scheme = { + "filtered": False, + "schedule_created": False, + "completed": False + } + + tool_checkpoint: Checkpoint = None + _checkpoint_override: Optional[Dict[str, bool]] = None + tool_job_history: List[int] = None + tool_job_history_io: TextIO = None + + @classmethod + def from_path(cls, path: str, + tool_name: str, + checkpoint_override: Optional[Dict[str, bool]] = None) -> "Tools": + if tool_name not in ToolsRegistryBase.TOOLS_REGISTRY.keys(): + raise ValueError("unknown tool name") + + return cls(config=Config.from_path(path, "tools"), + tool_name=tool_name, + checkpoint_override=checkpoint_override) + + def __attrs_post_init__(self): + # noinspection PyTypeChecker + self.tool_folder: str = os.path.join(self.config.get_folder("tools_folder"), + self.tool_name) + self.tool_job_history_path: str = os.path.join(self.tool_folder, "job_history.csv") + self.tool_checkpoint_path: str = os.path.join(self.tool_folder, "tool_checkpoint.yaml") + + self.__init_environment() + self.__init_filestructure() + + def __init_environment(self) -> None: + os.environ["CONFIG_PATH"] = self.config.config_path + + os.environ["ACCOUNT"] = self.config["account"] + os.environ["PATH_TO_INPUT"] = self.config["path_to_input"] + + os.environ["PATH_TO_OUTPUT"] = self.config["path_to_output_folder"] + for output_folder, output_path in self.config.folder_structure.items(): + os.environ["OUTPUT_" + output_folder.upper()] = output_path + os.environ["OUTPUT_TOOLS_LOGS_FOLDER"] = os.path.join(self.tool_folder, + "logs") + + for downloader_var, downloader_value in self.config["tools_parameters"].items(): + os.environ["TOOLS_" + downloader_var.upper()] = str(downloader_value) + + self.logger.info("Environment initialized") + + def __init_filestructure(self): + ensure_created([ + self.tool_folder, + os.path.join(self.tool_folder, "filter_table"), + os.path.join(self.tool_folder, "verification"), + os.path.join(self.tool_folder, "logs") + ]) + + self.tool_checkpoint = Checkpoint.from_path(self.tool_checkpoint_path, self.checkpoint_scheme) + if self._checkpoint_override is not None: + for key, value in self._checkpoint_override.items(): + if key == "verification": + truncate_paths([os.path.join(self.tool_folder, "verification")]) + continue + if key not in self.checkpoint_scheme.keys(): 
+ raise KeyError("Unknown key for override in checkpoint") + + self.tool_checkpoint[key] = value + + self.tool_job_history, self.tool_job_history_io = self.__load_job_history() + + def __load_job_history(self) -> Tuple[List[int], TextIO]: + job_ids = [] + + if os.path.exists(self.tool_job_history_path): + df = pd.read_csv(self.tool_job_history_path) + job_ids = df["job_ids"].to_list() + else: + with open(self.tool_job_history_path, "w") as f: + print("job_ids", file=f) + + job_io = open(self.tool_job_history_path, "a") + + return job_ids, job_io + + def __update_job_history(self, new_id: int) -> None: + self.tool_job_history.append(new_id) + print(new_id, file=self.tool_job_history_io) + + def __schedule_filtering(self) -> None: + self.logger.info("Scheduling filtering script") + job_id = submit_job(self.config.get_script("tools_submitter"), + self.config.get_script("tools_filter_script"), + self.tool_name, + *preprocess_dep_ids( + [self.tool_job_history[-1] if len(self.tool_job_history) != 0 else None]), + "--spark") + self.__update_job_history(job_id) + self.tool_checkpoint["filtered"] = True + self.logger.info("Scheduled filtering script") + + def __schedule_schedule_creation(self) -> None: + self.logger.info("Scheduling schedule creation script") + job_id = submit_job(self.config.get_script("tools_submitter"), + self.config.get_script("tools_scheduling_script"), + self.tool_name, + *preprocess_dep_ids([self.tool_job_history[-1]])) + self.__update_job_history(job_id) + self.tool_checkpoint["schedule_created"] = True + self.logger.info("Scheduled schedule creation script") + + def __schedule_workers(self) -> None: + self.logger.info("Scheduling workers script") + + for _ in range(self.config["tools_parameters"]["num_workers"]): + job_id = submit_job(self.config.get_script("tools_submitter"), + self.config.get_script("tools_worker_script"), + self.tool_name, + *preprocess_dep_ids([self.tool_job_history[-1]])) + self.__update_job_history(job_id) + + job_id = submit_job(self.config.get_script("tools_submitter"), + self.config.get_script("tools_verification_script"), + self.tool_name, + *preprocess_dep_ids([self.tool_job_history[-1]])) + self.__update_job_history(job_id) + + self.logger.info("Scheduled workers script") + + def apply_tool(self): + if not self.tool_checkpoint.get("filtered", False): + self.__schedule_filtering() + else: + self.logger.info("Skipping filtering script: table already created") + + if not self.tool_checkpoint.get("schedule_created", False): + self.__schedule_schedule_creation() + else: + self.logger.info("Skipping schedule creation script: schedule already created") + + if not self.tool_checkpoint.get("completed", False): + self.__schedule_workers() + else: + self.logger.error("Tool completed its job") + + def __del__(self): + if self.tool_job_history_io is not None: + self.tool_job_history_io.close() + + +def main(): + parser = argparse.ArgumentParser(description='Tools') + parser.add_argument("config_path", metavar="config_path", type=str, + help="the name of the tool that is intended to be used") + parser.add_argument("tool_name", metavar="tool_name", type=str, + help="the name of the tool that is intended to be used") + parser.add_argument("--reset_filtering", action="store_true", help="Will reset filtering and scheduling steps") + parser.add_argument("--reset_scheduling", action="store_true", help="Will reset scheduling step") + parser.add_argument("--reset_runners", action="store_true", help="Will reset runners, making them to start over") + _args = 
parser.parse_args() + + config_path = _args.config_path + tool_name = _args.tool_name + state_override = None + if _args.reset_filtering: + state_override = { + "filtered": False, + "schedule_created": False, + "verification": False + } + elif _args.reset_scheduling: + state_override = { + "schedule_created": False + } + elif _args.reset_runners: + state_override = { + "verification": False + } + + dd = Tools.from_path(config_path, + tool_name, + state_override) + dd.apply_tool() + + +if __name__ == "__main__": + main() diff --git a/src/distributed_downloader/tools/registry.py b/src/distributed_downloader/tools/registry.py new file mode 100644 index 0000000..a4825f2 --- /dev/null +++ b/src/distributed_downloader/tools/registry.py @@ -0,0 +1,60 @@ +import os +from typing import Dict, Type + +from distributed_downloader.tools.utils import init_logger +from distributed_downloader.tools.config import Config + + +class ToolsRegistryBase(type): + TOOLS_REGISTRY: Dict[str, Dict[str, Type["ToolsBase"]]] = {} + + @classmethod + def get(cls, name): + return cls.TOOLS_REGISTRY.get(name.lower()) + + @classmethod + def register(cls, filter_family: str, filter_name: str): + def wrapper(model_cls): + assert issubclass(model_cls, ToolsBase) + assert (filter_name not in cls.TOOLS_REGISTRY.keys() + or filter_family not in cls.TOOLS_REGISTRY[filter_name].keys()), ( + ValueError(f"tool with the name {filter_name} already have family {filter_family}")) + + if filter_name not in cls.TOOLS_REGISTRY.keys(): + cls.TOOLS_REGISTRY[filter_name] = dict() + + cls.TOOLS_REGISTRY[filter_name][filter_family] = model_cls + return model_cls + return wrapper + + def __contains__(self, item): + return item in self.TOOLS_REGISTRY + + def __iter__(self): + return iter(self.TOOLS_REGISTRY) + + def __repr__(self): + return f"{self.__class__.__name__}({self.TOOLS_REGISTRY})" + + __str__ = __repr__ + + +class ToolsBase(metaclass=ToolsRegistryBase): + + # noinspection PyTypeChecker + def __init__(self, cfg: Config): + self.config = cfg + + self.filter_name: str = None + self.filter_family: str = None + + self.logger = init_logger(__name__) + + self.urls_path = self.config.get_folder("urls_folder") + self.downloaded_images_path = self.config.get_folder("images_folder") + self.tools_path = self.config.get_folder("tools_folder") + self.total_workers = (self.config["tools_parameters"]["max_nodes"] + * self.config["tools_parameters"]["workers_per_node"]) + + def run(self): + raise NotImplementedError() diff --git a/src/distributed_downloader/tools/runner.py b/src/distributed_downloader/tools/runner.py new file mode 100644 index 0000000..fb69c20 --- /dev/null +++ b/src/distributed_downloader/tools/runner.py @@ -0,0 +1,29 @@ +import argparse +import os + +from distributed_downloader.tools.utils import init_logger +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.registry import ToolsRegistryBase + +if __name__ == "__main__": + config_path = os.environ.get("CONFIG_PATH") + if config_path is None: + raise ValueError("CONFIG_PATH not set") + + config = Config.from_path(config_path, "tools") + logger = init_logger(__name__) + + parser = argparse.ArgumentParser(description='Running step of the Tool') + parser.add_argument("runner_name", metavar="runner_name", type=str, + help="the name of the tool that is intended to be used") + _args = parser.parse_args() + tool_name = _args.runner_name + + assert tool_name in ToolsRegistryBase.TOOLS_REGISTRY.keys(), ValueError("unknown runner") + + tool_filter = 
ToolsRegistryBase.TOOLS_REGISTRY[tool_name]["runner"](config) + + logger.info("Starting runner") + tool_filter.run() + + logger.info("completed runner") diff --git a/src/distributed_downloader/tools/runners.py b/src/distributed_downloader/tools/runners.py new file mode 100644 index 0000000..baca458 --- /dev/null +++ b/src/distributed_downloader/tools/runners.py @@ -0,0 +1,369 @@ +import glob +import hashlib +import os +import time +from functools import partial +from typing import List, TextIO, Tuple + +import cv2 +import numpy as np +import pandas as pd +from PIL import UnidentifiedImageError, Image +import mpi4py.MPI as MPI + +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.registry import ToolsBase, ToolsRegistryBase + +RunnerRegister = partial(ToolsRegistryBase.register, "runner") +__all__ = ["RunnerRegister", + "DuplicationFilterRunnerTool", + "FilterRunnerTool", + "ImageVerificationRunnerTool", + "SizeBasedFilterRunnerTool"] + + +class RunnerToolBase(ToolsBase): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + self.filter_family = "runner" + + +class MPIRunnerTool(RunnerToolBase): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + self.filter_folder: str = None + self.filter_table_folder: str = None + self.verification_folder: str = None + self.verification_IO: TextIO = None + + self.data_scheme: List[str] = None + self.verification_scheme: List[str] = None + + self.mpi_comm: MPI.Intracomm = MPI.COMM_WORLD + self.mpi_rank: int = self.mpi_comm.rank + self.total_time: int = None + + def is_enough_time(self): + assert self.total_time is not None, ValueError("total_time is not set") + if time.time() > int(os.getenv("SLURM_JOB_END_TIME", 0)) - self.total_time: + raise TimeoutError("Not enough time") + + @staticmethod + def load_table(folder: str, columns: List[str] = None) -> pd.DataFrame: + all_files = glob.glob(os.path.join(folder, "*.csv")) + if len(all_files) == 0: + assert columns is not None, ValueError("No files found and columns are not defined") + + return pd.DataFrame(columns=columns) + return pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True) + + @staticmethod + def get_csv_writer(path: str, scheme: List[str]) -> TextIO: + if not os.path.exists(path): + file = open(path, "w") + print(",".join(scheme), file=file, flush=True) + else: + file = open(path, "a") + return file + + def ensure_folders_created(self): + assert self.filter_name is not None, ValueError("filter name is not set") + assert self.verification_scheme is not None, ValueError("verification scheme is not set") + + self.filter_folder = os.path.join(self.tools_path, self.filter_name) + self.filter_table_folder = os.path.join(self.filter_folder, "filter_table") + self.verification_folder = os.path.join(self.tools_path, self.filter_name, "verification") + + os.makedirs(self.verification_folder, exist_ok=True) + + def get_schedule(self): + schedule_df = pd.read_csv(os.path.join(self.filter_folder, "schedule.csv")) + schedule_df = schedule_df.query(f"rank == {self.mpi_rank}") + verification_df = self.load_table(self.verification_folder, ["server_name", "partition_id"]) + outer_join = schedule_df.merge(verification_df, how='outer', indicator=True, on=["server_name", "partition_id"]) + return outer_join[(outer_join["_merge"] == 'left_only')].drop('_merge', axis=1) + + def get_remaining_table(self, schedule: pd.DataFrame) -> pd.api.typing.DataFrameGroupBy: + assert self.data_scheme is not None, ValueError("data scheme is not set") + + df = 
self.load_table(self.filter_table_folder) + df = df.merge(schedule, + how="right", + on=["server_name", "partition_id"]) + df = df[self.data_scheme] + + return df.groupby(["server_name", "partition_id"], group_keys=True) + + def apply_filter(self, filtering_df: pd.DataFrame, server_name: str, partition_id: str) -> int: + raise NotImplementedError() + + def runner_fn(self, df_local: pd.DataFrame) -> int: + filtering_df = df_local.reset_index(drop=True) + server_name = filtering_df.iloc[0]["server_name"] + partition_id = filtering_df.iloc[0]["partition_id"] + try: + filtered_parquet_length = self.apply_filter(filtering_df, server_name, partition_id) + except NotImplementedError: + raise NotImplementedError("Filter function wasn't implemented") + except Exception as e: + self.logger.exception(e) + self.logger.error(f"Error occurred: {e}") + return 0 + else: + print(f"{server_name},{partition_id}", end="\n", file=self.verification_IO) + self.logger.debug(f"Completed filtering: {server_name}/{partition_id} with {filtered_parquet_length}") + return 1 + + def run(self): + self.ensure_folders_created() + + schedule = self.get_schedule() + self.mpi_comm.Barrier() + if len(schedule) == 0: + self.logger.error(f"Schedule not found or empty for rank {self.mpi_rank}") + exit(0) + + self.verification_IO = self.get_csv_writer(f"{self.verification_folder}/{str(self.mpi_rank).zfill(4)}.csv", + self.verification_scheme) + + remaining_table = self.get_remaining_table(schedule) + + remaining_table.apply(self.runner_fn) + + def __del__(self): + if self.verification_IO is not None: + self.verification_IO.close() + + +class FilterRunnerTool(MPIRunnerTool): + + def __init__(self, cfg: Config): + super().__init__(cfg) + self.data_scheme: List[str] = ["uuid", "gbif_id", "server_name", "partition_id"] + self.verification_scheme: List[str] = ["server_name", "partition_id"] + self.total_time = 150 + + def apply_filter(self, filtering_df: pd.DataFrame, server_name: str, partition_id: str) -> int: + self.is_enough_time() + + parquet_path = os.path.join( + self.downloaded_images_path, + f"server_name={server_name}", + f"partition_id={partition_id}", + "successes.parquet" + ) + + if not os.path.exists(parquet_path): + self.logger.info(f"Path doesn't exists: {server_name}/{partition_id}") + return 0 + + filtered_parquet = pd.read_parquet(parquet_path, + filters=[("uuid", "not in", filtering_df["uuid"])] + ) + + self.is_enough_time() + + if len(filtered_parquet) == 0: + self.logger.info(f"Fully filtered out: {server_name}/{partition_id}") + + filtered_parquet.to_parquet(parquet_path, index=False, compression="zstd", compression_level=3) + + return len(filtered_parquet) + + +@RunnerRegister("duplication_based") +class DuplicationFilterRunnerTool(FilterRunnerTool): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + self.filter_name = "duplication_based" + + +@RunnerRegister("size_based") +class SizeBasedFilterRunnerTool(FilterRunnerTool): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + self.filter_name: str = "size_based" + + +@RunnerRegister("image_verification") +class ImageVerificationRunnerTool(MPIRunnerTool): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + self.filter_name: str = "image_verification" + + self.data_scheme: List[str] = ["server_name", "partition_id"] + self.verification_scheme: List[str] = ["server_name", "partition_id"] + self.corrupted_folder: str = None + self.corrupted_scheme: List[str] = ["uuid", "gbif_id", "server_name", "partition_id"] + 
self.corrupted_IO: TextIO = None + self.total_time = 150 + + def ensure_folders_created(self): + assert self.filter_name is not None, ValueError("filter name is not set") + assert self.verification_scheme is not None, ValueError("verification scheme is not set") + assert self.corrupted_scheme is not None, ValueError("corrupted scheme is not set") + + self.filter_folder = os.path.join(self.tools_path, self.filter_name) + self.filter_table_folder = os.path.join(self.filter_folder, "filter_table") + self.verification_folder = os.path.join(self.tools_path, self.filter_name, "verification") + self.corrupted_folder = os.path.join(self.tools_path, self.filter_name, "corrupted") + + os.makedirs(self.verification_folder, exist_ok=True) + os.makedirs(self.corrupted_folder, exist_ok=True) + + self.verification_IO = self.get_csv_writer(f"{self.verification_folder}/{str(self.mpi_rank).zfill(4)}.csv", + self.verification_scheme) + self.corrupted_IO = self.get_csv_writer(f"{self.corrupted_folder}/{str(self.mpi_rank).zfill(4)}.csv", + self.corrupted_scheme) + + def apply_filter(self, filtering_df: pd.DataFrame, server_name: str, partition_id: str) -> int: + self.is_enough_time() + + parquet_path = os.path.join( + self.downloaded_images_path, + f"server_name={server_name}", + f"partition_id={partition_id}", + "successes.parquet" + ) + + if not os.path.exists(parquet_path): + self.logger.info(f"Path doesn't exists: {server_name}/{partition_id}") + return 0 + + parquet_to_verify = pd.read_parquet(parquet_path) + parquet_to_verify_length = len(parquet_to_verify) + self.is_enough_time() + + if parquet_to_verify_length != 0: + verified_images = parquet_to_verify.apply(self.verify_image, axis=1) + verified_parquet = parquet_to_verify.merge(verified_images, on="uuid", how="left", validate="1:1") + corrupted_images: pd.DataFrame = verified_parquet.loc[~verified_parquet["is_verified"]] + if len(corrupted_images) != 0: + verified_parquet.loc[verified_parquet["is_verified"]].to_parquet(parquet_path, + index=False, + compression="zstd", + compression_level=3) + corrupted_images = corrupted_images[["uuid", "error"]] + corrupted_images["server_name"] = server_name + corrupted_images["partition_id"] = partition_id + corrupted_images.to_csv(self.corrupted_IO, mode="a", header=True, index=False) + + return parquet_to_verify_length - len(corrupted_images) + return parquet_to_verify_length + + @staticmethod + def verify_image(row: pd.Series) -> pd.Series: + # Feed in expected_dimensions and known_checksum from successes.parquet + verified_image = pd.Series(data=(row["uuid"], True, ""), + index=("uuid", "is_verified", "error")) + try: + # Ensure no data-at-rest corruption from stray intergalactic cosmic rays ... 
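+            # Recompute the md5 of the stored image bytes and compare it with the
+            # hashsum recorded at download time before attempting to decode the image.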
+ image_bytes_checksum = hashlib.md5(row["image"]).hexdigest() # Define elsewhere + if image_bytes_checksum != row["hashsum_resized"]: + raise ValueError("Checksum mismatch, image may be corrupted") + + # NumPy Array and Reshaping + img_array = np.frombuffer(row["image"], dtype=np.uint8).reshape( + (row["resized_size"][0], row["resized_size"][1], 3)) + + # Convert BGR to RGB + img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB) + + # Convert the NumPy array to a PIL Image + image = Image.fromarray(img_array) + + # Use PIL's verify method for a basic validation + image.verify() + + # Validate range of pixel values + if np.any(img_array > 255) or np.any(img_array < 0): + raise ValueError("Pixel values are out of range") + except (ValueError, UnidentifiedImageError, cv2.error) as e: + # print(f"Data integrity issue detected: {e}") + verified_image["is_verified"] = False + verified_image["error"] = str(e) + + return verified_image + + +@RunnerRegister("resize") +class ResizeRunnerTool(MPIRunnerTool): + + def __init__(self, cfg: Config): + super().__init__(cfg) + assert isinstance(self.config["tools_parameters"]["new_resize_size"], int), ( + ValueError("new size have to be Integer")) + + self.filter_name: str = "resize" + self.data_scheme: List[str] = ["server_name", "partition_id"] + self.verification_scheme: List[str] = ["server_name", "partition_id"] + self.total_time = 300 + self.new_size = self.config["tools_parameters"]["new_resize_size"] + + def apply_filter(self, filtering_df: pd.DataFrame, server_name: str, partition_id: str) -> int: + self.is_enough_time() + + parquet_path = os.path.join( + self.downloaded_images_path, + f"server_name={server_name}", + f"partition_id={partition_id}", + "successes.parquet" + ) + + if not os.path.exists(parquet_path): + self.logger.info(f"Path doesn't exists: {server_name}/{partition_id}") + return 0 + + parquet_to_resize = pd.read_parquet(parquet_path) + initial_scheme = parquet_to_resize.columns + + self.is_enough_time() + resized_parquet = parquet_to_resize.apply(self.resize_partition, axis=1) + + parquet_to_resize = parquet_to_resize.merge(resized_parquet, + on="uuid", + how="inner", + validate="1:1", + suffixes=("_x", "")) + parquet_to_resize = parquet_to_resize[initial_scheme] + + self.is_enough_time() + parquet_to_resize.to_parquet(parquet_path, index=False, compression="zstd", compression_level=3) + + def resize_partition(self, row: pd.Series) -> pd.Series: + image_shape: np.ndarray[int, np.dtype[np.int32]] = row["resized_size"] + image_original_np: np.ndarray = np.frombuffer(row["image"], dtype=np.uint8).reshape( + [image_shape[0], image_shape[1], 3]) + + if image_shape[0] > self.new_size or image_shape[1] > self.new_size: + image_original_np, image_shape = self.image_resize(image_original_np) + + image_original_np_bytes = image_original_np.tobytes() + new_check_sum = hashlib.md5(image_original_np_bytes).hexdigest() + + return pd.Series({"uuid": row["uuid"], + "resized_size": image_shape, + "hashsum_resized": new_check_sum, + "image": image_original_np_bytes}, + index=["uuid", "resized_size", "hashsum_resized", "image"]) + + def image_resize(self, image: np.ndarray) \ + -> Tuple[np.ndarray[int, np.dtype[np.uint8]], np.ndarray[int, np.dtype[np.uint32]]]: + h, w = image.shape[:2] + if h > w: + new_h = self.new_size + new_w = int(w * (new_h / h)) + else: + new_w = self.new_size + new_h = int(h * (new_w / w)) + return cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA), np.array([new_h, new_w]) diff --git 
a/src/distributed_downloader/tools/scheduler.py b/src/distributed_downloader/tools/scheduler.py new file mode 100644 index 0000000..285f070 --- /dev/null +++ b/src/distributed_downloader/tools/scheduler.py @@ -0,0 +1,29 @@ +import argparse +import os + +from distributed_downloader.tools.utils import init_logger +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.registry import ToolsRegistryBase + +if __name__ == "__main__": + config_path = os.environ.get("CONFIG_PATH") + if config_path is None: + raise ValueError("CONFIG_PATH not set") + + config = Config.from_path(config_path, "tools") + logger = init_logger(__name__) + + parser = argparse.ArgumentParser(description='Running step of the Tool') + parser.add_argument("scheduler_name", metavar="scheduler_name", type=str, + help="the name of the tool that is intended to be used") + _args = parser.parse_args() + tool_name = _args.scheduler_name + + assert tool_name in ToolsRegistryBase.TOOLS_REGISTRY.keys(), ValueError("unknown scheduler") + + tool_filter = ToolsRegistryBase.TOOLS_REGISTRY[tool_name]["scheduler"](config) + + logger.info("Starting scheduler") + tool_filter.run() + + logger.info("completed scheduler") diff --git a/src/distributed_downloader/tools/schedulers.py b/src/distributed_downloader/tools/schedulers.py new file mode 100644 index 0000000..efe8736 --- /dev/null +++ b/src/distributed_downloader/tools/schedulers.py @@ -0,0 +1,79 @@ +import glob +import os +from functools import partial + +import pandas as pd + +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.registry import ToolsBase, ToolsRegistryBase + +SchedulerRegister = partial(ToolsRegistryBase.register, "scheduler") +__all__ = ["SchedulerRegister", + "SizeBasedScheduler", + "DuplicatesBasedScheduler", + "ResizeToolScheduler", + "ImageVerificationBasedScheduler"] + + +class SchedulerToolBase(ToolsBase): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + self.filter_family = "scheduler" + + +class DefaultScheduler(SchedulerToolBase): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + def run(self): + assert self.filter_name is not None, ValueError("filter name is not set") + + filter_folder = os.path.join(self.tools_path, self.filter_name) + filter_table_folder = os.path.join(filter_folder, "filter_table") + + all_files = glob.glob(os.path.join(filter_table_folder, "*.csv")) + df: pd.DataFrame = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True) + df = df[["server_name", "partition_id"]] + df = df.drop_duplicates(subset=["server_name", "partition_id"]).reset_index(drop=True) + df["rank"] = df.index % self.total_workers + + df.to_csv(os.path.join(filter_folder, "schedule.csv"), header=True, index=False) + + +@SchedulerRegister("size_based") +class SizeBasedScheduler(DefaultScheduler): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + self.filter_name: str = "size_based" + + +@SchedulerRegister("duplication_based") +class DuplicatesBasedScheduler(DefaultScheduler): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + self.filter_name: str = "duplication_based" + + +@SchedulerRegister("resize") +class ResizeToolScheduler(DefaultScheduler): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + self.filter_name: str = "resize" + + +@SchedulerRegister("image_verification") +class ImageVerificationBasedScheduler(DefaultScheduler): + + def __init__(self, cfg: Config): + super().__init__(cfg) + + self.filter_name: str = 
"image_verification" diff --git a/src/distributed_downloader/tools/utils.py b/src/distributed_downloader/tools/utils.py new file mode 100644 index 0000000..4598328 --- /dev/null +++ b/src/distributed_downloader/tools/utils.py @@ -0,0 +1,84 @@ +import logging +import os +import shutil +import subprocess +from typing import List, Sequence, Optional + +from pyspark.sql import SparkSession, DataFrame +from pyspark.sql.types import StructType + + +def load_dataframe(spark: SparkSession, input_path: str, scheme: Optional[StructType | str] = None) -> DataFrame: + file_extension = input_path.split('.')[-1].lower() + + def infer_delimiter(_first_line): + if '\t' in _first_line: + return '\t' + elif ',' in _first_line: + return ',' + elif ' ' in _first_line: + return ' ' + elif '|' in _first_line: + return '|' + elif ';' in _first_line: + return ';' + else: + return None + + if file_extension in ['csv', 'tsv', 'txt']: + if file_extension == 'csv': + sep = ',' + elif file_extension == 'tsv': + sep = '\t' + elif file_extension == 'txt': + with open(input_path, 'r') as file: + first_line = file.readline() + sep = infer_delimiter(first_line) + if sep is None: + raise ValueError(f"Could not infer delimiter for file {input_path}") + df = spark.read.csv(input_path, sep=sep, header=True, schema=scheme) + else: + try: + df = spark.read.load(input_path, scheme=scheme) + except Exception as e: + raise FileNotFoundError(f"File not supported: {e}") + + return df + + +def ensure_created(list_of_path: List[str]) -> None: + for path in list_of_path: + os.makedirs(path, exist_ok=True) + + +def truncate_paths(paths: Sequence[str]) -> None: + for path in paths: + is_dir = "." not in path.split("/")[-1] + if is_dir: + if os.path.exists(path): + shutil.rmtree(path) + os.makedirs(path) + else: + open(path, "w").close() + + +def get_id(output: bytes) -> int: + return int(output.decode().strip().split(" ")[-1]) + + +def init_logger(logger_name: str, output_path: str = None, logging_level: str = "INFO") -> logging.Logger: + logging.basicConfig( + filename=output_path, + level=logging.getLevelName(logging_level), + format="%(asctime)s - %(levelname)s - %(process)d - %(message)s") + return logging.getLogger(logger_name) + + +def submit_job(submitter_script: str, script: str, *args) -> int: + output = subprocess.check_output(f"{submitter_script} {script} {' '.join(args)}", shell=True) + idx = get_id(output) + return idx + + +def preprocess_dep_ids(ids: List[int | None]) -> List[str]: + return [str(_id) for _id in ids if _id is not None] diff --git a/src/distributed_downloader/tools/verification.py b/src/distributed_downloader/tools/verification.py new file mode 100644 index 0000000..f879369 --- /dev/null +++ b/src/distributed_downloader/tools/verification.py @@ -0,0 +1,42 @@ +import argparse +import os + +import pandas as pd + +from distributed_downloader.tools.checkpoint import Checkpoint +from distributed_downloader.tools.utils import init_logger +from distributed_downloader.tools.config import Config +from distributed_downloader.tools.registry import ToolsRegistryBase +from distributed_downloader.tools.runners import MPIRunnerTool + +if __name__ == "__main__": + config_path = os.environ.get("CONFIG_PATH") + if config_path is None: + raise ValueError("CONFIG_PATH not set") + + config = Config.from_path(config_path, "tools") + logger = init_logger(__name__) + + parser = argparse.ArgumentParser(description='Running step of the Tool') + parser.add_argument("runner_name", metavar="runner_name", type=str, + help="the name of the 
tool that is intended to be used") + _args = parser.parse_args() + tool_name = _args.runner_name + + assert tool_name in ToolsRegistryBase.TOOLS_REGISTRY.keys(), ValueError("unknown runner") + + tool_folder = os.path.join(config.get_folder("tools_folder"), tool_name) + checkpoint = Checkpoint.from_path(os.path.join(tool_folder, "tool_checkpoint.yaml"), {"completed": False}) + schedule_df = pd.read_csv(os.path.join(tool_folder, "schedule.csv")) + verification_df = MPIRunnerTool.load_table(os.path.join(tool_folder, "verification"), + ["server_name", "partition_id"]) + + outer_join = schedule_df.merge(verification_df, how='outer', indicator=True, on=["server_name", "partition_id"]) + left = outer_join[(outer_join["_merge"] == 'left_only')].drop('_merge', axis=1) + + if len(left) == 0: + checkpoint["completed"] = True + + logger.info("Tool completed its job") + else: + logger.info(f"Tool needs more time, left to complete: {len(left)}") diff --git a/src/mpi_downloader/ProfilerWriter.py b/src/mpi_downloader/ProfilerWriter.py deleted file mode 100644 index 5893ed3..0000000 --- a/src/mpi_downloader/ProfilerWriter.py +++ /dev/null @@ -1,76 +0,0 @@ -import cv2 -import h5py -import numpy as np - -from mpi_downloader import CompletedBatch -from mpi_downloader.dataclasses import error_entry, error_dtype, profile_dtype - -sample_length = 5 - - -def write_batch( - profiles_hdf: h5py.Dataset, - errors_hdf: h5py.Dataset, - completed_batch: CompletedBatch, - rate_limit: float, - rank: int, - offset: int, - batch_size: int, - server_name: str, - total_batches: int, - output_path: str -): - # os.makedirs(f"{output_path}/samples", exist_ok=True) - - successes_number = completed_batch.success_queue.qsize() - errors_number = completed_batch.error_queue.qsize() - - errors_list = [] - - for _ in range(errors_number): - error_download = completed_batch.error_queue.get() - errors_list.append(error_entry.from_downloaded(error_download).to_np()) - - for idx in range(successes_number): - success_download = completed_batch.success_queue.get() - try: - np_image = np.asarray(bytearray(success_download.image), dtype="uint8") - original_image = cv2.imdecode(np_image, cv2.IMREAD_COLOR) - - if original_image is None: - raise ValueError("Corrupted Image") - - resized_image = cv2.resize(original_image, (1024, 1024), interpolation=cv2.INTER_LINEAR) - - if idx < sample_length: - cv2.imwrite(f"{output_path}/samples/{server_name}_{idx}.jpg", resized_image) - - except Exception as e: - errors_list.append(error_entry( - uuid=success_download.unique_name, - identifier=success_download.identifier, - retry_count=0, - error_code=-3, - error_msg=str(e) - ).to_np()) - - errors_number += 1 - successes_number -= 1 - - print(f"Rank {rank} writing to HDF5 {successes_number} successes and {errors_number} errors") - - errors_np = np.array(errors_list, dtype=error_dtype).reshape((-1,)) - server_profile_np = np.array( - [ - ( - server_name, - total_batches, - successes_number, - errors_number, - rate_limit - ) - ], - dtype=profile_dtype) - - profiles_hdf[offset] = server_profile_np - errors_hdf[offset * batch_size:offset * batch_size + errors_number] = errors_np diff --git a/src/mpi_downloader/__init__.py b/src/mpi_downloader/__init__.py deleted file mode 100644 index 794e532..0000000 --- a/src/mpi_downloader/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from mpi_downloader.utils import create_new_session, truncate_folder -from mpi_downloader.dataclasses import DownloadedImage, ImageBatchesByServerToRequest, CompletedBatch, WriterServer -from 
mpi_downloader.Downloader import Downloader -from mpi_downloader.PreLoader import load_batch diff --git a/src/resize_mpi.py b/src/resize_mpi.py deleted file mode 100644 index c5e615f..0000000 --- a/src/resize_mpi.py +++ /dev/null @@ -1,169 +0,0 @@ -import hashlib -import os -import time -from typing import Tuple - -import mpi4py.MPI as MPI - -import cv2 -import numpy as np -import pandas as pd -import py7zr -from PIL import Image, UnidentifiedImageError -from py7zr import FILTER_ZSTD - -comm = MPI.COMM_WORLD -rank = comm.rank -output_path = "/users/PAS2119/andreykopanev/distributed-downloader/data/verification_stat" -schedule_path = "/users/PAS2119/andreykopanev/distributed-downloader/data/schedule_full.csv" -base_path = "/fs/scratch/PAS2136/gbif/processed/2024-05-01/multimedia_prep/downloaded_images" -_new_size = 720 -read_time = 150 -write_time = 150 -schedule_df = pd.read_csv(schedule_path) -schedule = schedule_df.query(f"Rank == {rank}").set_index("ServerName").to_dict("index") - -if len(schedule) == 0: - raise ValueError(f"Empty schedule for rank {rank}") - - -def read_parquets(base_path: str, filename: str) -> pd.DataFrame: - empty_df = pd.DataFrame() - - for folder, content in schedule.items(): - ids = str(content["Ids"]).split() - for _id in ids: - if not os.path.exists(f"{base_path}/ServerName={folder}/partition_id={_id}/{filename}"): - continue - - new_df = pd.read_parquet(f"{base_path}/ServerName={folder}/partition_id={_id}/{filename}") - new_df["ServerName"] = folder - new_df["partition_id"] = int(_id) - - empty_df = pd.concat([empty_df, new_df]).reset_index(drop=True) - - return empty_df - - -def image_resize(image: np.ndarray, - max_size=720) -> Tuple[np.ndarray[int, np.dtype[np.uint8]], np.ndarray[int, np.dtype[np.uint32]]]: - h, w = image.shape[:2] - if h > w: - new_h = max_size - new_w = int(w * (new_h / h)) - else: - new_w = max_size - new_h = int(h * (new_w / w)) - return cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA), np.array([new_h, new_w]) - - -def validate_image_data(img_bytes: bytes, - expected_dimensions: np.ndarray[int, np.dtype[np.int32]], - known_checksum: str = None) -> Tuple[bool, str]: - # Feed in expected_dimensions and known_checksum from successes.parquet - try: - # Ensure no data-at-rest corruption from stray intergalactic cosmic rays ... 
- if known_checksum: - image_bytes_checksum = hashlib.md5(img_bytes).hexdigest() # Define elsewhere - if image_bytes_checksum != known_checksum: - raise ValueError("Checksum mismatch, image may be corrupted") - - # NumPy Array and Reshaping - img_array = np.frombuffer(img_bytes, dtype=np.uint8).reshape( - (expected_dimensions[0], expected_dimensions[1], 3)) - - # Convert BGR to RGB - img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB) - - # Convert the NumPy array to a PIL Image - image = Image.fromarray(img_array) - - # Use PIL's verify method for a basic validation - image.verify() - - # Validate range of pixel values - if np.any(img_array > 255) or np.any(img_array < 0): - raise ValueError("Pixel values are out of range") - - return True, "" - except (ValueError, UnidentifiedImageError, cv2.error) as e: - # print(f"Data integrity issue detected: {e}") - return False, str(e) - - -def resize_partition(partition: pd.DataFrame) -> pd.DataFrame: - server_name = partition['ServerName'].iloc[0] - partition_id = partition['partition_id'].iloc[0] - partition_path = f"{base_path}/ServerName={server_name}/partition_id={partition_id}" - print(f"Starting {server_name} {partition_id}") - - if not os.path.exists(partition_path): - return pd.DataFrame(columns=["uuid", "identifier", "ServerName", "verified", "verification_msg"]) - - if time.time() > int(os.getenv("SLURM_JOB_END_TIME", 0)) - (write_time + read_time): - print( - f"Not enough time to resize {int(os.getenv('SLURM_JOB_END_TIME', 0)) - time.time()} left, {write_time + read_time} needed") - return pd.DataFrame(columns=["uuid", "identifier", "ServerName", "verified", "verification_msg"]) - - partition_dict = partition.to_dict("index") - - verification_dict = {} - - try: - with py7zr.SevenZipFile(f"{partition_path}/images.7z", 'r', filters=[{'id': FILTER_ZSTD, 'level': 3}]) as f: - names = f.getnames() - for fname, bio in f.read(names).items(): - image_stream = bio.read() - - image_shape: np.ndarray[int, np.dtype[np.int32]] = partition_dict[fname]["resized_size"] - image_original_np: np.ndarray = np.frombuffer(image_stream, dtype=np.uint8).reshape( - [image_shape[0], image_shape[1], 3]) - - is_valid, error_msg = validate_image_data(image_stream, image_shape, - partition_dict[fname]["hashsum_resized"]) - verification_dict[fname] = { - "identifier": partition_dict[fname]["identifier"], - "ServerName": server_name, - "verified": is_valid, - "verification_msg": error_msg} - - if image_shape[0] > _new_size or image_shape[1] > _new_size: - image_original_np, image_shape = image_resize(image_original_np, _new_size) - - image_stream = image_original_np.tobytes() - partition_dict[fname]["hashsum_resized"] = hashlib.md5(image_stream).hexdigest() - partition_dict[fname]["resized_size"] = image_shape - partition_dict[fname]["image"] = image_stream - except Exception as e: - corrupted = open(f"{partition_path}/_corrupted.txt", "w") - print(str(e), file=corrupted) - corrupted.close() - print(f"Error: {server_name}: {e}", flush=True) - return pd.DataFrame(columns=["uuid", "identifier", "ServerName", "verified", "verification_msg"]) - else: - os.remove(f"{partition_path}/images.7z") - (pd.DataFrame - .from_dict(partition_dict, orient="index") - .reset_index(names="uuid") - .drop(columns=["ServerName", "partition_id"]) - .to_parquet(f"{partition_path}/successes.parquet", index=False, compression="zstd", compression_level=3)) - - verification_df = pd.DataFrame.from_dict(verification_dict, orient="index").reset_index(names="uuid") - - 
verification_df.drop(columns=["identifier", "ServerName"]).to_parquet(f"{partition_path}/verification.parquet", - index=False) - - return verification_df - - -if __name__ == "__main__": - successes_df = read_parquets(base_path, "successes.parquet").set_index("uuid") - successes_grouped = successes_df.groupby(["ServerName", "partition_id"]) - - (successes_grouped - .apply(resize_partition, include_groups=True) - .reset_index(drop=True).drop(columns=["uuid"]) - .groupby(["ServerName", "verified"]) - .count() - .reset_index(names=["ServerName", "verified"]) - .to_parquet(f"{output_path}/ver_{rank}.parquet", index=False)) diff --git a/src/resizer_scheduler.py b/src/resizer_scheduler.py deleted file mode 100644 index 952f24f..0000000 --- a/src/resizer_scheduler.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import re - -from pandas import DataFrame - -schedule_path = "/users/PAS2119/andreykopanev/distributed-downloader/data/schedule_full.csv" -base_path = "/fs/scratch/PAS2136/gbif/processed/2024-05-01/multimedia_prep/downloaded_images" -number_of_nodes = 10 -number_of_workers = 3 - -number_of_ranks = number_of_nodes * number_of_workers - - -def concat_ids(partition: DataFrame) -> DataFrame: - ids = partition["Id"].str.cat(sep=" ") - rank = int(partition["Rank"].iloc[0]) - server = partition["ServerName"].iloc[0] - result = DataFrame([[rank, server, ids]], columns=["Rank", "ServerName", "Ids"]) - return result - - -all_schedules = [] -corrupted_count = 0 -not_that_corrupted = 0 - -for folder in os.listdir(base_path): - server_name = folder.split("=")[1] - for partition in os.listdir(f"{base_path}/{folder}"): - partition_path = f"{base_path}/{folder}/{partition}" - if os.path.exists(f"{partition_path}/_corrupted.txt"): - with open(f"{partition_path}/_corrupted.txt", "r") as f: - corrupted_text = f.read() - if len(re.findall("\(.*,.*,.*\)", corrupted_text)) == 0: - corrupted_count += 1 - print(f"{partition_path}: {corrupted_text}") - continue - else: - # os.remove(f"{partition_path}/_corrupted.txt") - not_that_corrupted += 1 - if (not os.path.exists(f"{partition_path}/images.7z") or - not os.path.exists(f"{partition_path}/verification.parquet") or - not os.path.exists(f"{partition_path}/successes.parquet") or - not os.path.exists(f"{partition_path}/completed")): - continue - all_schedules.append([server_name, partition.split("=")[1]]) - -schedule_df = DataFrame(all_schedules, columns=["ServerName", "Id"]) -print(schedule_df.count()) -print(corrupted_count) -print(not_that_corrupted) -schedule_df["Rank"] = schedule_df.index % number_of_ranks -schedule_grouped = schedule_df.groupby(["Rank", "ServerName"]) - -schedules = schedule_grouped.apply(concat_ids).reset_index(drop=True) - -schedules.to_csv(schedule_path, index=False, header=True) diff --git a/src/server_prep.py b/src/server_prep.py deleted file mode 100644 index 142dc90..0000000 --- a/src/server_prep.py +++ /dev/null @@ -1,103 +0,0 @@ -import argparse -import os.path -import uuid -from urllib.parse import urlparse - -import pyspark.sql.functions as F -from pyspark.sql import SparkSession, Window -from pyspark.sql.functions import udf -from pyspark.sql.types import StringType - -from schemes import multimedia_scheme -from utils.utils import truncate_folder - -BATCH_SIZE = 10_000 - - -@udf(returnType=StringType()) -def get_server_name(url: str): - return urlparse(url).netloc - - -@udf(returnType=StringType()) -def get_uuid(): - return str(uuid.uuid4()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Convert 
multimedia data to server batches') - - parser.add_argument('input_path', metavar='input_path', type=str, help='the path to the file with multimedia data, must be a tab-delimited text file') - parser.add_argument('output_path', metavar='output_path', type=str, help='the path to the output folder (folder for download components (e.g., server batches and image folder))') - - # parse the arguments - args = parser.parse_args() - input_path: str = args.input_path - output_path: str = args.output_path - servers_batched_folder: str = os.getenv("DOWNLOADER_URLS_FOLDER", "servers_batched") - - # Initialize SparkSession - spark = SparkSession.builder.appName("Multimedia prep").getOrCreate() - spark.conf.set("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED") - spark.conf.set("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED") - - truncate_folder(output_path) - - if os.path.isfile(input_path): - multimedia_df = spark.read.csv( - input_path, - sep="\t", - header=True, - schema=multimedia_scheme.schema - ) - else: - multimedia_df = spark.read.load( - input_path - ) - - multimedia_df_prep = (multimedia_df - .filter((multimedia_df["gbifID"].isNotNull()) - & (multimedia_df["identifier"].isNotNull()) - & ( - (multimedia_df["type"] == "StillImage") - | ( - (multimedia_df["type"].isNull()) - & (multimedia_df["format"].contains("image")) - ) - )) - .repartition(20)) - - multimedia_df_prep = multimedia_df_prep.withColumn("ServerName", - get_server_name(multimedia_df_prep.identifier)) - multimedia_df_prep = multimedia_df_prep.withColumn("UUID", get_uuid()) - - columns = multimedia_df_prep.columns - - servers_batched_dir = os.path.join(output_path, servers_batched_folder) - os.makedirs(servers_batched_dir, exist_ok=True) - - print("Starting batching") - - servers_grouped = (multimedia_df_prep - .select("ServerName") - .groupBy("ServerName") - .count() - .withColumn("batch_count", F.floor(F.col("count") / BATCH_SIZE))) - - window_part = Window.partitionBy("ServerName").orderBy("ServerName") - master_df_filtered = (multimedia_df_prep - .withColumn("row_number", F.row_number().over(window_part)) - .join(servers_grouped, ["ServerName"]) - .withColumn("partition_id", F.col("row_number") % F.col("batch_count")) - .withColumn("partition_id", F.when(F.col("partition_id").isNull(), 0).otherwise(F.col("partition_id"))) - .select(*columns, "partition_id")) - - (master_df_filtered - .repartition("ServerName", "partition_id") - .write - .partitionBy("ServerName", "partition_id") - .mode("overwrite") - .format("parquet") - .save(servers_batched_dir)) - - spark.stop() diff --git a/src/submitter.py b/src/submitter.py deleted file mode 100644 index 01972c4..0000000 --- a/src/submitter.py +++ /dev/null @@ -1,150 +0,0 @@ -import math -import os -import argparse -import subprocess -from typing import List -from dotenv import load_dotenv - -import pandas as pd - -NUM_DOWNLOADERS: int = 1 -RECHECK = False -SCHEDULES: List[str] = [] - -# internal job record used by submitter for tracking progress -SUBMITTED_JOBS_FILE = "_jobs_ids.csv" - - -def get_env_vars(env_path): - """ - Fetch path information from .env for download and schedule directories. - Also, paths to slurm scripts and a bash python-slurm coordination script. - - Parameters: - env_path - String. Path to .env file. Ex: 'path/to/hpc.env'. - - Returns: - schedules_path - String. Path to schedule in download directory. - mpi_submitter_script - String. Path to bash script to coordinate Python and slurm scripts. - downloading_script - String. 
Path to a slurm script to run download. - verifying_script - String. Path to slurm script to run verifier. - """ - load_dotenv(env_path) - download_path = f"{os.getenv('PROCESSED_DATA_ROOT')}/{os.getenv('TIME_STAMP')}/{os.getenv('DOWNLOAD_DIR')}" - schedules_path = f"{download_path}/{os.getenv('DOWNLOADER_SCHEDULES_FOLDER')}" - mpi_submitter_script = os.getenv("MPI_SUBMITTER_SCRIPT") - downloading_script = os.getenv("DOWNLOADING_SCRIPT") - verifying_script = os.getenv("VERIFYING_SCRIPT") - - return schedules_path, mpi_submitter_script, downloading_script, verifying_script - - -def get_logs_offset(path: str) -> int: - if not os.path.exists(path): - return 0 - - dirs: List[int] = [int(_path) for _path in os.listdir(path) if os.path.isdir(f"{path}/{_path}")] - - dirs.sort(reverse=True) - if len(dirs) == 0: - return 0 - return dirs[0] - - -def get_id(output: bytes) -> int: - return int(output.decode().strip().split(" ")[-1]) - - -def submit_downloader(_schedule: str, iteration_id: int, dep_id: int, mpi_submitter_script: str, - downloading_script: str) -> int: - iteration = str(iteration_id).zfill(4) - output = subprocess.check_output(f"{mpi_submitter_script} " - f"{downloading_script} " - f"{_schedule} " - f"{iteration} " - f"{dep_id}", shell=True) - idx = get_id(output) - print(f"Submitted downloader {idx} for {_schedule}") - return idx - - -def submit_verifier(_schedule: str, iteration_id: int, mpi_submitter_script: str, verifying_script: str, - dep_id: int = None) -> int: - iteration = str(iteration_id).zfill(4) - - command_str = f"{mpi_submitter_script} {verifying_script} {_schedule} {iteration}" - if dep_id is not None: - command_str += f" {dep_id}" - if RECHECK: - command_str += " --recheck" - - output = subprocess.check_output(command_str, shell=True) - idx = get_id(output) - print(f"Submitted verifier {idx} for {_schedule}") - return idx - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--env-path", required=True, help="Path to .env file. 
Ex: 'path/to/hpc.env'.", nargs="?") - args = parser.parse_args() - schedules_path, mpi_submitter_script, downloading_script, verifying_script = get_env_vars(args.env_path) - - # manage scheduling, run jobs - schedules = SCHEDULES - if len(schedules) == 0: - schedules = [folder for folder in os.listdir(schedules_path) if os.path.isdir(f"{schedules_path}/{folder}")] - - for schedule in schedules: - if os.path.exists(f"{schedules_path}/{schedule}/_DONE"): - continue - submitted_jobs_path = f"{schedules_path}/{schedule}/{SUBMITTED_JOBS_FILE}" - - prev_jobs = pd.DataFrame({ - "job_id": pd.Series(dtype="int"), - "is_verification": pd.Series(dtype="bool") - }) - if os.path.exists(submitted_jobs_path): - prev_jobs = pd.read_csv(submitted_jobs_path) - prev_jobs = prev_jobs.to_dict("records") - offset = math.ceil(len(prev_jobs) / 2) - - if offset == 0 or not prev_jobs[-1]["is_verification"] or RECHECK: - verifier_id = submit_verifier(schedule, - offset, - mpi_submitter_script, - verifying_script) - prev_jobs.append({ - "job_id": verifier_id, - "is_verification": True - }) - offset += 1 - - for _ in range(NUM_DOWNLOADERS): - download_id = submit_downloader(schedule, - offset, - prev_jobs[-1]["job_id"], - mpi_submitter_script, - downloading_script) - prev_jobs.append({ - "job_id": download_id, - "is_verification": False - }) - - verifier_id = submit_verifier(schedule, - offset, - mpi_submitter_script, - verifying_script, - download_id) - prev_jobs.append({ - "job_id": verifier_id, - "is_verification": True - }) - - offset += 1 - - pd.DataFrame(prev_jobs).to_csv(submitted_jobs_path, index=False, header=True) - - -if __name__ == "__main__": - main() diff --git a/src/utils/__init__.py b/src/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/utils/utils.py b/src/utils/utils.py deleted file mode 100644 index bec8d5b..0000000 --- a/src/utils/utils.py +++ /dev/null @@ -1,183 +0,0 @@ -import os -import shutil -import sys -from collections import deque -from typing import List, Deque, Any, Dict - -import pandas as pd -from pyspark.sql import DataFrame, SparkSession - - -def print_progress(iteration, total, prefix='', suffix='', decimals=2, bar_length=100): - """ - Call in a loop to create terminal progress bar - @params: - iteration - Required : current iteration (Int) - total - Required : total iterations (Int) - prefix - Optional : prefix string (Str) - suffix - Optional : suffix string (Str) - decimals - Optional : positive number of decimals in percent complete (Int) - bar_length - Optional : character length of bar (Int) - """ - str_format = "{0:." 
+ str(decimals) + "f}" - percents = str_format.format(100 * (iteration / float(total))) - filled_length = int(round(bar_length * iteration / float(total))) - bar = '@' * filled_length + '-' * (bar_length - filled_length) - - sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)), - - if iteration == total: - sys.stdout.write('\n') - sys.stdout.flush() - - -def generate_analyzer_sheet(columns: list[tuple[str, str]]) -> list: - return [ - { - "name": name, - "type": column_type, - "is_null": False, - "is_nullable": False, - "sparsity": 0, - "is_unique": False, - "is_atomic": False, - "atomic_likelihood": False, - "description": "" - } - for name, column_type in columns - ] - - -def write_to_csv(path: str, result_df: DataFrame) -> None: - result_df.coalesce(1).write.csv(path, - header=True, - mode="overwrite", - sep="\t", - quote="\"", - quoteAll=True) - - -def write_to_parquet(path: str, result_df: DataFrame, num_parquet: int = 100) -> None: - if num_parquet > 0: - result_df = result_df.repartition(num_parquet) - - # Write the DataFrame to Parquet - result_df.write.mode('overwrite').parquet(path) - - -def load_dataframe(spark: SparkSession, input_path: str) -> DataFrame: - file_extension = input_path.split('.')[-1].lower() - - def infer_delimiter(first_line): - if '\t' in first_line: - return '\t' - elif ',' in first_line: - return ',' - elif ' ' in first_line: - return ' ' - elif '|' in first_line: - return '|' - elif ';' in first_line: - return ';' - else: - return None - - if file_extension in ['csv', 'tsv', 'txt']: - if file_extension == 'csv': - sep = ',' - elif file_extension == 'tsv': - sep = '\t' - elif file_extension == 'txt': - with open(input_path, 'r') as file: - first_line = file.readline() - sep = infer_delimiter(first_line) - if sep is None: - raise Exception(f"Could not infer delimiter for file {input_path}") - df = spark.read.csv(input_path, sep=sep, header=True) - else: - try: - df = spark.read.load(input_path) - except: - raise Exception(f"File not supported") - - return df - - -def ensure_created(list_of_path: List[str]) -> None: - for path in list_of_path: - os.makedirs(path, exist_ok=True) - - -def truncate_folder(path: str): - shutil.rmtree(path, ignore_errors=True) - os.makedirs(path, exist_ok=True) - - -def split_dataframe(df: pd.DataFrame, by_column: str = "Nodes", chunk_size=20) -> List[pd.DataFrame]: - chunks: List[pd.DataFrame] = [] - - row_list = df.to_dict("records") - - if len(row_list) == 0: - raise ValueError("Empty list") - - chunks.append(pd.DataFrame(row_list[0], index=[0])) - del row_list[0] - - while len(row_list) > 0: - i = 0 - - chunk = chunks[-1] - - while len(row_list) > 0 and i < len(row_list): - new_chunk = row_list[i] - column_value = chunk[by_column].sum() + new_chunk[by_column] - - if column_value <= chunk_size: - chunks[-1] = pd.concat([chunk, pd.DataFrame(new_chunk, index=[0])], ignore_index=True) - del row_list[i] - break - - i += 1 - else: - if len(row_list) == 0: - break - - chunks.append(pd.DataFrame(row_list[0], index=[0])) - del row_list[0] - - return chunks - - -def create_schedule_configs(group: pd.DataFrame, number_of_workers: int, schedule_path: str, - by_column: str = "Nodes") -> None: - print("Creating schedules") - - group = group.sort_values(by=[by_column], ascending=False).reset_index() - - chunked_group: Deque[pd.DataFrame] = deque(split_dataframe(group, by_column, number_of_workers)) - all_schedules = [int(folder) for folder in os.listdir(schedule_path) if os.path.isdir(f"{schedule_path}/{folder}")] - 
number_of_schedules = 0 - if len(all_schedules) > 0: - number_of_schedules: int = sorted(all_schedules, reverse=True)[0] + 1 - - while len(chunked_group) > 0: - chunk = chunked_group.popleft() - - while len(chunked_group) > 0 and chunk["TotalBatches"].sum() < number_of_workers * 50: - chunk = pd.concat([chunk, chunked_group.popleft()], ignore_index=True) - - chunk_folder = f"{schedule_path}/{number_of_schedules:0=4}" - os.mkdir(chunk_folder) - chunk.to_csv(f"{chunk_folder}/_config.csv", index=False, header=True) - - print(f"{number_of_schedules}={chunk['Nodes'].sum()}") - - number_of_schedules += 1 - - -def load_env(env: str) -> Dict[str, Any]: - from dotenv import load_dotenv, dotenv_values - - load_dotenv(env) - return dotenv_values(env)
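For context on the `split_dataframe` / `create_schedule_configs` helpers removed above: they greedily pack server rows into chunks whose sum over a budget column (such as `Nodes`) stays within a per-worker limit, and each resulting chunk is written out as a `_config.csv` schedule. The sketch below shows only that packing idea in isolation (first-fit-decreasing over plain dicts); it is not the removed implementation, and the sample data and names (`pack_rows`, `server_*`) are hypothetical.

```python
from typing import Dict, List


def pack_rows(rows: List[Dict[str, int]],
              by_column: str = "Nodes",
              chunk_size: int = 20) -> List[List[Dict[str, int]]]:
    """Greedily pack rows into chunks whose sum of `by_column` stays <= chunk_size."""
    chunks: List[List[Dict[str, int]]] = []
    # First-fit-decreasing: place the largest rows first.
    for row in sorted(rows, key=lambda r: r[by_column], reverse=True):
        for chunk in chunks:
            if sum(r[by_column] for r in chunk) + row[by_column] <= chunk_size:
                chunk.append(row)
                break
        else:  # no existing chunk has room; open a new one
            chunks.append([row])
    return chunks


if __name__ == "__main__":
    # Hypothetical server table; in the removed code this came from a pandas DataFrame.
    servers = [{"ServerName": f"server_{i}", "Nodes": n}
               for i, n in enumerate([12, 9, 7, 5, 3])]
    for i, chunk in enumerate(pack_rows(servers)):
        print(i, [r["ServerName"] for r in chunk], sum(r["Nodes"] for r in chunk))
```

The removed version differed in detail: it extended the most recently opened chunk rather than scanning all chunks, and `create_schedule_configs` then merged chunks until their `TotalBatches` sum reached the `number_of_workers * 50` threshold before writing each numbered schedule folder.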