diff --git a/.gitignore b/.gitignore
index a00bc94..79a1ed5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 test.log
 error.log
 rebalance_db.txt
-testing_data
\ No newline at end of file
+rebalance.db
+testing_data
diff --git a/README.md b/README.md
index a7b6564..ed02822 100644
--- a/README.md
+++ b/README.md
@@ -16,17 +16,23 @@ Note that this process is not entirely "in-place", since a file has to be fully
 At no point in time are both versions of the original file deleted.
 To make sure file attributes, permissions and file content are maintained when copying the original file, all attributes and the file checksum is compared before removing the original file (if not disabled using `--checksum false`).
 
-Since file attributes are fully retained, it is not possible to verify if an individual file has been rebalanced. However, this script keeps track of rebalanced files by maintaining a "database" file in its working directory called `rebalance_db.txt` (if not disabled using `--passes 0`). This file contains two lines of text for each processed file:
+Since file attributes are fully retained, it is not possible to verify if an individual file has been rebalanced. However, this script keeps track of rebalanced files by maintaining a database in its working directory called `rebalance.db` (if not disabled using `--passes 0`). This file is a standard SQLite 3 database, containing a single table `balancing`:
 
-* One line for the file path
-* and the next line for the current count of rebalance passes
+* `file` column contains full file path
+* `passes` column contains current count of rebalance passes
 
-```text
-/my/example/pool/file1.mkv
-1
-/my/example/pool/file2.mkv
-1
 ```
+# sqlite3 rebalance.db
+sqlite> .mode column
+sqlite> .headers on
+sqlite> SELECT * FROM balancing LIMIT 3;
+file                        passes
+--------------------------  ------
+/my/example/pool/file1.mkv  1
+/my/example/pool/file2.mkv  2
+/my/example/pool/file3.mkv  1
+```
+
 
 ## Prerequisites
 
@@ -83,6 +89,7 @@ chmod +x ./zfs-inplace-rebalancing.sh
 Dependencies:
 
 * `perl` - it should be available on most systems by default
+* `sqlite3` - it is installed by default on TrueNAS/Ubuntu/macOS, and available as an optional package in other distributions
 
 ## Usage
 
@@ -118,6 +125,7 @@ To keep track of the balancing progress, you can open another terminal and run:
 watch zpool list -v
 ```
 
+
 ### Log to File
 
 To write the output to a file, simply redirect stdout and stderr to a file (or separate files).
@@ -138,6 +146,11 @@ When aborting the script midway through, be sure to check the last lines of its
 
 Although the `--passes` parameter can be used to limit the maximum amount of rebalance passes per file, it is only meant to speedup aborted runs. Individual files will **not be process multiple times automatically**. To reach multiple passes you have to run the script on the same target directory multiple times.
 
+
+### Legacy database
+
+In previous versions of the script, the database was stored in a different, plain-text format in `rebalance_db.txt`. If this file is present when running a newer version, the run is aborted. You can either delete `rebalance_db.txt` if you don't care about previously recorded passes, or convert it: this repository provides `convert-legacy-db.sh`, which converts the legacy `rebalance_db.txt` database file into the new `rebalance.db` format.
+
 ### Dockerfile
 
 To increase portability, this script can also be run using docker:
diff --git a/convert-legacy-db.sh b/convert-legacy-db.sh
new file mode 100644
index 0000000..c221416
--- /dev/null
+++ b/convert-legacy-db.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+set -eu
+
+####
+#### This script converts legacy "rebalance_db.txt" database file to new "rebalance.db" format
+#### It utilizes a CSV intermediary as this is the fastest way of ingesting a large dataset into SQLite
+####
+
+rebalance_db_file_name='rebalance_db.txt'
+rebalance_sqldb_file='rebalance.db'
+rebalance_csv_tmp='rebalance.csv_tmp'
+
+echo "Importing ${rebalance_db_file_name} into ${rebalance_sqldb_file}..."
+
+echo "Creating SQL database at ${rebalance_sqldb_file}"
+# fails (and aborts the script) if the table already exists => it's FOR SURE an empty db
+sqlite3 "${rebalance_sqldb_file}" 'create table balancing (file text primary key, passes integer)'
+
+total=$(($(wc -l < "${rebalance_db_file_name}") / 2)) # two lines (path, pass count) per file
+converted=0
+path=''
+echo "Generating CSV at ${rebalance_csv_tmp}"
+echo -n > "${rebalance_csv_tmp}"
+while IFS="" read -r line || [ -n "$line" ]; do
+    if [[ -z "${path}" ]]; then
+        path="${line}"
+        continue
+    fi
+
+    echo "\"${path//\"/\"\"}\",${line}" >> "${rebalance_csv_tmp}"
+    path=''
+    echo -e -n "\r=> Generated $((converted+=1)) of ${total} lines"
+done < "./${rebalance_db_file_name}"
+echo -e "\r=> Processed ${total} items to CSV at ${rebalance_csv_tmp}"
+
+echo "Importing data to ${rebalance_sqldb_file}..."
+sqlite3 "${rebalance_sqldb_file}" '.mode csv' ".import ${rebalance_csv_tmp} balancing"
+
+echo "Optimizing database..."
+sqlite3 "${rebalance_sqldb_file}" 'VACUUM'
+
+echo 'Cleaning up...'
+rm "${rebalance_csv_tmp}"
+mv "${rebalance_db_file_name}" "${rebalance_sqldb_file}_legacy"
diff --git a/testing.sh b/testing.sh
index b46f74f..307e799 100755
--- a/testing.sh
+++ b/testing.sh
@@ -14,7 +14,7 @@ function prepare() {
   # cleanup
   rm -f $log_std_file
   rm -f $log_error_file
-  rm -f rebalance_db.txt
+  rm -f rebalance.db
   rm -rf $test_pool_data_path
 
   # setup
diff --git a/zfs-inplace-rebalancing.sh b/zfs-inplace-rebalancing.sh
index b66a01f..6965677 100755
--- a/zfs-inplace-rebalancing.sh
+++ b/zfs-inplace-rebalancing.sh
@@ -5,8 +5,18 @@ set -e
 # exit on undeclared variable
 set -u
 
-# file used to track processed files
-rebalance_db_file_name="rebalance_db.txt"
+# run the last stage of a pipeline in the current shell (bash >= 4.2), so that
+# database changes cached inside the "find ... | while read" loops below are
+# still visible to the final persist_database call instead of dying in a subshell
+shopt -s lastpipe
+
+# processed files database runtime variables
+rebalance_db_file_name="rebalance.db" # database filename
+
+# keeps changes before these are persisted to the database
+rebalance_db_cache='' # SQL statements waiting to be written
+rebalance_db_save_interval=60 # how often changes are persisted to the database in seconds
+rebalance_db_last_save=$SECONDS # when the database was last persisted
 
 # index used for progress
 current_index=0
@@ -36,24 +46,76 @@ function color_echo () {
     echo -e "${color}${text}${Color_Off}"
 }
 
+# Loads existing rebalance database, or creates a new one. Requires no parameters.
+# Output: one-line status message
+# Returns: non-zero when the database cannot be created or queried
+function init_database () {
+    if [[ "${passes_flag}" -le 0 ]]; then
+        echo "skipped (--passes <= 0 requested)"
+        return
+    fi
+
+    # failures are reported with an explicit "|| return 1": this function is called from
+    # a command substitution, where bash disables "set -e"
+    if [[ ! -r "${rebalance_db_file_name}" ]]; then # database unreadable => either no db or no permissions
+        # try to create a new db - if this is a permission problem this will fail [as intended]
+        sqlite3 "${rebalance_db_file_name}" "CREATE TABLE balancing (file text primary key, passes integer)" || return 1
+        echo "initialized in ${rebalance_db_file_name}"
+    else # db is readable - do a simple sanity check to make sure it isn't broken/locked
+        local balanced
+        balanced=$(sqlite3 "${rebalance_db_file_name}" "SELECT COUNT(*) FROM balancing") || return 1
+        echo "found ${balanced} records in ${rebalance_db_file_name}"
+    fi
+}
 
+# Provides number of already completed balancing passes for a given file
+# Use: get_rebalance_count "/path/to/file"
+# Output: a non-negative integer
 function get_rebalance_count () {
-    file_path=$1
+    local count quote="'"
+    # SQL escapes a single quote by doubling it; a backslash is not an escape character
+    count=$(sqlite3 "${rebalance_db_file_name}" "SELECT passes FROM balancing WHERE file = '${1//${quote}/${quote}${quote}}'")
+    echo "${count:-0}"
+}
 
-    line_nr=$(grep -xF -n "${file_path}" "./${rebalance_db_file_name}" | head -n 1 | cut -d: -f1)
-    if [ -z "${line_nr}" ]; then
-        echo "0"
-        return
-    else
-        rebalance_count_line_nr="$((line_nr + 1))"
-        rebalance_count=$(awk "NR == ${rebalance_count_line_nr}" "./${rebalance_db_file_name}")
-        echo "${rebalance_count}"
-        return
+# Writes all cached changes to the database in a single transaction. Requires no parameters.
+function persist_database () {
+    # nothing cached (e.g. --passes 0) => nothing to write
+    if [[ -z "${rebalance_db_cache}" ]]; then
+        return
+    fi
+
+    color_echo "${Cyan}" "Flushing database changes..."
+    sqlite3 "${rebalance_db_file_name}" <<< "BEGIN TRANSACTION;${rebalance_db_cache}COMMIT;"
+    rebalance_db_cache=''
+    rebalance_db_last_save=$SECONDS
+}
+
+# Sets number of completed balancing passes for a given file
+# Use: set_rebalance_count "/path/to/file" 123
+function set_rebalance_count () {
+    local quote="'"
+    # SQL escapes a single quote by doubling it; a backslash is not an escape character
+    rebalance_db_cache+="INSERT OR REPLACE INTO balancing VALUES('${1//${quote}/${quote}${quote}}', $2);"
+    color_echo "${Green}" "File $1 completed $2 rebalance cycles"
+
+    # $SECONDS counts the seconds since the shell was started. Bash derives it from
+    # the wall clock, so a system time change can make it jump in either direction.
+    # Using the absolute difference keeps a backwards jump from postponing the save
+    # indefinitely: unless the time changes constantly, changes are persisted
+    # roughly every $rebalance_db_save_interval seconds.
+    local time_now=$SECONDS
+    local time_since_last=$((time_now >= rebalance_db_last_save ? time_now - rebalance_db_last_save : rebalance_db_last_save - time_now))
+    if [[ $time_since_last -ge $rebalance_db_save_interval ]]; then
+        persist_database
     fi
 }
 
-# rebalance a specific file
+# Rebalance a specific file
+# Use: rebalance "/path/to/file"
+# Output: log lines
 function rebalance () {
+    local file_path
     file_path=$1
 
     # check if file has >=2 links in the case of --skip-hardlinks
@@ -69,22 +131,27 @@ function rebalance () {
     fi
 
     current_index="$((current_index + 1))"
-    progress_percent=$(perl -e "printf('%0.2f', ${current_index}*100/${file_count})") 
-    color_echo "${Cyan}" "Progress -- Files: ${current_index}/${file_count} (${progress_percent}%)" 
+    progress_percent=$(perl -e "printf('%0.2f', ${current_index}*100/${file_count})")
+    color_echo "${Cyan}" "Progress -- Files: ${current_index}/${file_count} (${progress_percent}%)"
 
     if [[ ! -f "${file_path}" ]]; then
-        color_echo "${Yellow}" "File is missing, skipping: ${file_path}" 
+        color_echo "${Yellow}" "File is missing, skipping: ${file_path}"
+        return
     fi
 
-    if [ "${passes_flag}" -ge 1 ]; then
-        # check if target rebalance count is reached
+
+    if [[ "${passes_flag}" -ge 1 ]]; then
+        # this count is reused later to update database
+        local rebalance_count
         rebalance_count=$(get_rebalance_count "${file_path}")
-        if [ "${rebalance_count}" -ge "${passes_flag}" ]; then
-            color_echo "${Yellow}" "Rebalance count (${passes_flag}) reached, skipping: ${file_path}"
-            return 
+
+        # check if target rebalance count is reached
+        if [[ "${rebalance_count}" -ge "${passes_flag}" ]]; then
+            color_echo "${Yellow}" "Rebalance count of ${passes_flag} reached (${rebalance_count}), skipping: ${file_path}"
+            return
         fi
     fi
-    
+
     tmp_extension=".balance"
     tmp_file_path="${file_path}${tmp_extension}"
 
@@ -172,17 +239,7 @@ function rebalance () {
     mv "${tmp_file_path}" "${file_path}"
 
     if [ "${passes_flag}" -ge 1 ]; then
-        # update rebalance "database"
-        line_nr=$(grep -xF -n "${file_path}" "./${rebalance_db_file_name}" | head -n 1 | cut -d: -f1)
-        if [ -z "${line_nr}" ]; then
-            rebalance_count=1
-            echo "${file_path}" >> "./${rebalance_db_file_name}"
-            echo "${rebalance_count}" >> "./${rebalance_db_file_name}"
-        else
-            rebalance_count_line_nr="$((line_nr + 1))"
-            rebalance_count="$((rebalance_count + 1))"
-            sed -i "${rebalance_count_line_nr}s/.*/${rebalance_count}/" "./${rebalance_db_file_name}"
-        fi
+        set_rebalance_count "${file_path}" "$((rebalance_count + 1))"
     fi
 }
 
@@ -224,14 +281,23 @@ while true ; do
         *)
             break
         ;;
-    esac 
+    esac
 done;
 
 root_path=$1
 
-color_echo "$Cyan" "Start rebalancing $(date):"
+# ensure we don't do something unexpected
+if [[ -r "rebalance_db.txt" ]]; then
+    color_echo "${Red}" 'Found legacy database file in "rebalance_db.txt". To avoid possible unintended operations the process will terminate. You can either convert the legacy database using "convert-legacy-db.sh" script, or simply delete/rename "rebalance_db.txt"'
+    exit 2
+fi
+
+color_echo "$Cyan" "Start rebalancing:"
 color_echo "$Cyan" "  Path: ${root_path}"
 color_echo "$Cyan" "  Rebalancing Passes: ${passes_flag}"
+# a plain assignment (unlike a command substitution nested in arguments) passes a failure on to "set -e"
+rebalance_db_status=$(init_database)
+color_echo "$Cyan" "  Rebalancing DB: ${rebalance_db_status}"
 color_echo "$Cyan" "  Use Checksum: ${checksum_flag}"
 color_echo "$Cyan" "  Skip Hardlinks: ${skip_hardlinks_flag}"
 
@@ -244,11 +310,6 @@ fi
 
 color_echo "$Cyan" "  File count: ${file_count}"
 
-# create db file
-if [ "${passes_flag}" -ge 1 ]; then
-    touch "./${rebalance_db_file_name}"
-fi
-
 # recursively scan through files and execute "rebalance" procedure
 # in the case of --skip-hardlinks, only find files with links == 1
 if [[ "${skip_hardlinks_flag,,}" == "true"* ]]; then
@@ -257,6 +318,9 @@ else
     find "$root_path" -type f -print0 | while IFS= read -r -d '' file; do rebalance "$file"; done
 fi
 
+# There may be some pending changes as we will almost never hit the interval perfectly - flush it
+persist_database
+
 echo ""
 echo ""
 color_echo "$Green" "Done!"