diff --git a/.github/workflows/slow_test.yml b/.github/workflows/slow_test.yml index af61d7c185..5c18ecf363 100644 --- a/.github/workflows/slow_test.yml +++ b/.github/workflows/slow_test.yml @@ -77,7 +77,8 @@ jobs: - name: Collect infinity release output if: ${{ !cancelled() && failure() }} # always run this step even if previous steps failed - run: (cat release.log 2>/dev/null; cat /var/infinity/log/infinity.log 2>/dev/null) || true + run: | + sudo python3 scripts/collect_log.py --log_path=/var/infinity/log/infinity.log --stdout_path=release.log --output_dir=${RUNNER_WORKSPACE_PREFIX}/log - name: Prepare restart test data if: ${{ !cancelled() && !failure() }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0cf17bb721..b71619127b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -108,7 +108,8 @@ jobs: - name: Collect infinity debug output if: ${{ !cancelled() && failure() }} # GitHub Actions interprets output lines starting with "Error" as error messages, and it automatically sets the step status to failed when such lines are detected. - run: (cat debug.log 2>/dev/null; cat /var/infinity/log/infinity.log 2>/dev/null) || true + run: | + sudo python3 scripts/collect_log.py --log_path=/var/infinity/log/infinity.log --stdout_path=debug.log --output_dir=${RUNNER_WORKSPACE_PREFIX}/log - name: Start infinity debug version with vfs off if: ${{ !cancelled() && !failure() }} @@ -134,7 +135,8 @@ jobs: - name: Collect infinity debug output if: ${{ !cancelled() && failure() }} # GitHub Actions interprets output lines starting with "Error" as error messages, and it automatically sets the step status to failed when such lines are detected. - run: (cat vfs_debug.log 2>/dev/null; cat /var/infinity/log/infinity.log 2>/dev/null) || true + run: | + sudo python3 scripts/collect_log.py --log_path=/var/infinity/log/infinity.log --stdout_path=vfs_debug.log --output_dir=${RUNNER_WORKSPACE_PREFIX}/log - name: Unit test debug version if: ${{ !cancelled() && !failure() }} @@ -247,7 +249,8 @@ jobs: - name: Collect infinity release output if: ${{ !cancelled() && failure() }} # always run this step even if previous steps failed # GitHub Actions interprets output lines starting with "Error" as error messages, and it automatically sets the step status to failed when such lines are detected. - run: (cat release.log 2>/dev/null; cat /var/infinity/log/infinity.log 2>/dev/null) || true + run: | + sudo python3 scripts/collect_log.py --log_path=/var/infinity/log/infinity.log --stdout_path=release.log --output_dir=${RUNNER_WORKSPACE_PREFIX}/log - name: Start infinity release version with vfs off if: ${{ !cancelled() && !failure() }} @@ -273,7 +276,8 @@ jobs: - name: Collect infinity release output if: ${{ !cancelled() && failure() }} # always run this step even if previous steps failed # GitHub Actions interprets output lines starting with "Error" as error messages, and it automatically sets the step status to failed when such lines are detected. - run: (cat release.log 2>/dev/null; cat /var/infinity/log/infinity.log 2>/dev/null) || true + run: | + sudo python3 scripts/collect_log.py --log_path=/var/infinity/log/infinity.log --stdout_path=vfs_release.log --output_dir=${RUNNER_WORKSPACE_PREFIX}/log - name: Unit test release version if: ${{ !cancelled() && !failure() }} diff --git a/conf/pytest_parallel_infinity_conf.toml b/conf/pytest_parallel_infinity_conf.toml index 778813025a..f5efd734fe 100644 --- a/conf/pytest_parallel_infinity_conf.toml +++ b/conf/pytest_parallel_infinity_conf.toml @@ -8,6 +8,8 @@ server_address = "0.0.0.0" [log] log_filename = "infinity.log" log_dir = "/var/infinity/log" +log_to_stdout = true +log_level = "trace" [storage] persistence_dir = "/var/infinity/persistence" diff --git a/conf/pytest_parallel_infinity_vfs_off.toml b/conf/pytest_parallel_infinity_vfs_off.toml index 47a3f57a9c..08af3aa21b 100644 --- a/conf/pytest_parallel_infinity_vfs_off.toml +++ b/conf/pytest_parallel_infinity_vfs_off.toml @@ -8,6 +8,8 @@ server_address = "0.0.0.0" [log] log_filename = "infinity.log" log_dir = "/var/infinity/log" +log_to_stdout = true +log_level = "trace" [storage] data_dir = "/var/infinity/data" diff --git a/scripts/collect_log.py b/scripts/collect_log.py new file mode 100644 index 0000000000..17ae84215f --- /dev/null +++ b/scripts/collect_log.py @@ -0,0 +1,41 @@ +import os +import shutil +import random +import string +import argparse + +parser = argparse.ArgumentParser(description="Collect and copy log files.") +parser.add_argument( + "--log_path", type=str, required=True, help="Path to the infinity log file" +) +parser.add_argument( + "--stdout_path", type=str, required=True, help="Path to the output debug log file" +) +parser.add_argument( + "--output_dir", + type=str, + default="/var/infinity/log", + help="Path to the output directory", +) +args = parser.parse_args() + +log_path = args.log_path +stdout_path = args.stdout_path +output_dir = args.output_dir + +if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + +random_name = "".join(random.choices(string.ascii_lowercase + string.digits, k=8)) +print(f"Random log file name: {random_name}") + +if not os.path.isfile(stdout_path): + print("Error: stdout file not found") +else: + shutil.copy(stdout_path, f"{output_dir}/{random_name}_.log") + +if not os.path.isfile(log_path): + print("Error: /var/infinity/log/infinity.log not found") +else: + shutil.copy(log_path, f"{output_dir}/{random_name}.log") diff --git a/src/main/cluster_manager.cpp b/src/main/cluster_manager.cpp index c530d3e251..ccf97b72df 100644 --- a/src/main/cluster_manager.cpp +++ b/src/main/cluster_manager.cpp @@ -246,6 +246,7 @@ void ClusterManager::CheckHeartBeatInner() { if (node_info->node_status_ == NodeStatus::kAlive) { if (node_info->last_update_ts_ + 2 * leader_node_->heartbeat_interval_ < this_node_->last_update_ts_) { node_info->node_status_ = NodeStatus::kTimeout; + LOG_INFO(fmt::format("Node {} is timeout", node_name)); } } } diff --git a/src/storage/buffer/file_worker/file_worker.cpp b/src/storage/buffer/file_worker/file_worker.cpp index f6c0fc98ac..9325c033c6 100644 --- a/src/storage/buffer/file_worker/file_worker.cpp +++ b/src/storage/buffer/file_worker/file_worker.cpp @@ -106,9 +106,8 @@ bool FileWorker::WriteToFile(bool to_spill, const FileWorkerSaveCtx &ctx) { void FileWorker::ReadFromFile(bool from_spill) { LocalFileSystem fs; - String read_path; bool use_object_cache = !from_spill && persistence_manager_ != nullptr; - read_path = fmt::format("{}/{}", ChooseFileDir(from_spill), *file_name_); + String read_path = fmt::format("{}/{}", ChooseFileDir(from_spill), *file_name_); if (use_object_cache) { obj_addr_ = persistence_manager_->GetObjCache(read_path); if (!obj_addr_.Valid()) { @@ -134,7 +133,7 @@ void FileWorker::ReadFromFile(bool from_spill) { file_handler_->Close(); file_handler_ = nullptr; if (use_object_cache && obj_addr_.Valid()) { - read_path = fmt::format("{}/{}", ChooseFileDir(from_spill), *file_name_); + String read_path = fmt::format("{}/{}", ChooseFileDir(from_spill), *file_name_); persistence_manager_->PutObjCache(read_path); } }); diff --git a/src/storage/meta/entry/base_entry.cpp b/src/storage/meta/entry/base_entry.cpp index 6ed7239da8..dfa1c47d41 100644 --- a/src/storage/meta/entry/base_entry.cpp +++ b/src/storage/meta/entry/base_entry.cpp @@ -39,6 +39,11 @@ bool BaseEntry::CheckVisible(Txn *txn) const { UnrecoverableError(error_message); } // Check if the entry is in committing process, because commit_ts of the base_entry is set in the Txn::CommitBottom + if (txn_id_ == 0) { + // could not check if the entry is visible accurately. log a warning and return true + LOG_WARN(fmt::format("Entry {} txn id is not set", *encode_)); + return true; + } return txn_mgr->CheckIfCommitting(txn_id_, begin_ts); } diff --git a/src/storage/persistence/persistence_manager.cpp b/src/storage/persistence/persistence_manager.cpp index d3630bbdf1..333c2d1555 100644 --- a/src/storage/persistence/persistence_manager.cpp +++ b/src/storage/persistence/persistence_manager.cpp @@ -320,7 +320,7 @@ ObjAddr PersistenceManager::GetObjCache(const String &file_path) { void PersistenceManager::PutObjCache(const String &file_path) { String local_path = RemovePrefix(file_path); if (local_path.empty()) { - String error_message = fmt::format("Failed to find local path of {}", local_path); + String error_message = fmt::format("Failed to find file path of {}", file_path); UnrecoverableError(error_message); } @@ -330,13 +330,17 @@ void PersistenceManager::PutObjCache(const String &file_path) { String error_message = fmt::format("Failed to find file_path: {} stored object", local_path); UnrecoverableError(error_message); } - assert(it->second.part_size_ != 0); + if (it->second.part_size_ == 0) { + UnrecoverableError(fmt::format("PutObjCache object {} part size is 0", it->second.obj_key_)); + } auto oit = objects_.find(it->second.obj_key_); if (oit == objects_.end()) { return; } - assert(oit->second.ref_count_ > 0); + if (oit->second.ref_count_ <= 0) { + UnrecoverableError(fmt::format("PutObjCache object {} ref count is {}", it->second.obj_key_, oit->second.ref_count_)); + } oit->second.ref_count_--; } diff --git a/test/data/config/restart_test/test_memidx/1.toml b/test/data/config/restart_test/test_memidx/1.toml index 4eac098af0..f585861273 100644 --- a/test/data/config/restart_test/test_memidx/1.toml +++ b/test/data/config/restart_test/test_memidx/1.toml @@ -5,6 +5,7 @@ time_zone = "utc-8" [network] [log] log_to_stdout = true +log_level = "trace" [storage] optimize_interval = "0s" diff --git a/test/data/config/restart_test/test_memidx/2.toml b/test/data/config/restart_test/test_memidx/2.toml index 197131074a..a3b80fde6b 100644 --- a/test/data/config/restart_test/test_memidx/2.toml +++ b/test/data/config/restart_test/test_memidx/2.toml @@ -5,6 +5,7 @@ time_zone = "utc-8" [network] [log] log_to_stdout = true +log_level = "trace" [storage] optimize_interval = "0s" diff --git a/test/data/config/restart_test/test_memidx/3.toml b/test/data/config/restart_test/test_memidx/3.toml index 565828eba7..25c768e90d 100644 --- a/test/data/config/restart_test/test_memidx/3.toml +++ b/test/data/config/restart_test/test_memidx/3.toml @@ -5,6 +5,7 @@ time_zone = "utc-8" [network] [log] log_to_stdout = true +log_level = "trace" [storage] optimize_interval = "1s"