From ff90ac91ffe9554802d98d53351a4deda5c655bb Mon Sep 17 00:00:00 2001
From: Ankit Solomon
Date: Fri, 6 Dec 2024 21:22:16 +0530
Subject: [PATCH] HBASE-28988 Enhance WALPlayer for restore of BulkLoad
---
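Reviewer note (below the ---, so it is not part of the commit message): with this change args[0] is treated as a backup root that is expected to contain a "WALs" subdirectory with the WAL files to replay and a "bulk-load-files" subdirectory with the backed-up HFiles; bulk load markers found in the WALs are resolved against that location, copied into the HBase bulkload staging directory, and re-loaded through BulkLoadHFiles. A minimal invocation sketch under that assumed layout follows; the driver class name, backup path, and table names are placeholders for illustration only.

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.hbase.HBaseConfiguration;
  import org.apache.hadoop.hbase.mapreduce.WALPlayer;
  import org.apache.hadoop.util.ToolRunner;

  public class RestoreFromBackup {
    public static void main(String[] args) throws Exception {
      Configuration conf = HBaseConfiguration.create();
      // args[0] = backup root containing "WALs/" and "bulk-load-files/";
      // args[1] = source table(s); args[2] = optional table mapping.
      int exitCode = ToolRunner.run(conf, new WALPlayer(),
        new String[] { "/backups/2024-12-06", "ns:t1", "ns:t1_restored" });
      System.exit(exitCode);
    }
  }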
.../test/IntegrationTestBigLinkedList.java | 3 +-
.../test/IntegrationTestLoadAndVerify.java | 3 +-
.../mapreduce/MultiTableOutputFormat.java | 59 ++++++++-
.../hadoop/hbase/mapreduce/WALPlayer.java | 122 +++++++++++++++++-
4 files changed, 174 insertions(+), 13 deletions(-)
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java
index 58c329c0cd76..f836c06d2c3b 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java
@@ -81,6 +81,7 @@
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Random64;
import org.apache.hadoop.hbase.util.RegionSplitter;
import org.apache.hadoop.hbase.wal.WALEdit;
@@ -945,7 +946,7 @@ public static class WALMapperSearcher extends WALMapper {
private AtomicInteger rows = new AtomicInteger(0);
@Override
- public void setup(Mapper<WALKey, WALEdit, ImmutableBytesWritable, Mutation>.Context context)
+ public void setup(Mapper<WALKey, WALEdit, ImmutableBytesWritable, Pair<Mutation, List<String>>>.Context context)
throws IOException {
super.setup(context);
try {
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestLoadAndVerify.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestLoadAndVerify.java
index 5566bd79cab0..fde93827dee0 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestLoadAndVerify.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestLoadAndVerify.java
@@ -70,6 +70,7 @@
import org.apache.hadoop.hbase.util.AbstractHBaseTool;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CommonFSUtils;
+import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALKey;
import org.apache.hadoop.io.BytesWritable;
@@ -387,7 +388,7 @@ public static class WALMapperSearcher extends WALMapper {
private AtomicInteger rows = new AtomicInteger(0);
@Override
- public void setup(Mapper<WALKey, WALEdit, ImmutableBytesWritable, Mutation>.Context context)
+ public void setup(Mapper<WALKey, WALEdit, ImmutableBytesWritable, Pair<Mutation, List<String>>>.Context context)
throws IOException {
super.setup(context);
try {
diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/MultiTableOutputFormat.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/MultiTableOutputFormat.java
index 35c12672deac..f75682977712 100644
--- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/MultiTableOutputFormat.java
+++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/MultiTableOutputFormat.java
@@ -18,10 +18,16 @@
package org.apache.hadoop.hbase.mapreduce;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Connection;
@@ -30,8 +36,12 @@
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.tool.BulkLoadHFiles;
import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.CommonFSUtils;
+import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
@@ -56,7 +66,7 @@
*
*/
@InterfaceAudience.Public
-public class MultiTableOutputFormat extends OutputFormat<ImmutableBytesWritable, Mutation> {
+public class MultiTableOutputFormat extends OutputFormat<ImmutableBytesWritable, Pair<Mutation, List<String>>> {
/** Set this to {@link #WAL_OFF} to turn off write-ahead logging (WAL) */
public static final String WAL_PROPERTY = "hbase.mapreduce.multitableoutputformat.wal";
/** Property value to use write-ahead logging */
@@ -68,7 +78,7 @@ public class MultiTableOutputFormat extends OutputFormat<ImmutableBytesWritable, Mutation> {
+ extends RecordWriter<ImmutableBytesWritable, Pair<Mutation, List<String>>> {
private static final Logger LOG = LoggerFactory.getLogger(MultiTableRecordWriter.class);
Connection connection;
Map<ImmutableBytesWritable, BufferedMutator> mutatorMap = new HashMap<>();
@@ -119,7 +129,15 @@ public void close(TaskAttemptContext context) throws IOException {
* either a put or a delete. if the action is not a put or a delete.
*/
@Override
- public void write(ImmutableBytesWritable tableName, Mutation action) throws IOException {
+ public void write(ImmutableBytesWritable tableName, Pair<Mutation, List<String>> action) throws IOException {
+ if (action.getFirst() != null) {
+ handleMutation(tableName, action.getFirst());
+ return;
+ }
+ handleBulkLoad(tableName, action.getSecond());
+ }
+
+ private void handleMutation(ImmutableBytesWritable tableName, Mutation action) throws IOException {
BufferedMutator mutator = getBufferedMutator(tableName);
// The actions are not immutable, so we defensively copy them
if (action instanceof Put) {
@@ -131,6 +149,39 @@ public void write(ImmutableBytesWritable tableName, Mutation action) throws IOEx
mutator.mutate(delete);
} else throw new IllegalArgumentException("action must be either Delete or Put");
}
+
+ private void handleBulkLoad(ImmutableBytesWritable tableName, List<String> bulkLoadFiles)
+ throws IOException {
+
+ TableName table = TableName.valueOf(tableName.get());
+ LOG.info("Starting bulk load for table: {}", table);
+
+ BulkLoadHFiles bulkLoader = BulkLoadHFiles.create(conf);
+ LOG.info("Processing {} HFiles for bulk loading into table: {}", bulkLoadFiles.size(), table);
+
+ // This map will hold the family-to-files mapping needed for the bulk load operation
+ Map<byte[], List<Path>> family2Files = new HashMap<>();
+
+ try {
+ for (String file : bulkLoadFiles) {
+ Path filePath = new Path(file);
+ String family = filePath.getParent().getName();
+ byte[] familyBytes = Bytes.toBytes(family);
+
+ // Add the file to the list of files for the corresponding column family
+ family2Files.computeIfAbsent(familyBytes, k -> new ArrayList<>()).add(filePath);
+ LOG.info("Mapped file {} to family {}", filePath, family);
+ }
+
+ LOG.info("Executing bulk load into table: {}", table);
+ bulkLoader.bulkLoad(table, family2Files);
+
+ LOG.info("Bulk load completed successfully for table: {}", table);
+ } catch (IOException e) {
+ LOG.error("Error during bulk load for table: {}. Exception: {}", table, e.getMessage(), e);
+ throw new IOException("Failed to complete bulk load for table: " + table, e);
+ }
+ }
}
@Override
@@ -146,7 +197,7 @@ public OutputCommitter getOutputCommitter(TaskAttemptContext context)
}
@Override
- public RecordWriter<ImmutableBytesWritable, Mutation> getRecordWriter(TaskAttemptContext context)
+ public RecordWriter<ImmutableBytesWritable, Pair<Mutation, List<String>>> getRecordWriter(TaskAttemptContext context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
return new MultiTableRecordWriter(HBaseConfiguration.create(conf),
diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/WALPlayer.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/WALPlayer.java
index 99b1dd112b98..e1516ccb1024 100644
--- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/WALPlayer.java
+++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/WALPlayer.java
@@ -27,13 +27,17 @@
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
+import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.ExtendedCell;
import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.PrivateCellUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
@@ -46,9 +50,12 @@
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2.TableInfo;
import org.apache.hadoop.hbase.regionserver.wal.WALCellCodec;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos;
import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.MapReduceExtendedCell;
+import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALEditInternalHelper;
import org.apache.hadoop.hbase.wal.WALKey;
@@ -79,6 +86,7 @@ public class WALPlayer extends Configured implements Tool {
public final static String INPUT_FILES_SEPARATOR_KEY = "wal.input.separator";
public final static String IGNORE_MISSING_FILES = "wal.input.ignore.missing.files";
public final static String MULTI_TABLES_SUPPORT = "wal.multi.tables.support";
+ public final static String BULKLOAD_BACKUP_LOCATION = "wal.bulk.backup.location";
protected static final String tableSeparator = ";";
@@ -156,7 +164,7 @@ protected static enum Counter {
* A mapper that writes out {@link Mutation} to be directly applied to a running HBase instance.
*/
protected static class WALMapper
- extends Mapper<WALKey, WALEdit, ImmutableBytesWritable, Mutation> {
+ extends Mapper<WALKey, WALEdit, ImmutableBytesWritable, Pair<Mutation, List<String>>> {
private Map<TableName, TableName> tables = new TreeMap<>();
@Override
@@ -172,6 +180,52 @@ public void map(WALKey key, WALEdit value, Context context) throws IOException {
ExtendedCell lastCell = null;
for (ExtendedCell cell : WALEditInternalHelper.getExtendedCells(value)) {
context.getCounter(Counter.CELLS_READ).increment(1);
+
+ if (CellUtil.matchingQualifier(cell, WALEdit.BULK_LOAD)) {
+ String namespace = key.getTableName().getNamespaceAsString();
+ String tableName = key.getTableName().getQualifierAsString();
+ LOG.info("Processing bulk load for namespace: {}, table: {}", namespace, tableName);
+
+ List<String> bulkloadFiles = handleBulkLoadCell(cell);
+ LOG.info("Found {} bulk load files for table: {}", bulkloadFiles.size(), tableName);
+
+ // Prefix each file path with namespace and table name to construct the full paths
+ List<String> bulkloadFilesWithFullPath = bulkloadFiles.stream()
+ .map(filePath -> new Path(namespace, new Path(tableName, filePath)).toString())
+ .collect(Collectors.toList());
+ LOG.info("Bulk load files with full paths: {}", bulkloadFilesWithFullPath.size());
+
+ // Retrieve configuration and set up file systems for backup and staging locations
+ Configuration conf = context.getConfiguration();
+ Path backupLocation = new Path(conf.get(BULKLOAD_BACKUP_LOCATION));
+ FileSystem rootFs = CommonFSUtils.getRootDirFileSystem(conf); // HDFS filesystem
+ Path hbaseStagingDir = new Path(CommonFSUtils.getRootDir(conf), HConstants.BULKLOAD_STAGING_DIR_NAME);
+ FileSystem backupFs = FileSystem.get(backupLocation.toUri(), conf);
+
+ List<String> stagingPaths = new ArrayList<>();
+
+ try {
+ for (String file : bulkloadFilesWithFullPath) {
+ // Full file path from S3
+ Path fullBackupFilePath = new Path(backupLocation, file);
+ // Staging path on HDFS
+ Path stagingPath = new Path(hbaseStagingDir, file);
+
+ LOG.info("Copying file from backup location (S3): {} to HDFS staging: {}", fullBackupFilePath, stagingPath);
+ // Copy the file from S3 to HDFS
+ FileUtil.copy(backupFs, fullBackupFilePath, rootFs, stagingPath, false, conf);
+
+ stagingPaths.add(stagingPath.toString());
+ }
+ } catch (IOException e) {
+ LOG.error("Error copying files for bulk load: {}", e.getMessage(), e);
+ throw new IOException("Failed to copy files for bulk load.", e);
+ }
+
+ Pair<Mutation, List<String>> p = new Pair<>(null, stagingPaths);
+ context.write(tableOut, p);
+ }
+
// Filtering WAL meta marker entries.
if (WALEdit.isMetaEditFamily(cell)) {
continue;
@@ -188,11 +242,13 @@ public void map(WALKey key, WALEdit value, Context context) throws IOException {
) {
// row or type changed, write out aggregate KVs.
if (put != null) {
- context.write(tableOut, put);
+ Pair<Mutation, List<String>> p = new Pair<>(put, null);
+ context.write(tableOut, p);
context.getCounter(Counter.PUTS).increment(1);
}
if (del != null) {
- context.write(tableOut, del);
+ Pair<Mutation, List<String>> p = new Pair<>(del, null);
+ context.write(tableOut, p);
context.getCounter(Counter.DELETES).increment(1);
}
if (CellUtil.isDelete(cell)) {
@@ -212,12 +268,14 @@ public void map(WALKey key, WALEdit value, Context context) throws IOException {
}
// write residual KVs
if (put != null) {
- context.write(tableOut, put);
+ Pair<Mutation, List<String>> p = new Pair<>(put, null);
+ context.write(tableOut, p);
context.getCounter(Counter.PUTS).increment(1);
}
if (del != null) {
+ Pair<Mutation, List<String>> p = new Pair<>(del, null);
+ context.write(tableOut, p);
context.getCounter(Counter.DELETES).increment(1);
- context.write(tableOut, del);
}
}
} catch (InterruptedException e) {
@@ -230,9 +288,56 @@ protected boolean filter(Context context, final Cell cell) {
return true;
}
+ private List<String> handleBulkLoadCell(Cell cell) throws IOException {
+ List<String> resultFiles = new ArrayList<>();
+ LOG.info("Bulk load detected in cell. Processing...");
+
+ WALProtos.BulkLoadDescriptor bld = WALEdit.getBulkLoadDescriptor(cell);
+
+ if (bld == null) {
+ LOG.info("BulkLoadDescriptor is null for cell: {}", cell);
+ return resultFiles;
+ }
+ if (!bld.getReplicate()) {
+ LOG.info("Replication is disabled for bulk load cell: {}", cell);
+ }
+
+ String regionName = bld.getEncodedRegionName().toStringUtf8();
+
+ LOG.info("Encoded region name: {}", regionName);
+
+ List<WALProtos.StoreDescriptor> storesList = bld.getStoresList();
+ if (storesList == null) {
+ LOG.info("Store descriptor list is null for region: {}", regionName);
+ return resultFiles;
+ }
+
+ for (WALProtos.StoreDescriptor storeDescriptor : storesList) {
+ String columnFamilyName = storeDescriptor.getFamilyName().toStringUtf8();
+ LOG.info("Processing column family: {}", columnFamilyName);
+
+ List<String> storeFileList = storeDescriptor.getStoreFileList();
+ if (storeFileList == null) {
+ LOG.info("Store file list is null for column family: {}", columnFamilyName);
+ continue;
+ }
+
+ for (String storeFile : storeFileList) {
+ String hFilePath = getHFilePath(regionName, columnFamilyName, storeFile);
+ LOG.info("Adding HFile path to bulk load file paths: {}", hFilePath);
+ resultFiles.add(hFilePath);
+ }
+ }
+ return resultFiles;
+ }
+
+ private String getHFilePath(String regionName, String columnFamilyName, String storeFileName) {
+ return new Path(regionName, new Path(columnFamilyName, storeFileName)).toString();
+ }
+
@Override
protected void
- cleanup(Mapper<WALKey, WALEdit, ImmutableBytesWritable, Mutation>.Context context)
+ cleanup(Mapper<WALKey, WALEdit, ImmutableBytesWritable, Pair<Mutation, List<String>>>.Context context)
throws IOException, InterruptedException {
super.cleanup(context);
}
@@ -293,6 +398,8 @@ public Job createSubmittableJob(String[] args) throws IOException {
setupTime(conf, WALInputFormat.START_TIME_KEY);
setupTime(conf, WALInputFormat.END_TIME_KEY);
String inputDirs = args[0];
+ String walDir = new Path(inputDirs, "WALs").toString();
+ String bulkLoadFilesDir = new Path(inputDirs, "bulk-load-files").toString();
String[] tables = args.length == 1 ? new String[] {} : args[1].split(",");
String[] tableMap;
if (args.length > 2) {
@@ -306,7 +413,8 @@ public Job createSubmittableJob(String[] args) throws IOException {
}
conf.setStrings(TABLES_KEY, tables);
conf.setStrings(TABLE_MAP_KEY, tableMap);
- conf.set(FileInputFormat.INPUT_DIR, inputDirs);
+ conf.set(FileInputFormat.INPUT_DIR, walDir);
+ conf.set(BULKLOAD_BACKUP_LOCATION, bulkLoadFilesDir);
Job job = Job.getInstance(conf,
conf.get(JOB_NAME_CONF_KEY, NAME + "_" + EnvironmentEdgeManager.currentTime()));
job.setJarByClass(WALPlayer.class);