diff --git a/fe/fe-core/src/main/java/com/starrocks/lake/LakeTablet.java b/fe/fe-core/src/main/java/com/starrocks/lake/LakeTablet.java index 67074d196c672..010b73c5772a7 100644 --- a/fe/fe-core/src/main/java/com/starrocks/lake/LakeTablet.java +++ b/fe/fe-core/src/main/java/com/starrocks/lake/LakeTablet.java @@ -140,6 +140,9 @@ public void getQueryableReplicas(List allQuerableReplicas, List computeNodeIds = GlobalStateMgr.getCurrentState().getWarehouseMgr() .getAllComputeNodeIdsAssignToTablet(warehouseId, this); + if (computeNodeIds == null) { + return; + } for (long backendId : computeNodeIds) { Replica replica = new Replica(getId(), backendId, visibleVersion, schemaHash, getDataSize(true), getRowCount(visibleVersion), NORMAL, -1, visibleVersion); diff --git a/fe/fe-core/src/main/java/com/starrocks/server/WarehouseManager.java b/fe/fe-core/src/main/java/com/starrocks/server/WarehouseManager.java index 212da41656d81..89c067e4fd4cc 100644 --- a/fe/fe-core/src/main/java/com/starrocks/server/WarehouseManager.java +++ b/fe/fe-core/src/main/java/com/starrocks/server/WarehouseManager.java @@ -37,6 +37,7 @@ import java.io.DataOutput; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -207,8 +208,10 @@ public Set getAllComputeNodeIdsAssignToTablet(Long warehouseId, LakeTablet " warehouse %d", warehouseId)); } List computeNodeIds = systemInfoService.internalTabletMapper().computeNodesForTablet(tablet.getId()); - if (computeNodeIds == null) { - return null; + if (computeNodeIds == null || computeNodeIds.isEmpty()) { + LOG.warn("no compute nodes assigned to tablet {} in warehouse {}", tablet.getId(), + warehouseId); + return Collections.emptySet(); } return new HashSet<>(computeNodeIds); } diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/AlterSystemStmtAnalyzer.java b/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/AlterSystemStmtAnalyzer.java index 804bda9c02395..47cb1e7db61b7 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/AlterSystemStmtAnalyzer.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/AlterSystemStmtAnalyzer.java @@ -163,7 +163,8 @@ private void analyzeProperties(Map properties, Set suppo continue; } // Support single level location label for now - String regex = "(\\s*[a-z_0-9]+\\s*:\\s*[a-z_0-9]+\\s*)"; + // allow matching an empty val (after the colon) + String regex = "(\\s*[a-z_0-9]+\\s*:\\s*[a-z_0-9]*\\s*)"; if (!Pattern.compile(regex).matcher(propVal).matches()) { throw new SemanticException("invalid 'location' or 'group' format: " + propVal + ", should be like: 'key:val'"); diff --git a/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java b/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java index b15c3f13f6859..a485aa1958cf5 100644 --- a/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java +++ b/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java @@ -37,6 +37,8 @@ import com.starrocks.common.util.ConsistentHashRing; import com.starrocks.common.util.HashRing; import com.starrocks.server.GlobalStateMgr; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.jetbrains.annotations.TestOnly; import java.util.Collections; @@ -65,6 +67,7 @@ */ public class TabletComputeNodeMapper { + private static final Logger LOG = LogManager.getLogger(TabletComputeNodeMapper.class); private static final int CONSISTENT_HASH_RING_VIRTUAL_NUMBER = 256; private static final Long ARBITRARY_FAKE_TABLET = 1L; @@ -169,9 +172,12 @@ public void modifyComputeNode(Long computeNodeId, String oldResourceIsolationGroup, String newResourceIsolationGroup) { oldResourceIsolationGroup = getResourceIsolationGroupName(oldResourceIsolationGroup); newResourceIsolationGroup = getResourceIsolationGroupName(newResourceIsolationGroup); - if (oldResourceIsolationGroup.equals(newResourceIsolationGroup)) { - return; - } + // We run the following even if oldResourceIsolationGroup.equals(newResourceIsolationGroup) + // because we want to cleanly handle edge cases where the compute node hasn't already been + // added to the TabletComputeNode mapper. This can happen in at least one situation, which + // is when the cluster is first upgraded to include resource isolation groups. + // Because the host ips match during a CN restart, upstream code which adds the ComputeNodes + // will not execute and therefore we won't call this.addComputeNode. writeLock.lock(); try { removeComputeNodeUnsynchronized(computeNodeId, oldResourceIsolationGroup); @@ -195,7 +201,9 @@ public List computeNodesForTablet(Long tabletId, int count, String resourc readLock.lock(); try { if (!this.resourceIsolationGroupToTabletMapping.containsKey(resourceIsolationGroup)) { - return null; + LOG.warn(String.format("Requesting node for resource isolation group %s, to which" + + " there is not a known CN assigned.", resourceIsolationGroup)); + return Collections.emptyList(); } TabletMap m = this.resourceIsolationGroupToTabletMapping.get(resourceIsolationGroup); return m.tabletToComputeNodeId.get(tabletId, count); diff --git a/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java b/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java index ee23c96f665e1..df3a372149121 100644 --- a/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java +++ b/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java @@ -143,13 +143,13 @@ public NodeMgr getNodeMgr() { public SystemInfoService getClusterInfo() { return systemInfo; } + @Mock public Frontend getMySelf() { return thisFe; } }; - TabletComputeNodeMapper tabletComputeNodeMapper = new TabletComputeNodeMapper(); tabletComputeNodeMapper.addComputeNode(1L, thisFe.getResourceIsolationGroup()); String otherResourceIsolationGroup = "someothergroup"; @@ -191,12 +191,12 @@ public TabletComputeNodeMapper internalTabletMapper() { mgr.initDefaultWarehouse(); LakeTablet arbitraryTablet = new LakeTablet(1001L); - Assert.assertEquals(Set.of(1L), mgr.getAllComputeNodeIdsAssignToTablet( - WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet)); + Assert.assertEquals(Set.of(1L), + mgr.getAllComputeNodeIdsAssignToTablet(WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet)); thisFe.setResourceIsolationGroup(otherResourceIsolationGroup); - Assert.assertEquals(Set.of(2L), mgr.getAllComputeNodeIdsAssignToTablet( - WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet)); + Assert.assertEquals(Set.of(2L), + mgr.getAllComputeNodeIdsAssignToTablet(WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet)); // Check that WarehouseManager.getAllComputeNodeIdsAssignToTablet delegates to // systemInfo.getAvailableComputeNodeIds. @@ -365,7 +365,10 @@ public RunMode getCurrentRunMode() { MaterializedIndex index = new MaterializedIndex(1, MaterializedIndex.IndexState.NORMAL); ErrorReportException ex = Assert.assertThrows(ErrorReportException.class, () -> scanNode.addScanRangeLocations(partition, partition, index, Collections.emptyList(), 1)); - Assert.assertEquals("No alive backend or compute node in warehouse null.", ex.getMessage()); + Assert.assertEquals( + "No alive backend or compute node in warehouse null. Also possible that there are no CN of the resource " + + "isolation group matching the FE.", + ex.getMessage()); } private OlapScanNode newOlapScanNode() { diff --git a/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java b/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java index c032c8c3fc965..ac19568355070 100644 --- a/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java +++ b/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java @@ -41,6 +41,7 @@ import org.junit.Before; import org.junit.Test; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -48,6 +49,7 @@ public class TabletComputeNodeMapperTest { private Frontend thisFe; + @Before public void setUp() { thisFe = new Frontend(); @@ -66,6 +68,17 @@ public void setUp() { public void tearDown() { } + @Test + public void modifyComputeNodeEdgeCases() throws Exception { + Long arbitraryTablet = 9000L; + TabletComputeNodeMapper mapper = new TabletComputeNodeMapper(); + Assert.assertEquals(0, mapper.numResourceIsolationGroups()); + Assert.assertEquals(Collections.emptyList(), mapper.computeNodesForTablet(arbitraryTablet, 1, "")); + mapper.modifyComputeNode(1L, "", ""); + Assert.assertEquals(1, mapper.numResourceIsolationGroups()); + Assert.assertEquals(List.of(1L), mapper.computeNodesForTablet(arbitraryTablet, 1, "")); + } + @Test public void testGroupManagementEdgeCase() throws Exception { TabletComputeNodeMapper mapper = new TabletComputeNodeMapper(); @@ -75,7 +88,6 @@ public void testGroupManagementEdgeCase() throws Exception { Assert.assertTrue(mapper.trackingNonDefaultResourceIsolationGroup()); } - @Test public void testGroupManagement() throws Exception { TabletComputeNodeMapper mapper = new TabletComputeNodeMapper(); @@ -143,7 +155,6 @@ public void testTabletToCnMapping() throws Exception { Assert.assertEquals(2, mapper.numResourceIsolationGroups()); - int tabletsToTry = 10000; long[] tabletIdToGroup2Primary = new long[tabletsToTry]; long[] tabletIdToGroup2Backup = new long[tabletsToTry];