From d7e9d31e1e4eda4715a68ec2936bdae0840d6f31 Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 15 Oct 2024 15:59:00 -0700 Subject: [PATCH 1/3] bugfix for issue during upgrade --- .../main/java/com/starrocks/lake/LakeTablet.java | 3 +++ .../sql/analyzer/AlterSystemStmtAnalyzer.java | 3 ++- .../starrocks/system/TabletComputeNodeMapper.java | 14 +++++++++++--- .../system/TabletComputeNodeMapperTest.java | 11 +++++++++++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/fe/fe-core/src/main/java/com/starrocks/lake/LakeTablet.java b/fe/fe-core/src/main/java/com/starrocks/lake/LakeTablet.java index 67074d196c672..010b73c5772a7 100644 --- a/fe/fe-core/src/main/java/com/starrocks/lake/LakeTablet.java +++ b/fe/fe-core/src/main/java/com/starrocks/lake/LakeTablet.java @@ -140,6 +140,9 @@ public void getQueryableReplicas(List allQuerableReplicas, List computeNodeIds = GlobalStateMgr.getCurrentState().getWarehouseMgr() .getAllComputeNodeIdsAssignToTablet(warehouseId, this); + if (computeNodeIds == null) { + return; + } for (long backendId : computeNodeIds) { Replica replica = new Replica(getId(), backendId, visibleVersion, schemaHash, getDataSize(true), getRowCount(visibleVersion), NORMAL, -1, visibleVersion); diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/AlterSystemStmtAnalyzer.java b/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/AlterSystemStmtAnalyzer.java index 804bda9c02395..47cb1e7db61b7 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/AlterSystemStmtAnalyzer.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/AlterSystemStmtAnalyzer.java @@ -163,7 +163,8 @@ private void analyzeProperties(Map properties, Set suppo continue; } // Support single level location label for now - String regex = "(\\s*[a-z_0-9]+\\s*:\\s*[a-z_0-9]+\\s*)"; + // allow matching an empty val (after the colon) + String regex = "(\\s*[a-z_0-9]+\\s*:\\s*[a-z_0-9]*\\s*)"; if (!Pattern.compile(regex).matcher(propVal).matches()) { throw new SemanticException("invalid 'location' or 'group' format: " + propVal + ", should be like: 'key:val'"); diff --git a/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java b/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java index b15c3f13f6859..a3502ddd4a79e 100644 --- a/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java +++ b/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java @@ -37,6 +37,8 @@ import com.starrocks.common.util.ConsistentHashRing; import com.starrocks.common.util.HashRing; import com.starrocks.server.GlobalStateMgr; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.jetbrains.annotations.TestOnly; import java.util.Collections; @@ -65,6 +67,7 @@ */ public class TabletComputeNodeMapper { + private static final Logger LOG = LogManager.getLogger(TabletComputeNodeMapper.class); private static final int CONSISTENT_HASH_RING_VIRTUAL_NUMBER = 256; private static final Long ARBITRARY_FAKE_TABLET = 1L; @@ -169,9 +172,12 @@ public void modifyComputeNode(Long computeNodeId, String oldResourceIsolationGroup, String newResourceIsolationGroup) { oldResourceIsolationGroup = getResourceIsolationGroupName(oldResourceIsolationGroup); newResourceIsolationGroup = getResourceIsolationGroupName(newResourceIsolationGroup); - if (oldResourceIsolationGroup.equals(newResourceIsolationGroup)) { - return; - } + // We run the following even if oldResourceIsolationGroup.equals(newResourceIsolationGroup) + // because we want to cleanly handle edge cases where the compute node hasn't already been + // added to the TabletComputeNode mapper. This can happen in at least one situation, which + // is when the cluster is first upgraded to include resource isolation groups. + // Because the host ips match during a CN restart, upstream code which adds the ComputeNodes + // will not execute and therefore we won't call this.addComputeNode. writeLock.lock(); try { removeComputeNodeUnsynchronized(computeNodeId, oldResourceIsolationGroup); @@ -195,6 +201,8 @@ public List computeNodesForTablet(Long tabletId, int count, String resourc readLock.lock(); try { if (!this.resourceIsolationGroupToTabletMapping.containsKey(resourceIsolationGroup)) { + LOG.warn(String.format("Requesting node for resource isolation group %s, to which" + + " there is not a known CN assigned.", resourceIsolationGroup)); return null; } TabletMap m = this.resourceIsolationGroupToTabletMapping.get(resourceIsolationGroup); diff --git a/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java b/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java index c032c8c3fc965..2308d14c39503 100644 --- a/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java +++ b/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java @@ -66,6 +66,17 @@ public void setUp() { public void tearDown() { } + @Test + public void modifyComputeNodeEdgeCases() throws Exception { + Long arbitraryTablet = 9000L; + TabletComputeNodeMapper mapper = new TabletComputeNodeMapper(); + Assert.assertEquals(0, mapper.numResourceIsolationGroups()); + Assert.assertNull(mapper.computeNodesForTablet(arbitraryTablet, 1, "")); + mapper.modifyComputeNode(1L, "", ""); + Assert.assertEquals(1, mapper.numResourceIsolationGroups()); + Assert.assertEquals(List.of(1L), mapper.computeNodesForTablet(arbitraryTablet, 1, "")); + } + @Test public void testGroupManagementEdgeCase() throws Exception { TabletComputeNodeMapper mapper = new TabletComputeNodeMapper(); From fe98b3774a32b9d60eb50ecc5b58a6fc80e8ccdf Mon Sep 17 00:00:00 2001 From: Connor Date: Wed, 16 Oct 2024 11:17:42 -0700 Subject: [PATCH 2/3] return empty collection instead of null if no cn available --- .../main/java/com/starrocks/server/WarehouseManager.java | 7 +++++-- .../java/com/starrocks/system/TabletComputeNodeMapper.java | 2 +- .../java/com/starrocks/server/WarehouseManagerTest.java | 2 +- .../com/starrocks/system/TabletComputeNodeMapperTest.java | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fe/fe-core/src/main/java/com/starrocks/server/WarehouseManager.java b/fe/fe-core/src/main/java/com/starrocks/server/WarehouseManager.java index 212da41656d81..89c067e4fd4cc 100644 --- a/fe/fe-core/src/main/java/com/starrocks/server/WarehouseManager.java +++ b/fe/fe-core/src/main/java/com/starrocks/server/WarehouseManager.java @@ -37,6 +37,7 @@ import java.io.DataOutput; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -207,8 +208,10 @@ public Set getAllComputeNodeIdsAssignToTablet(Long warehouseId, LakeTablet " warehouse %d", warehouseId)); } List computeNodeIds = systemInfoService.internalTabletMapper().computeNodesForTablet(tablet.getId()); - if (computeNodeIds == null) { - return null; + if (computeNodeIds == null || computeNodeIds.isEmpty()) { + LOG.warn("no compute nodes assigned to tablet {} in warehouse {}", tablet.getId(), + warehouseId); + return Collections.emptySet(); } return new HashSet<>(computeNodeIds); } diff --git a/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java b/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java index a3502ddd4a79e..a485aa1958cf5 100644 --- a/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java +++ b/fe/fe-core/src/main/java/com/starrocks/system/TabletComputeNodeMapper.java @@ -203,7 +203,7 @@ public List computeNodesForTablet(Long tabletId, int count, String resourc if (!this.resourceIsolationGroupToTabletMapping.containsKey(resourceIsolationGroup)) { LOG.warn(String.format("Requesting node for resource isolation group %s, to which" + " there is not a known CN assigned.", resourceIsolationGroup)); - return null; + return Collections.emptyList(); } TabletMap m = this.resourceIsolationGroupToTabletMapping.get(resourceIsolationGroup); return m.tabletToComputeNodeId.get(tabletId, count); diff --git a/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java b/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java index ee23c96f665e1..fe50ff1ed2267 100644 --- a/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java +++ b/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java @@ -365,7 +365,7 @@ public RunMode getCurrentRunMode() { MaterializedIndex index = new MaterializedIndex(1, MaterializedIndex.IndexState.NORMAL); ErrorReportException ex = Assert.assertThrows(ErrorReportException.class, () -> scanNode.addScanRangeLocations(partition, partition, index, Collections.emptyList(), 1)); - Assert.assertEquals("No alive backend or compute node in warehouse null.", ex.getMessage()); + Assert.assertEquals("No alive backend or compute node in warehouse null. Also possible that there are no CN of the resource isolation group matching the FE.", ex.getMessage()); } private OlapScanNode newOlapScanNode() { diff --git a/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java b/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java index 2308d14c39503..86e0593bd600a 100644 --- a/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java +++ b/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java @@ -35,6 +35,7 @@ import com.starrocks.server.GlobalStateMgr; import com.starrocks.server.NodeMgr; +import java.util.Collections; import mockit.Expectations; import org.junit.After; import org.junit.Assert; @@ -71,7 +72,7 @@ public void modifyComputeNodeEdgeCases() throws Exception { Long arbitraryTablet = 9000L; TabletComputeNodeMapper mapper = new TabletComputeNodeMapper(); Assert.assertEquals(0, mapper.numResourceIsolationGroups()); - Assert.assertNull(mapper.computeNodesForTablet(arbitraryTablet, 1, "")); + Assert.assertEquals(Collections.emptyList(), mapper.computeNodesForTablet(arbitraryTablet, 1, "")); mapper.modifyComputeNode(1L, "", ""); Assert.assertEquals(1, mapper.numResourceIsolationGroups()); Assert.assertEquals(List.of(1L), mapper.computeNodesForTablet(arbitraryTablet, 1, "")); From d2c99225974c5e8d4cc4a6d30f52706c2fa89521 Mon Sep 17 00:00:00 2001 From: Connor Date: Wed, 16 Oct 2024 15:10:00 -0700 Subject: [PATCH 3/3] lint --- .../starrocks/server/WarehouseManagerTest.java | 15 +++++++++------ .../system/TabletComputeNodeMapperTest.java | 5 ++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java b/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java index fe50ff1ed2267..df3a372149121 100644 --- a/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java +++ b/fe/fe-core/src/test/java/com/starrocks/server/WarehouseManagerTest.java @@ -143,13 +143,13 @@ public NodeMgr getNodeMgr() { public SystemInfoService getClusterInfo() { return systemInfo; } + @Mock public Frontend getMySelf() { return thisFe; } }; - TabletComputeNodeMapper tabletComputeNodeMapper = new TabletComputeNodeMapper(); tabletComputeNodeMapper.addComputeNode(1L, thisFe.getResourceIsolationGroup()); String otherResourceIsolationGroup = "someothergroup"; @@ -191,12 +191,12 @@ public TabletComputeNodeMapper internalTabletMapper() { mgr.initDefaultWarehouse(); LakeTablet arbitraryTablet = new LakeTablet(1001L); - Assert.assertEquals(Set.of(1L), mgr.getAllComputeNodeIdsAssignToTablet( - WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet)); + Assert.assertEquals(Set.of(1L), + mgr.getAllComputeNodeIdsAssignToTablet(WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet)); thisFe.setResourceIsolationGroup(otherResourceIsolationGroup); - Assert.assertEquals(Set.of(2L), mgr.getAllComputeNodeIdsAssignToTablet( - WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet)); + Assert.assertEquals(Set.of(2L), + mgr.getAllComputeNodeIdsAssignToTablet(WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet)); // Check that WarehouseManager.getAllComputeNodeIdsAssignToTablet delegates to // systemInfo.getAvailableComputeNodeIds. @@ -365,7 +365,10 @@ public RunMode getCurrentRunMode() { MaterializedIndex index = new MaterializedIndex(1, MaterializedIndex.IndexState.NORMAL); ErrorReportException ex = Assert.assertThrows(ErrorReportException.class, () -> scanNode.addScanRangeLocations(partition, partition, index, Collections.emptyList(), 1)); - Assert.assertEquals("No alive backend or compute node in warehouse null. Also possible that there are no CN of the resource isolation group matching the FE.", ex.getMessage()); + Assert.assertEquals( + "No alive backend or compute node in warehouse null. Also possible that there are no CN of the resource " + + "isolation group matching the FE.", + ex.getMessage()); } private OlapScanNode newOlapScanNode() { diff --git a/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java b/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java index 86e0593bd600a..ac19568355070 100644 --- a/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java +++ b/fe/fe-core/src/test/java/com/starrocks/system/TabletComputeNodeMapperTest.java @@ -35,13 +35,13 @@ import com.starrocks.server.GlobalStateMgr; import com.starrocks.server.NodeMgr; -import java.util.Collections; import mockit.Expectations; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -49,6 +49,7 @@ public class TabletComputeNodeMapperTest { private Frontend thisFe; + @Before public void setUp() { thisFe = new Frontend(); @@ -87,7 +88,6 @@ public void testGroupManagementEdgeCase() throws Exception { Assert.assertTrue(mapper.trackingNonDefaultResourceIsolationGroup()); } - @Test public void testGroupManagement() throws Exception { TabletComputeNodeMapper mapper = new TabletComputeNodeMapper(); @@ -155,7 +155,6 @@ public void testTabletToCnMapping() throws Exception { Assert.assertEquals(2, mapper.numResourceIsolationGroups()); - int tabletsToTry = 10000; long[] tabletIdToGroup2Primary = new long[tabletsToTry]; long[] tabletIdToGroup2Backup = new long[tabletsToTry];