Skip to content

Commit

Permalink
Merge pull request #14 from pinterest/cbrennan/rig_fix
Browse files Browse the repository at this point in the history
Bugfix NPE that occurs during version upgrade with empty/default resource isolation group
  • Loading branch information
ctbrennan authored Oct 18, 2024
2 parents bec060d + d2c9922 commit dbccef4
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 15 deletions.
3 changes: 3 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/lake/LakeTablet.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ public void getQueryableReplicas(List<Replica> allQuerableReplicas, List<Replica
long visibleVersion, long localBeId, int schemaHash, long warehouseId) {
Set<Long> computeNodeIds = GlobalStateMgr.getCurrentState().getWarehouseMgr()
.getAllComputeNodeIdsAssignToTablet(warehouseId, this);
if (computeNodeIds == null) {
return;
}
for (long backendId : computeNodeIds) {
Replica replica = new Replica(getId(), backendId, visibleVersion, schemaHash, getDataSize(true),
getRowCount(visibleVersion), NORMAL, -1, visibleVersion);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
Expand Down Expand Up @@ -207,8 +208,10 @@ public Set<Long> getAllComputeNodeIdsAssignToTablet(Long warehouseId, LakeTablet
" warehouse %d", warehouseId));
}
List<Long> computeNodeIds = systemInfoService.internalTabletMapper().computeNodesForTablet(tablet.getId());
if (computeNodeIds == null) {
return null;
if (computeNodeIds == null || computeNodeIds.isEmpty()) {
LOG.warn("no compute nodes assigned to tablet {} in warehouse {}", tablet.getId(),
warehouseId);
return Collections.emptySet();
}
return new HashSet<>(computeNodeIds);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,8 @@ private void analyzeProperties(Map<String, String> properties, Set<String> suppo
continue;
}
// Support single level location label for now
String regex = "(\\s*[a-z_0-9]+\\s*:\\s*[a-z_0-9]+\\s*)";
// allow matching an empty val (after the colon)
String regex = "(\\s*[a-z_0-9]+\\s*:\\s*[a-z_0-9]*\\s*)";
if (!Pattern.compile(regex).matcher(propVal).matches()) {
throw new SemanticException("invalid 'location' or 'group' format: " + propVal +
", should be like: 'key:val'");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
import com.starrocks.common.util.ConsistentHashRing;
import com.starrocks.common.util.HashRing;
import com.starrocks.server.GlobalStateMgr;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jetbrains.annotations.TestOnly;

import java.util.Collections;
Expand Down Expand Up @@ -65,6 +67,7 @@
*/

public class TabletComputeNodeMapper {
private static final Logger LOG = LogManager.getLogger(TabletComputeNodeMapper.class);

private static final int CONSISTENT_HASH_RING_VIRTUAL_NUMBER = 256;
private static final Long ARBITRARY_FAKE_TABLET = 1L;
Expand Down Expand Up @@ -169,9 +172,12 @@ public void modifyComputeNode(Long computeNodeId,
String oldResourceIsolationGroup, String newResourceIsolationGroup) {
oldResourceIsolationGroup = getResourceIsolationGroupName(oldResourceIsolationGroup);
newResourceIsolationGroup = getResourceIsolationGroupName(newResourceIsolationGroup);
if (oldResourceIsolationGroup.equals(newResourceIsolationGroup)) {
return;
}
// We run the following even if oldResourceIsolationGroup.equals(newResourceIsolationGroup)
// because we want to cleanly handle edge cases where the compute node hasn't already been
// added to the TabletComputeNode mapper. This can happen in at least one situation, which
// is when the cluster is first upgraded to include resource isolation groups.
// Because the host ips match during a CN restart, upstream code which adds the ComputeNodes
// will not execute and therefore we won't call this.addComputeNode.
writeLock.lock();
try {
removeComputeNodeUnsynchronized(computeNodeId, oldResourceIsolationGroup);
Expand All @@ -195,7 +201,9 @@ public List<Long> computeNodesForTablet(Long tabletId, int count, String resourc
readLock.lock();
try {
if (!this.resourceIsolationGroupToTabletMapping.containsKey(resourceIsolationGroup)) {
return null;
LOG.warn(String.format("Requesting node for resource isolation group %s, to which"
+ " there is not a known CN assigned.", resourceIsolationGroup));
return Collections.emptyList();
}
TabletMap m = this.resourceIsolationGroupToTabletMapping.get(resourceIsolationGroup);
return m.tabletToComputeNodeId.get(tabletId, count);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,13 @@ public NodeMgr getNodeMgr() {
public SystemInfoService getClusterInfo() {
return systemInfo;
}

@Mock
public Frontend getMySelf() {
return thisFe;
}
};


TabletComputeNodeMapper tabletComputeNodeMapper = new TabletComputeNodeMapper();
tabletComputeNodeMapper.addComputeNode(1L, thisFe.getResourceIsolationGroup());
String otherResourceIsolationGroup = "someothergroup";
Expand Down Expand Up @@ -191,12 +191,12 @@ public TabletComputeNodeMapper internalTabletMapper() {
mgr.initDefaultWarehouse();

LakeTablet arbitraryTablet = new LakeTablet(1001L);
Assert.assertEquals(Set.of(1L), mgr.getAllComputeNodeIdsAssignToTablet(
WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet));
Assert.assertEquals(Set.of(1L),
mgr.getAllComputeNodeIdsAssignToTablet(WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet));

thisFe.setResourceIsolationGroup(otherResourceIsolationGroup);
Assert.assertEquals(Set.of(2L), mgr.getAllComputeNodeIdsAssignToTablet(
WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet));
Assert.assertEquals(Set.of(2L),
mgr.getAllComputeNodeIdsAssignToTablet(WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet));

// Check that WarehouseManager.getAllComputeNodeIdsAssignToTablet delegates to
// systemInfo.getAvailableComputeNodeIds.
Expand Down Expand Up @@ -365,7 +365,10 @@ public RunMode getCurrentRunMode() {
MaterializedIndex index = new MaterializedIndex(1, MaterializedIndex.IndexState.NORMAL);
ErrorReportException ex = Assert.assertThrows(ErrorReportException.class,
() -> scanNode.addScanRangeLocations(partition, partition, index, Collections.emptyList(), 1));
Assert.assertEquals("No alive backend or compute node in warehouse null.", ex.getMessage());
Assert.assertEquals(
"No alive backend or compute node in warehouse null. Also possible that there are no CN of the resource " +
"isolation group matching the FE.",
ex.getMessage());
}

private OlapScanNode newOlapScanNode() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,15 @@
import org.junit.Before;
import org.junit.Test;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class TabletComputeNodeMapperTest {
private Frontend thisFe;

@Before
public void setUp() {
thisFe = new Frontend();
Expand All @@ -66,6 +68,17 @@ public void setUp() {
public void tearDown() {
}

@Test
public void modifyComputeNodeEdgeCases() throws Exception {
Long arbitraryTablet = 9000L;
TabletComputeNodeMapper mapper = new TabletComputeNodeMapper();
Assert.assertEquals(0, mapper.numResourceIsolationGroups());
Assert.assertEquals(Collections.emptyList(), mapper.computeNodesForTablet(arbitraryTablet, 1, ""));
mapper.modifyComputeNode(1L, "", "");
Assert.assertEquals(1, mapper.numResourceIsolationGroups());
Assert.assertEquals(List.of(1L), mapper.computeNodesForTablet(arbitraryTablet, 1, ""));
}

@Test
public void testGroupManagementEdgeCase() throws Exception {
TabletComputeNodeMapper mapper = new TabletComputeNodeMapper();
Expand All @@ -75,7 +88,6 @@ public void testGroupManagementEdgeCase() throws Exception {
Assert.assertTrue(mapper.trackingNonDefaultResourceIsolationGroup());
}


@Test
public void testGroupManagement() throws Exception {
TabletComputeNodeMapper mapper = new TabletComputeNodeMapper();
Expand Down Expand Up @@ -143,7 +155,6 @@ public void testTabletToCnMapping() throws Exception {

Assert.assertEquals(2, mapper.numResourceIsolationGroups());


int tabletsToTry = 10000;
long[] tabletIdToGroup2Primary = new long[tabletsToTry];
long[] tabletIdToGroup2Backup = new long[tabletsToTry];
Expand Down

0 comments on commit dbccef4

Please sign in to comment.