Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix NPE that occurs during version upgrade with empty/default resource isolation group #14

Merged
merged 3 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/lake/LakeTablet.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ public void getQueryableReplicas(List<Replica> allQuerableReplicas, List<Replica
long visibleVersion, long localBeId, int schemaHash, long warehouseId) {
Set<Long> computeNodeIds = GlobalStateMgr.getCurrentState().getWarehouseMgr()
.getAllComputeNodeIdsAssignToTablet(warehouseId, this);
if (computeNodeIds == null) {
return;
}
for (long backendId : computeNodeIds) {
Replica replica = new Replica(getId(), backendId, visibleVersion, schemaHash, getDataSize(true),
getRowCount(visibleVersion), NORMAL, -1, visibleVersion);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
Expand Down Expand Up @@ -207,8 +208,10 @@ public Set<Long> getAllComputeNodeIdsAssignToTablet(Long warehouseId, LakeTablet
" warehouse %d", warehouseId));
}
List<Long> computeNodeIds = systemInfoService.internalTabletMapper().computeNodesForTablet(tablet.getId());
if (computeNodeIds == null) {
return null;
if (computeNodeIds == null || computeNodeIds.isEmpty()) {
LOG.warn("no compute nodes assigned to tablet {} in warehouse {}", tablet.getId(),
warehouseId);
return Collections.emptySet();
}
return new HashSet<>(computeNodeIds);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,8 @@ private void analyzeProperties(Map<String, String> properties, Set<String> suppo
continue;
}
// Support single level location label for now
String regex = "(\\s*[a-z_0-9]+\\s*:\\s*[a-z_0-9]+\\s*)";
// allow matching an empty val (after the colon)
String regex = "(\\s*[a-z_0-9]+\\s*:\\s*[a-z_0-9]*\\s*)";
if (!Pattern.compile(regex).matcher(propVal).matches()) {
throw new SemanticException("invalid 'location' or 'group' format: " + propVal +
", should be like: 'key:val'");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
import com.starrocks.common.util.ConsistentHashRing;
import com.starrocks.common.util.HashRing;
import com.starrocks.server.GlobalStateMgr;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jetbrains.annotations.TestOnly;

import java.util.Collections;
Expand Down Expand Up @@ -65,6 +67,7 @@
*/

public class TabletComputeNodeMapper {
private static final Logger LOG = LogManager.getLogger(TabletComputeNodeMapper.class);

private static final int CONSISTENT_HASH_RING_VIRTUAL_NUMBER = 256;
private static final Long ARBITRARY_FAKE_TABLET = 1L;
Expand Down Expand Up @@ -169,9 +172,12 @@ public void modifyComputeNode(Long computeNodeId,
String oldResourceIsolationGroup, String newResourceIsolationGroup) {
oldResourceIsolationGroup = getResourceIsolationGroupName(oldResourceIsolationGroup);
newResourceIsolationGroup = getResourceIsolationGroupName(newResourceIsolationGroup);
if (oldResourceIsolationGroup.equals(newResourceIsolationGroup)) {
return;
}
// We run the following even if oldResourceIsolationGroup.equals(newResourceIsolationGroup)
// because we want to cleanly handle edge cases where the compute node hasn't already been
// added to the TabletComputeNode mapper. This can happen in at least one situation, which
// is when the cluster is first upgraded to include resource isolation groups.
// Because the host ips match during a CN restart, upstream code which adds the ComputeNodes
// will not execute and therefore we won't call this.addComputeNode.
writeLock.lock();
try {
removeComputeNodeUnsynchronized(computeNodeId, oldResourceIsolationGroup);
Expand All @@ -195,7 +201,9 @@ public List<Long> computeNodesForTablet(Long tabletId, int count, String resourc
readLock.lock();
try {
if (!this.resourceIsolationGroupToTabletMapping.containsKey(resourceIsolationGroup)) {
return null;
LOG.warn(String.format("Requesting node for resource isolation group %s, to which"
+ " there is not a known CN assigned.", resourceIsolationGroup));
return Collections.emptyList();
}
TabletMap m = this.resourceIsolationGroupToTabletMapping.get(resourceIsolationGroup);
return m.tabletToComputeNodeId.get(tabletId, count);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,13 @@ public NodeMgr getNodeMgr() {
public SystemInfoService getClusterInfo() {
return systemInfo;
}

@Mock
public Frontend getMySelf() {
return thisFe;
}
};


TabletComputeNodeMapper tabletComputeNodeMapper = new TabletComputeNodeMapper();
tabletComputeNodeMapper.addComputeNode(1L, thisFe.getResourceIsolationGroup());
String otherResourceIsolationGroup = "someothergroup";
Expand Down Expand Up @@ -191,12 +191,12 @@ public TabletComputeNodeMapper internalTabletMapper() {
mgr.initDefaultWarehouse();

LakeTablet arbitraryTablet = new LakeTablet(1001L);
Assert.assertEquals(Set.of(1L), mgr.getAllComputeNodeIdsAssignToTablet(
WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet));
Assert.assertEquals(Set.of(1L),
mgr.getAllComputeNodeIdsAssignToTablet(WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet));

thisFe.setResourceIsolationGroup(otherResourceIsolationGroup);
Assert.assertEquals(Set.of(2L), mgr.getAllComputeNodeIdsAssignToTablet(
WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet));
Assert.assertEquals(Set.of(2L),
mgr.getAllComputeNodeIdsAssignToTablet(WarehouseManager.DEFAULT_WAREHOUSE_ID, arbitraryTablet));

// Check that WarehouseManager.getAllComputeNodeIdsAssignToTablet delegates to
// systemInfo.getAvailableComputeNodeIds.
Expand Down Expand Up @@ -365,7 +365,10 @@ public RunMode getCurrentRunMode() {
MaterializedIndex index = new MaterializedIndex(1, MaterializedIndex.IndexState.NORMAL);
ErrorReportException ex = Assert.assertThrows(ErrorReportException.class,
() -> scanNode.addScanRangeLocations(partition, partition, index, Collections.emptyList(), 1));
Assert.assertEquals("No alive backend or compute node in warehouse null.", ex.getMessage());
Assert.assertEquals(
"No alive backend or compute node in warehouse null. Also possible that there are no CN of the resource " +
"isolation group matching the FE.",
ex.getMessage());
}

private OlapScanNode newOlapScanNode() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,15 @@
import org.junit.Before;
import org.junit.Test;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class TabletComputeNodeMapperTest {
private Frontend thisFe;

@Before
public void setUp() {
thisFe = new Frontend();
Expand All @@ -66,6 +68,17 @@ public void setUp() {
public void tearDown() {
}

@Test
public void modifyComputeNodeEdgeCases() throws Exception {
Long arbitraryTablet = 9000L;
TabletComputeNodeMapper mapper = new TabletComputeNodeMapper();
Assert.assertEquals(0, mapper.numResourceIsolationGroups());
Assert.assertEquals(Collections.emptyList(), mapper.computeNodesForTablet(arbitraryTablet, 1, ""));
mapper.modifyComputeNode(1L, "", "");
Assert.assertEquals(1, mapper.numResourceIsolationGroups());
Assert.assertEquals(List.of(1L), mapper.computeNodesForTablet(arbitraryTablet, 1, ""));
}

@Test
public void testGroupManagementEdgeCase() throws Exception {
TabletComputeNodeMapper mapper = new TabletComputeNodeMapper();
Expand All @@ -75,7 +88,6 @@ public void testGroupManagementEdgeCase() throws Exception {
Assert.assertTrue(mapper.trackingNonDefaultResourceIsolationGroup());
}


@Test
public void testGroupManagement() throws Exception {
TabletComputeNodeMapper mapper = new TabletComputeNodeMapper();
Expand Down Expand Up @@ -143,7 +155,6 @@ public void testTabletToCnMapping() throws Exception {

Assert.assertEquals(2, mapper.numResourceIsolationGroups());


int tabletsToTry = 10000;
long[] tabletIdToGroup2Primary = new long[tabletsToTry];
long[] tabletIdToGroup2Backup = new long[tabletsToTry];
Expand Down