Merge branch '6/edge' into status-reporting-shard-side
MiaAltieri committed Oct 19, 2023
2 parents 706dc40 + 266c07d commit 219b44b
Showing 4 changed files with 184 additions and 69 deletions.
33 changes: 15 additions & 18 deletions .github/workflows/ci.yaml
@@ -50,23 +50,20 @@ jobs:
run: python3 -m pip install tox
- name: Run tests
run: tox run -e unit
# current lib check is incompatible with bumped APIs - perform this manually. After merge +
# publish. It should be possible to comment this back in and for the workflow to resume normally.
# see issue: https://github.com/canonical/charming-actions/issues/97
# lib-check:
# name: Check libraries
# runs-on: ubuntu-latest
# timeout-minutes: 5
# steps:
# - name: Checkout
# uses: actions/checkout@v3
# with:
# fetch-depth: 0
# - name: Check libs
# uses: canonical/charming-actions/[email protected]
# with:
# credentials: "${{ secrets.CHARMHUB_TOKEN }}" # FIXME: current token will expire in 2023-07-04
# github-token: "${{ secrets.GITHUB_TOKEN }}"
lib-check:
name: Check libraries
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Check libs
uses: canonical/charming-actions/[email protected]
with:
credentials: "${{ secrets.CHARMHUB_TOKEN }}" # FIXME: current token will expire in 2023-07-04
github-token: "${{ secrets.GITHUB_TOKEN }}"

build:
name: Build charms
@@ -88,7 +85,7 @@ jobs:
needs:
- lint
- unit-test
# - lib-check
- lib-check
- build
runs-on: ubuntu-latest
timeout-minutes: 120
14 changes: 5 additions & 9 deletions .github/workflows/release.yaml
@@ -26,15 +26,11 @@ jobs:
uses: actions/checkout@v3
with:
fetch-depth: 0
# current lib publish is incompatible with bumped APIs - perform this manually. After merge +
# publish. It should be possible to comment this back in and for the workflow to resume
# normally.
# see issue: https://github.com/canonical/charming-actions/issues/97
# - name: Release any bumped charm libs
# uses: canonical/charming-actions/[email protected]
# with:
# credentials: "${{ secrets.CHARMHUB_TOKEN }}"
# github-token: "${{ secrets.GITHUB_TOKEN }}"
- name: Release any bumped charm libs
uses: canonical/charming-actions/[email protected]
with:
credentials: "${{ secrets.CHARMHUB_TOKEN }}"
github-token: "${{ secrets.GITHUB_TOKEN }}"
- name: Upload charm to charmhub
uses: canonical/charming-actions/[email protected]
with:
165 changes: 146 additions & 19 deletions lib/charms/mongodb/v1/mongos.py
@@ -4,7 +4,7 @@

import logging
from dataclasses import dataclass
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import quote_plus

from charms.mongodb.v0.mongodb import NotReadyError
@@ -21,7 +21,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 1
LIBPATCH = 2

# path to store mongodb keyFile
logger = logging.getLogger(__name__)
@@ -66,8 +66,8 @@ def uri(self):
)


class RemovePrimaryShardError(Exception):
"""Raised when there is an attempt to remove the primary shard."""
class NotEnoughSpaceError(Exception):
"""Raised when there isn't enough space to movePrimary."""


class ShardNotInClusterError(Exception):
@@ -78,6 +78,14 @@ class ShardNotPlannedForRemovalError(Exception):
"""Raised when it is expected that a shard is planned for removal, but it is not."""


class NotDrainedError(Exception):
"""Raised when a shard is still being drained."""


class BalancerNotEnabledError(Exception):
"""Raised when balancer process is not enabled."""


class MongosConnection:
"""In this class we create connection object to Mongos.
@@ -167,34 +175,74 @@ def add_shard(self, shard_name, shard_hosts, shard_port=Config.MONGODB_PORT):
logger.info("Adding shard %s", shard_name)
self.client.admin.command("addShard", shard_url)

def remove_shard(self, shard_name: str) -> None:
"""Removes shard from the cluster.
def pre_remove_checks(self, shard_name: str) -> None:
"""Performs a series of checks for removing a shard from the cluster.
Raises:
ConfigurationError, OperationFailure, NotReadyError,
RemovePrimaryShardError
Raises:
ConfigurationError, OperationFailure, NotReadyError, ShardNotInClusterError,
BalancerNotEnabledError
"""
sc_status = self.client.admin.command("listShards")
if shard_name not in self.get_shard_members():
logger.info("Shard to remove is not in cluster.")
raise ShardNotInClusterError(f"Shard {shard_name} could not be removed")

# It is necessary to call removeShard multiple times on a shard to guarantee removal.
# Allow re-removal of shards that are currently draining.
sc_status = self.client.admin.command("listShards")
if self._is_any_draining(sc_status, ignore_shard=shard_name):
cannot_remove_shard = (
f"cannot remove shard {shard_name} from cluster, another shard is draining"
)
logger.error(cannot_remove_shard)
raise NotReadyError(cannot_remove_shard)

databases_using_shard_as_primary = self.get_databases_for_shard(shard_name)
if databases_using_shard_as_primary:
cannot_remove_primary_shard = f"These databases: {', '.join(databases_using_shard_as_primary)}, use Shard {shard_name} is a primary shard, cannot remove shard."
logger.error(cannot_remove_primary_shard)
raise RemovePrimaryShardError(cannot_remove_primary_shard)
# check if the balancer is enabled (sh.getBalancerState())
balancer_state = self.client.admin.command("balancerStatus")
if balancer_state["mode"] != "off":
logger.info("Balancer is enabled, ready to remove shard.")
return

# starting the balancer doesn't guarantee that it is running; wait until it starts up.
logger.info("Balancer process is not running, enabling it.")
self.client.admin.command("balancerStart")
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True):
with attempt:
balancer_state = self.client.admin.command("balancerStatus")
if balancer_state["mode"] == "off":
raise BalancerNotEnabledError

def remove_shard(self, shard_name: str) -> None:
"""Removes shard from the cluster.
Raises:
ConfigurationError, OperationFailure, NotReadyError, NotEnoughSpaceError,
ShardNotInClusterError, BalancerNotEnabledError
"""
self.pre_remove_checks(shard_name)

# remove shard, process removal status, & check if fully removed
logger.info("Attempting to remove shard %s", shard_name)
removal_info = self.client.admin.command("removeShard", shard_name)
self._log_removal_info(removal_info, shard_name)
remaining_chunks = self._retrieve_remaining_chunks(removal_info)
if remaining_chunks:
logger.info("Waiting for all chunks to be drained from %s.", shard_name)
raise NotDrainedError()

# process removal status
# MongoDB docs say to movePrimary only after all chunks have been drained from the shard.
logger.info("All chunks drained from shard: %s", shard_name)
databases_using_shard_as_primary = self.get_databases_for_shard(shard_name)
if databases_using_shard_as_primary:
logger.info(
"These databases: %s use Shard %s is a primary shard, moving primary.",
", ".join(databases_using_shard_as_primary),
shard_name,
)
self._move_primary(databases_using_shard_as_primary, old_primary=shard_name)

# MongoDB docs say to re-run removeShard after running movePrimary
logger.info("Removing shard: %s after moving primary", shard_name)
removal_info = self.client.admin.command("removeShard", shard_name)
self._log_removal_info(removal_info, shard_name)

def _is_shard_draining(self, shard_name: str) -> bool:
@@ -272,9 +320,7 @@ def _hostname_from_hostport(hostname: str) -> str:

def _log_removal_info(self, removal_info, shard_name):
"""Logs removal information for a shard removal."""
remaining_chunks = (
removal_info["remaining"]["chunks"] if "remaining" in removal_info else "None"
)
remaining_chunks = self._retrieve_remaining_chunks(removal_info)
dbs_to_move = (
removal_info["dbsToMove"]
if "dbsToMove" in removal_info and removal_info["dbsToMove"] != []
@@ -317,3 +363,84 @@ def is_shard_aware(self, shard_name: str) -> bool:
return shard["state"] == 1

return False

def _retrieve_remaining_chunks(self, removal_info) -> int:
"""Parses the remaining chunks to remove from removeShard command."""
return removal_info["remaining"]["chunks"] if "remaining" in removal_info else 0

def _move_primary(self, databases_to_move: List[str], old_primary: str) -> None:
"""Moves all the provided databases to a new primary.
Raises:
NotEnoughSpaceError, ConfigurationError, OperationFailure
"""
for database_name in databases_to_move:
db_size = self.get_db_size(database_name, old_primary)
new_shard, avail_space = self.get_shard_with_most_available_space(
shard_to_ignore=old_primary
)
if db_size > avail_space:
no_space_on_new_primary = (
f"Cannot move primary for database: {database_name}, new shard: {new_shard}",
f"does not have enough space. {db_size} > {avail_space}",
)
logger.error(no_space_on_new_primary)
raise NotEnoughSpaceError(no_space_on_new_primary)

# From MongoDB Docs: After starting movePrimary, do not perform any read or write
# operations against any unsharded collection in that database until the command
# completes.
logger.info(
"Moving primary on %s database to new primary: %s. Do NOT write to %s database.",
database_name,
new_shard,
database_name,
)
# This command does not return until MongoDB completes moving all data. This can take
# a long time.
self.client.admin.command("movePrimary", database_name, to=new_shard)
logger.info(
"Successfully moved primary on %s database to new primary: %s",
database_name,
new_shard,
)

def get_db_size(self, database_name, primary_shard) -> int:
"""Returns the size of a DB on a given shard in bytes."""
database = self.client[database_name]
db_stats = database.command("dbStats")

# sharded databases are spread across multiple shards, find the amount of storage used on
# the primary shard
for shard_name, shard_storage_info in db_stats["raw"].items():
# shard names are of the format `shard-one/10.61.64.212:27017`
shard_name = shard_name.split("/")[0]
if shard_name != primary_shard:
continue

return shard_storage_info["storageSize"]

return 0

def get_shard_with_most_available_space(self, shard_to_ignore) -> Tuple[str, int]:
"""Returns the shard in the cluster with the most available space and the space in bytes.
The algorithm is similar to the one mongo uses in `selectShardForNewDatabase`:
https://github.com/mongodb/mongo/blob/v6.0/src/mongo/db/s/config/sharding_catalog_manager_database_operations.cpp#L68-L91
"""
candidate_shard = None
candidate_free_space = -1
available_storage = self.client.admin.command("dbStats", freeStorage=1)

for shard_name, shard_storage_info in available_storage["raw"].items():
# shard names are of the format `shard-one/10.61.64.212:27017`
shard_name = shard_name.split("/")[0]
if shard_name == shard_to_ignore:
continue

current_free_space = shard_storage_info["freeStorageSize"]
if current_free_space > candidate_free_space:
candidate_shard = shard_name
candidate_free_space = current_free_space

return (candidate_shard, candidate_free_space)
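The space-selection helper above walks the per-shard `raw` entries of a `dbStats` response, strips the host list from each shard name, and keeps the shard with the largest `freeStorageSize`. A toy walkthrough of that selection loop against a hand-written response shape — the shard names and byte counts here are invented for the example:

```python
# Toy walkthrough of the selection logic in get_shard_with_most_available_space.
# The response shape mirrors dbStats output; all names and sizes are invented.
available_storage = {
    "raw": {
        "shard-one/10.61.64.212:27017": {"freeStorageSize": 5_000_000},
        "shard-two/10.61.64.213:27017": {"freeStorageSize": 9_000_000},
    }
}

shard_to_ignore = "shard-one"  # e.g. the shard currently being removed
candidate_shard, candidate_free_space = None, -1
for shard_name, shard_storage_info in available_storage["raw"].items():
    shard_name = shard_name.split("/")[0]  # "shard-one/10.61...:27017" -> "shard-one"
    if shard_name == shard_to_ignore:
        continue
    if shard_storage_info["freeStorageSize"] > candidate_free_space:
        candidate_shard = shard_name
        candidate_free_space = shard_storage_info["freeStorageSize"]

assert (candidate_shard, candidate_free_space) == ("shard-two", 9_000_000)
```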
41 changes: 18 additions & 23 deletions lib/charms/mongodb/v1/shards_interface.py
@@ -20,8 +20,9 @@
from charms.mongodb.v1.helpers import KEY_FILE
from charms.mongodb.v1.mongodb_provider import LEGACY_REL_NAME, REL_NAME
from charms.mongodb.v1.mongos import (
BalancerNotEnabledError,
MongosConnection,
RemovePrimaryShardError,
NotDrainedError,
ShardNotInClusterError,
ShardNotPlannedForRemovalError,
)
@@ -44,7 +45,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 1
LIBPATCH = 2
KEYFILE_KEY = "key-file"
HOSTS_KEY = "host"
OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
@@ -55,10 +56,6 @@ class RemoveLastShardError(Exception):
"""Raised when there is an attempt to remove the last shard in the cluster."""


class NotDrainedError(Exception):
"""Raised when a shard is still in the cluster after removal."""


class ShardingProvider(Object):
"""Manage relations between the config server and the shard, on the config-server's side."""

@@ -179,7 +176,7 @@ def _on_relation_event(self, event):
return

try:
logger.info("Adding shards not present in cluster.")
logger.info("Adding/Removing shards not present in cluster.")
self.add_shards(departed_relation_id)
self.remove_shards(departed_relation_id)
except NotDrainedError:
Expand All @@ -201,12 +198,10 @@ def _on_relation_event(self, event):

logger.error("Deferring _on_relation_event for shards interface since: error=%r", e)
event.defer()
except RemovePrimaryShardError:
cannot_proceed = (
"Attempt made to remove a primary shard, do not permit other hooks to execute."
)
logger.error(cannot_proceed)
raise
except BalancerNotEnabledError:
logger.error("Deferring on _relation_broken_event, balancer is not enabled.")
event.defer()
return
except (PyMongoError, NotReadyError) as e:
logger.error("Deferring _on_relation_event for shards interface since: error=%r", e)
event.defer()
@@ -251,15 +246,14 @@ def remove_shards(self, departed_shard_id):
relation_shards = self._get_shards_from_relations(departed_shard_id)

for shard in cluster_shards - relation_shards:
self.charm.unit.status = MaintenanceStatus(f"Draining shard {shard}")
logger.info("Attempting to removing shard: %s", shard)
mongo.remove_shard(shard)
logger.info("Shard: %s, is now draining", shard)

if shard in mongo.get_shard_members():
shard_draining_message = f"shard {shard} still exists in cluster after removal, it is still draining."
logger.info(shard_draining_message)
raise NotDrainedError(shard_draining_message)
try:
self.charm.unit.status = MaintenanceStatus(f"Draining shard {shard}")
logger.info("Attempting to removing shard: %s", shard)
mongo.remove_shard(shard)
except ShardNotInClusterError:
logger.info(
"Shard to remove is not in the sharded cluster; it has already been removed."
)

def update_credentials(self, key: str, value: str) -> None:
"""Sends new credentials, for a key value pair across all shards."""
@@ -404,6 +398,7 @@ def pass_hook_checks(self, event):
return False

if not self.charm.db_initialised:
logger.info("Deferring %s. db is not initialised.", type(event))
event.defer()
return False

@@ -427,7 +422,7 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
# check if were scaling down and add a log message
if self.charm.is_scaling_down(event.relation.id):
logger.info(
"Relation broken event occurring due to scale down, do not proceed to remove users."
"Relation broken event occurring due to scale down, do not proceed to remove shards."
)
return

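Taken together, the two libraries give shard removal a drain-aware lifecycle: `pre_remove_checks` validates cluster membership and enables the balancer, `removeShard` starts draining, `NotDrainedError` tells the charm to defer and retry, and once the drain finishes `movePrimary` plus a second `removeShard` complete the removal. A minimal sketch of a caller driving this loop outside of charm hooks — it assumes `MongosConnection` works as a context manager (as the related charm-lib connections do), and `config` plus the retry cadence are illustrative, not values from this commit:

```python
# Sketch only: repeatedly call remove_shard until the shard finishes draining.
# Assumes MongosConnection(config) is a context manager; `config` and the retry
# cadence are illustrative assumptions, not values taken from this commit.
from charms.mongodb.v1.mongos import MongosConnection, NotDrainedError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed


@retry(
    retry=retry_if_exception_type(NotDrainedError),
    stop=stop_after_attempt(20),
    wait=wait_fixed(30),
    reraise=True,
)
def remove_shard_when_drained(config, shard_name: str) -> None:
    """Re-run remove_shard until it stops raising NotDrainedError."""
    with MongosConnection(config) as mongo:
        # remove_shard performs pre_remove_checks, issues removeShard, raises
        # NotDrainedError while chunks remain, and once drained moves any
        # primary databases before issuing removeShard a final time.
        mongo.remove_shard(shard_name)
```

In the charm itself the same loop is realized through Juju's event deferral: both NotDrainedError and BalancerNotEnabledError cause the hook to defer, so the next dispatch retries the removal.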
