Skip to content

Commit

Permalink
Merge pull request #169 from valory-xyz/feat/healthcheck
Browse files Browse the repository at this point in the history
feat: add healthcheck
  • Loading branch information
jmoreira-valory authored Jun 20, 2024
2 parents f7fc8c6 + 8ab47f1 commit b23a3fe
Show file tree
Hide file tree
Showing 11 changed files with 324 additions and 43 deletions.
29 changes: 0 additions & 29 deletions .github/workflows/e2e.yml

This file was deleted.

2 changes: 1 addition & 1 deletion electron/install.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ const { BrewScript } = require('./scripts');
* - use "" (nothing as a suffix) for latest release candidate, for example "0.1.0rc26"
* - use "alpha" for alpha release, for example "0.1.0rc26-alpha"
*/
const OlasMiddlewareVersion = '0.1.0rc50';
const OlasMiddlewareVersion = '0.1.0rc54';
const OperateDirectory = `${os.homedir()}/.operate`;
const VenvDir = `${OperateDirectory}/venv`;
const TempDir = `${OperateDirectory}/temp`;
Expand Down
2 changes: 1 addition & 1 deletion frontend/constants/serviceTemplates.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { ServiceTemplate } from '@/client';
export const SERVICE_TEMPLATES: ServiceTemplate[] = [
{
name: 'Trader Agent',
hash: 'bafybeieg45wcjcwd5znuwpjcp5scfhgdqwpfq43pzaare6nwvmy5bb56cm',
hash: 'bafybeicihxhw2djlsuoy2eji3g2tmasipfqe3rwsfvzbd2j3tvphqvp7aa',
description: 'Trader agent for omen prediction markets',
image:
'https://operate.olas.network/_next/image?url=%2Fimages%2Fprediction-agent.png&w=3840&q=75',
Expand Down
38 changes: 33 additions & 5 deletions operate/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def create_app( # pylint: disable=too-many-locals, unused-argument, too-many-st
logger = setup_logger(name="operate")
operate = OperateApp(home=home, logger=logger)
funding_jobs: t.Dict[str, asyncio.Task] = {}
healthcheck_jobs: t.Dict[str, asyncio.Task] = {}

# Create shutdown endpoint
shutdown_endpoint = uuid.uuid4().hex
Expand All @@ -171,6 +172,22 @@ def schedule_funding_job(
)
)

def schedule_healthcheck_job(
    service: str,
) -> None:
    """
    Start (or restart) the background healthcheck task for a service.

    A task already registered under the same service hash is cancelled
    first; the freshly created task is then stored in ``healthcheck_jobs``
    under that hash. Requires a running asyncio event loop.
    """
    logger.info(f"Starting healthcheck job for {service}")

    already_scheduled = service in healthcheck_jobs
    if already_scheduled:
        logger.info(f"Cancelling existing healthcheck_jobs job for {service}")
        cancel_healthcheck_job(service=service)

    # Resolve the loop before building the coroutine so that a missing loop
    # fails early without leaving an un-awaited coroutine behind.
    event_loop = asyncio.get_running_loop()
    manager = operate.service_manager()
    job = manager.healthcheck_job(hash=service)
    healthcheck_jobs[service] = event_loop.create_task(job)

def cancel_funding_job(service: str) -> None:
"""Cancel funding job."""
if service not in funding_jobs:
Expand All @@ -179,11 +196,11 @@ def cancel_funding_job(service: str) -> None:
if not status:
logger.info(f"Funding job cancellation for {service} failed")

def pause_all_services_on_startup():
logger.info(f"stopping services on startup")
services = [i["hash"] for i in operate.service_manager().json]
def pause_all_services_on_startup() -> None:
logger.info("Stopping services on startup...")
service_hashes = [i["hash"] for i in operate.service_manager().json]

for service in services:
for service in service_hashes:
if not operate.service_manager().exists(service=service):
continue
deployment = operate.service_manager().create_or_load(service).deployment
Expand All @@ -193,7 +210,15 @@ def pause_all_services_on_startup():
deployment.stop(force=True)
logger.info(f"Cancelling funding job for {service}")
cancel_funding_job(service=service)
logger.info(f"stopping services on startup: done")
logger.info("Stopping services on startup done.")

def cancel_healthcheck_job(service: str) -> None:
    """
    Request cancellation of the healthcheck task registered for a service.

    Does nothing when no task is registered under the given service hash.
    The task entry itself is left in ``healthcheck_jobs``.
    """
    if service in healthcheck_jobs:
        cancelled = healthcheck_jobs[service].cancel()
        if not cancelled:
            logger.info(f"Healthcheck job cancellation for {service} failed")

# On backend startup no agents are expected to be running, so force-stop every known service.
pause_all_services_on_startup()
Expand Down Expand Up @@ -525,6 +550,7 @@ async def _create_services(request: Request) -> JSONResponse:
manager.fund_service(hash=service.hash)
manager.deploy_service_locally(hash=service.hash)
schedule_funding_job(service=service.hash)
schedule_healthcheck_job(service=service.hash)

return JSONResponse(
content=operate.service_manager().create_or_load(hash=service.hash).json
Expand All @@ -548,6 +574,7 @@ async def _update_services(request: Request) -> JSONResponse:
manager.fund_service(hash=service.hash)
manager.deploy_service_locally(hash=service.hash)
schedule_funding_job(service=service.hash)
schedule_healthcheck_job(service=service.hash)

return JSONResponse(content=service.json)

Expand Down Expand Up @@ -657,6 +684,7 @@ async def _start_service_locally(request: Request) -> JSONResponse:
manager.fund_service(hash=service)
manager.deploy_service_locally(hash=service, force=True)
schedule_funding_job(service=service)
schedule_healthcheck_job(service=service.hash)
return JSONResponse(content=manager.create_or_load(service).deployment)

@app.post("/api/services/{service}/deployment/stop")
Expand Down
39 changes: 39 additions & 0 deletions operate/services/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import aiohttp # type: ignore
from aea.helpers.base import IPFSHash
from aea.helpers.logging import setup_logger
from autonomy.chain.base import registry_contracts
Expand Down Expand Up @@ -56,6 +57,18 @@
KEYS_JSON = "keys.json"
DOCKER_COMPOSE_YAML = "docker-compose.yaml"
SERVICE_YAML = "service.yaml"
HTTP_OK = 200


async def check_service_health(
    url: str = "http://localhost:8716/healthcheck",
    timeout: float = 10.0,
) -> bool:
    """
    Query the healthcheck endpoint and report whether the service is healthy.

    The service counts as healthy only when the endpoint answers with
    ``HTTP_OK`` and its JSON body is an object carrying a truthy
    ``is_transitioning_fast`` entry.

    :param url: healthcheck endpoint to query.
    :param timeout: total time budget for the request, in seconds.
    :return: True when the service is healthy, False otherwise.

    Connection problems, timeouts and undecodable bodies are not converted
    into ``False``; they propagate to the caller as exceptions.
    """
    # Bound the request so a hung endpoint cannot stall the caller's loop.
    request_timeout = aiohttp.ClientTimeout(total=timeout)
    async with aiohttp.ClientSession(timeout=request_timeout) as session:
        async with session.get(url) as resp:
            # Check the status first: an error response is unhealthy no matter
            # what its body holds, and the body may not even be JSON.
            if resp.status != HTTP_OK:
                return False
            response_json = await resp.json()

    if not isinstance(response_json, dict):
        return False
    # Coerce so the declared ``bool`` return type always holds.
    return bool(response_json.get("is_transitioning_fast", False))


class ServiceManager:
Expand Down Expand Up @@ -901,6 +914,32 @@ async def funding_job(
)
await asyncio.sleep(60)

async def healthcheck_job(
    self,
    hash: str,
) -> None:
    """
    Run the background healthcheck loop for a locally deployed service.

    Polls ``check_service_health`` every 30 seconds, forever. Once the check
    has reported unhealthy on 4 consecutive polls, the service is stopped and
    deployed again and the failure streak starts over. An exception raised
    while checking or restarting is logged and does not end the loop; a
    failing check call leaves the current streak unchanged.

    :param hash: hash of the service to restart when it is unhealthy.
    """
    max_failed_checks = 4  # consecutive unhealthy polls that trigger a restart
    poll_interval = 30  # seconds to wait between polls
    failed_health_checks = 0

    while True:
        try:
            # Check the service health
            healthy = await check_service_health()
            if healthy:
                failed_health_checks = 0
            else:
                failed_health_checks += 1

            if failed_health_checks >= max_failed_checks:
                # Start a new streak before restarting. Otherwise every later
                # unhealthy poll (or a failed restart) would trigger another
                # stop/deploy cycle while the service is still coming up.
                failed_health_checks = 0
                # NOTE(review): both calls are made without ``await``, so they
                # appear to run synchronously inside this coroutine - confirm
                # that blocking here is acceptable.
                self.stop_service_locally(hash=hash)
                self.deploy_service_locally(hash=hash)

        except Exception:  # pylint: disable=broad-except
            logging.info(
                f"Error occured while checking the service health\n{traceback.format_exc()}"
            )
        await asyncio.sleep(poll_interval)

def deploy_service_locally(self, hash: str, force: bool = True) -> Deployment:
"""
Deploy service locally
Expand Down
2 changes: 1 addition & 1 deletion operate/services/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,7 +679,7 @@ def swap( # pylint: disable=too-many-arguments,too-many-locals
key_file = Path(temp_dir, "key.txt")
key_file.write_text(owner_key, encoding="utf-8")
owner_crypto = EthereumCrypto(private_key_path=str(key_file))
owner_cryptos: list[EthereumCrypto] = [owner_crypto]
owner_cryptos: t.List[EthereumCrypto] = [owner_crypto]
owners = [
manager.ledger_api.api.to_checksum_address(owner_crypto.address)
for owner_crypto in owner_cryptos
Expand Down
2 changes: 1 addition & 1 deletion operate/services/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,7 +822,7 @@ def start(self, use_docker: bool = False) -> None:
self.status = DeploymentStatus.DEPLOYED
self.store()

def stop(self, use_docker: bool = False, force: bool=False) -> None:
def stop(self, use_docker: bool = False, force: bool = False) -> None:
"""Stop the deployment."""
if self.status != DeploymentStatus.DEPLOYED and not force:
return
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"main": "electron/main.js",
"name": "olas-operate-app",
"productName": "Pearl",
"version": "0.1.0-rc50",
"version": "0.1.0-rc54",
"dependencies": {
"@ant-design/cssinjs": "^1.18.4",
"@ant-design/icons": "^5.3.0",
Expand Down
Loading

0 comments on commit b23a3fe

Please sign in to comment.