Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into ehigham/let-multiple-bindings
Browse files Browse the repository at this point in the history
  • Loading branch information
ehigham committed Nov 15, 2023
2 parents 11af6fe + e9e8e17 commit 39d2e4c
Show file tree
Hide file tree
Showing 86 changed files with 1,700 additions and 1,294 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ hail/python/pinned-requirements.txt: hail/python/hailtop/pinned-requirements.txt
hail/python/dev/pinned-requirements.txt: hail/python/pinned-requirements.txt hail/python/dev/requirements.txt
./generate-linux-pip-lockfile.sh hail/python/dev

benchmark/python/pinned-requirements.txt: benchmark/python/requirements.txt
benchmark/python/pinned-requirements.txt: benchmark/python/requirements.txt hail/python/pinned-requirements.txt hail/python/dev/pinned-requirements.txt
./generate-linux-pip-lockfile.sh benchmark/python

gear/pinned-requirements.txt: hail/python/pinned-requirements.txt hail/python/dev/pinned-requirements.txt hail/python/hailtop/pinned-requirements.txt gear/requirements.txt
Expand Down
103 changes: 80 additions & 23 deletions batch/batch/driver/instance_collection/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from hailtop import aiotools
from hailtop.utils import periodically_call, secret_alnum_string, time_msecs

from ...globals import INSTANCE_VERSION
from ...globals import INSTANCE_VERSION, live_instance_states
from ...instance_config import QuantifiedResource
from ..instance import Instance
from ..location import CloudLocationMonitor
Expand Down Expand Up @@ -70,7 +70,7 @@ def choose_location(
regions: List[str],
machine_type: str,
) -> str:
if self._default_region in regions and self.global_total_provisioned_cores_mcpu // 1000 < 1_000:
if self._default_region in regions and self.global_live_cores_mcpu // 1000 < 1_000:
regions = [self._default_region]
return self.location_monitor.choose_location(
cores, local_ssd_data_disk, data_disk_size_gb, preemptible, regions, machine_type
Expand All @@ -88,19 +88,25 @@ def name_instance(self):
return result

@property
def global_total_provisioned_cores_mcpu(self):
return sum(inst_coll.all_versions_provisioned_cores_mcpu for inst_coll in self.name_inst_coll.values())
def global_total_n_instances(self):
return sum(inst_coll.all_versions_total_n_instances for inst_coll in self.name_inst_coll.values())

@property
def global_current_version_live_free_cores_mcpu(self):
return sum(
inst_coll.current_worker_version_stats.live_free_cores_mcpu for inst_coll in self.name_inst_coll.values()
)
def global_total_cores_mcpu(self):
return sum(inst_coll.all_versions_total_cores_mcpu for inst_coll in self.name_inst_coll.values())

@property
def global_live_n_instances(self):
return sum(inst_coll.all_versions_live_n_instances for inst_coll in self.name_inst_coll.values())

@property
def global_live_cores_mcpu(self):
return sum(inst_coll.all_versions_live_cores_mcpu for inst_coll in self.name_inst_coll.values())

@property
def global_current_version_live_schedulable_free_cores_mcpu(self):
def global_current_version_active_schedulable_free_cores_mcpu(self):
return sum(
inst_coll.current_worker_version_stats.live_schedulable_free_cores_mcpu
inst_coll.current_worker_version_stats.active_schedulable_free_cores_mcpu
for inst_coll in self.name_inst_coll.values()
)

Expand All @@ -111,6 +117,25 @@ def global_n_instances_by_state(self) -> Counter[str]:
collections.Counter(),
)

@property
def global_cores_mcpu_by_state(self) -> Counter[str]:
return sum(
(inst_coll.all_versions_cores_mcpu_by_state for inst_coll in self.name_inst_coll.values()),
collections.Counter(),
)

@property
def global_schedulable_n_instances(self) -> int:
return sum(pool.current_worker_version_stats.n_instances_by_state['active'] for pool in self.pools.values())

@property
def global_schedulable_cores_mcpu(self) -> int:
return sum(pool.current_worker_version_stats.cores_mcpu_by_state['active'] for pool in self.pools.values())

@property
def global_schedulable_free_cores_mcpu(self) -> int:
return sum(pool.current_worker_version_stats.active_schedulable_free_cores_mcpu for pool in self.pools.values())

def get_inst_coll(self, inst_coll_name):
return self.name_inst_coll.get(inst_coll_name)

Expand Down Expand Up @@ -140,34 +165,31 @@ async def get_token_from_instance_name(self, name):
class InstanceCollectionStats:
def __init__(self):
self.n_instances_by_state = {'pending': 0, 'active': 0, 'inactive': 0, 'deleted': 0}
self.cores_mcpu_by_state = {'pending': 0, 'active': 0, 'inactive': 0, 'deleted': 0}

self.live_free_cores_mcpu_by_region: Dict[str, int] = collections.defaultdict(int)
# pending and active
self.live_free_cores_mcpu = 0
self.live_total_cores_mcpu = 0
self.live_schedulable_free_cores_mcpu = 0
self.active_schedulable_free_cores_mcpu = 0

def remove_instance(self, instance: Instance):
self.n_instances_by_state[instance.state] -= 1
self.cores_mcpu_by_state[instance.state] -= instance.cores_mcpu

if instance.state in ('pending', 'active'):
self.live_free_cores_mcpu -= instance.free_cores_mcpu_nonnegative
self.live_total_cores_mcpu -= instance.cores_mcpu
if instance.state in live_instance_states:
self.live_free_cores_mcpu_by_region[instance.region] -= instance.free_cores_mcpu_nonnegative

if instance.state == 'active':
self.live_schedulable_free_cores_mcpu -= instance.free_cores_mcpu_nonnegative
self.active_schedulable_free_cores_mcpu -= instance.free_cores_mcpu_nonnegative

def add_instance(self, instance: Instance):
self.n_instances_by_state[instance.state] += 1
self.cores_mcpu_by_state[instance.state] += instance.cores_mcpu

if instance.state in ('pending', 'active'):
self.live_free_cores_mcpu += instance.free_cores_mcpu_nonnegative
self.live_total_cores_mcpu += instance.cores_mcpu
if instance.state in live_instance_states:
self.live_free_cores_mcpu_by_region[instance.region] += instance.free_cores_mcpu_nonnegative

if instance.state == 'active':
self.live_schedulable_free_cores_mcpu += instance.free_cores_mcpu_nonnegative
self.active_schedulable_free_cores_mcpu += instance.free_cores_mcpu_nonnegative


class InstanceCollection:
Expand Down Expand Up @@ -220,8 +242,43 @@ def all_versions_instances_by_state(self):
)

@property
def all_versions_provisioned_cores_mcpu(self):
return sum(version_stats.live_total_cores_mcpu for version_stats in self.stats_by_instance_version.values())
def all_versions_cores_mcpu_by_state(self):
return sum(
(
collections.Counter(version_stats.cores_mcpu_by_state)
for version_stats in self.stats_by_instance_version.values()
),
collections.Counter(),
)

@property
def all_versions_total_n_instances(self):
return sum(
sum(version_stats.n_instances_by_state.values())
for version_stats in self.stats_by_instance_version.values()
)

@property
def all_versions_live_n_instances(self):
return sum(
version_stats.n_instances_by_state[state]
for version_stats in self.stats_by_instance_version.values()
for state in live_instance_states
)

@property
def all_versions_total_cores_mcpu(self):
return sum(
sum(version_stats.cores_mcpu_by_state.values()) for version_stats in self.stats_by_instance_version.values()
)

@property
def all_versions_live_cores_mcpu(self):
return sum(
version_stats.cores_mcpu_by_state[state]
for version_stats in self.stats_by_instance_version.values()
for state in live_instance_states
)

@property
def n_instances(self) -> int:
Expand Down
2 changes: 1 addition & 1 deletion batch/batch/driver/instance_collection/pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ async def create_instances(self):

log.info(
f'{self} n_instances {self.n_instances} {pool_stats.n_instances_by_state}'
f' free_cores {free_cores} live_free_cores {pool_stats.live_free_cores_mcpu / 1000}'
f' active_schedulable_free_cores {pool_stats.active_schedulable_free_cores_mcpu / 1000}'
f' full_job_queue_ready_cores {sum(ready_cores_mcpu_per_user.values()) / 1000}'
f' head_job_queue_ready_cores {sum(head_job_queue_ready_cores_mcpu.values()) / 1000}'
)
Expand Down
12 changes: 9 additions & 3 deletions batch/batch/driver/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,11 +448,17 @@ async def get_index(request, userdata):
'pools': inst_coll_manager.pools.values(),
'jpim': jpim,
'instance_id': app['instance_id'],
'n_instances_by_state': inst_coll_manager.global_n_instances_by_state,
'global_total_n_instances': inst_coll_manager.global_total_n_instances,
'global_total_cores_mcpu': inst_coll_manager.global_total_cores_mcpu,
'global_live_n_instances': inst_coll_manager.global_live_n_instances,
'global_live_cores_mcpu': inst_coll_manager.global_live_cores_mcpu,
'global_n_instances_by_state': inst_coll_manager.global_n_instances_by_state,
'global_cores_mcpu_by_state': inst_coll_manager.global_cores_mcpu_by_state,
'global_schedulable_n_instances': inst_coll_manager.global_schedulable_n_instances,
'global_schedulable_cores_mcpu': inst_coll_manager.global_schedulable_cores_mcpu,
'global_schedulable_free_cores_mcpu': inst_coll_manager.global_schedulable_free_cores_mcpu,
'instances': inst_coll_manager.name_instance.values(),
'ready_cores_mcpu': ready_cores_mcpu,
'total_provisioned_cores_mcpu': inst_coll_manager.global_total_provisioned_cores_mcpu,
'live_schedulable_free_cores_mcpu': inst_coll_manager.global_current_version_live_schedulable_free_cores_mcpu,
'frozen': app['frozen'],
'feature_flags': app['feature_flags'],
}
Expand Down
101 changes: 63 additions & 38 deletions batch/batch/driver/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -49,73 +49,98 @@ <h1>Feature Flags</h1>

<h1>Instance Collections</h1>

<h2>Pools</h2>
<table class="data-table" id="pools">
<table class="data-table" id="inst_colls">
<thead>
<tr>
<th>Name</th>
<th>Worker Type</th>
<th>Preemptible</th>
<th colspan="1">Name</th>
<th colspan="4">Instances</th>
<th></th>
<th colspan="4">Cores</th>
<th></th>
<th colspan="3">Schedulable Cores</th>
</tr>
<tr>
<th></th>
<th>Pending</th>
<th>Active</th>
<th>Inactive</th>
<th>Deleted</th>
<th></th>
<th>Pending</th>
<th>Active</th>
<th>Inactive</th>
<th>Deleted</th>
<th>Provisioned Cores</th>
<th>Schedulable Cores</th>
<th></th>
<th>Free</th>
<th>Total</th>
<th>% Free</th>
</tr>
</thead>
<tbody>
{% for pool in pools %}
<tr>
<td><a href="{{ base_path }}/inst_coll/pool/{{ pool.name }}">{{ pool.name }}</a></td>
<td>{{ pool.worker_type }}</td>
<td>{{ pool.preemptible }}</td>
<td class="numeric-cell">{{ pool.all_versions_instances_by_state['pending'] }}</td>
<td class="numeric-cell">{{ pool.all_versions_instances_by_state['active'] }}</td>
<td class="numeric-cell">{{ pool.all_versions_instances_by_state['inactive'] }}</td>
<td class="numeric-cell">{{ pool.all_versions_instances_by_state['deleted'] }}</td>
<td class="numeric-cell">{{ pool.all_versions_provisioned_cores_mcpu / 1000 }}</td>
<td class="numeric-cell">{{ pool.current_worker_version_stats.live_schedulable_free_cores_mcpu / 1000 }} / {{ pool.all_versions_provisioned_cores_mcpu / 1000 }} </td>
<td></td>
<td class="numeric-cell">{{ pool.all_versions_cores_mcpu_by_state['pending'] / 1000 }}</td>
<td class="numeric-cell">{{ pool.all_versions_cores_mcpu_by_state['active'] / 1000 }}</td>
<td class="numeric-cell">{{ pool.all_versions_cores_mcpu_by_state['inactive'] / 1000 }}</td>
<td class="numeric-cell">{{ pool.all_versions_cores_mcpu_by_state['deleted'] / 1000 }}</td>
<td></td>
<td class="numeric-cell">{{ pool.current_worker_version_stats.active_schedulable_free_cores_mcpu / 1000 }}</td>
<td class="numeric-cell">{{ pool.current_worker_version_stats.cores_mcpu_by_state['active'] / 1000 }}</td>
{% if pool.current_worker_version_stats.cores_mcpu_by_state['active'] != 0 %}
<td class="numeric-cell">{{ (pool.current_worker_version_stats.active_schedulable_free_cores_mcpu * 100 / pool.current_worker_version_stats.cores_mcpu_by_state['active']) | round(1)}}%</td>
{% else %}
<td class="numeric-cell"></td>
{% endif %}
</tr>
{% endfor %}
</tbody>
</table>

<h2>Job Private Instance Manager</h2>
<table class="data-table" id="job-private-inst-manager">
<thead>
<tr>
<th>Name</th>
<th>Pending</th>
<th>Active</th>
<th>Inactive</th>
<th>Deleted</th>
<th>Provisioned Cores</th>
<th>Schedulable Cores</th>
</tr>
</thead>
<tbody>
<tr>
<td><a href="{{ base_path }}/inst_coll/jpim">{{ jpim.name }}</a></td>
<td class="numeric-cell">{{ jpim.all_versions_instances_by_state['pending'] }}</td>
<td class="numeric-cell">{{ jpim.all_versions_instances_by_state['active'] }}</td>
<td class="numeric-cell">{{ jpim.all_versions_instances_by_state['inactive'] }}</td>
<td class="numeric-cell">{{ jpim.all_versions_instances_by_state['deleted'] }}</td>
<td class="numeric-cell">{{ jpim.all_versions_provisioned_cores_mcpu / 1000 }}</td>
<td class="numeric-cell">{{ jpim.current_worker_version_stats.live_schedulable_free_cores_mcpu / 1000 }} / {{ jpim.all_versions_provisioned_cores_mcpu / 1000 }} </td>
<td></td>
<td class="numeric-cell">{{ jpim.all_versions_cores_mcpu_by_state['pending'] / 1000 }}</td>
<td class="numeric-cell">{{ jpim.all_versions_cores_mcpu_by_state['active'] / 1000 }}</td>
<td class="numeric-cell">{{ jpim.all_versions_cores_mcpu_by_state['inactive'] / 1000 }}</td>
<td class="numeric-cell">{{ jpim.all_versions_cores_mcpu_by_state['deleted'] / 1000 }}</td>
<td></td>
<td class="numeric-cell"></td>
<td class="numeric-cell"></td>
<td class="numeric-cell"></td>
</tr>
</tbody>
<tfoot>
<tr>
<td>Total</td>
<td class="numeric-cell">{{ global_n_instances_by_state['pending'] }}</td>
<td class="numeric-cell">{{ global_n_instances_by_state['active'] }}</td>
<td class="numeric-cell">{{ global_n_instances_by_state['inactive'] }}</td>
<td class="numeric-cell">{{ global_n_instances_by_state['deleted'] }}</td>
<td></td>
<td class="numeric-cell">{{ global_cores_mcpu_by_state['pending'] / 1000 }}</td>
<td class="numeric-cell">{{ global_cores_mcpu_by_state['active'] / 1000 }}</td>
<td class="numeric-cell">{{ global_cores_mcpu_by_state['inactive'] / 1000 }}</td>
<td class="numeric-cell">{{ global_cores_mcpu_by_state['deleted'] / 1000 }}</td>
<td></td>
<td class="numeric-cell">{{ global_schedulable_free_cores_mcpu / 1000 }}</td>
<td class="numeric-cell">{{ global_schedulable_cores_mcpu / 1000 }}</td>
{% if global_schedulable_cores_mcpu != 0 %}
<td class="numeric-cell">{{ (global_schedulable_free_cores_mcpu * 100 / global_schedulable_cores_mcpu) | round(1)}}%</td>
{% else %}
<td class="numeric-cell"></td>
{% endif %}
</tr>
</tfoot>
</table>

<h1>Instances</h1>
<div class="attributes">
<div>Pending: {{ n_instances_by_state['pending'] }}</div>
<div>Active: {{ n_instances_by_state['active'] }}</div>
<div>Inactive: {{ n_instances_by_state['inactive'] }}</div>
<div>Deleted: {{ n_instances_by_state['deleted'] }}</div>
<div>Total provisioned cores: {{ total_provisioned_cores_mcpu / 1000 }}</div>
<div>Total schedulable cores: {{ live_schedulable_free_cores_mcpu / 1000 }} / {{ total_provisioned_cores_mcpu / 1000 }}</div>
</div>
<table class="data-table" id="instances">
<thead>
<tr>
Expand Down
43 changes: 27 additions & 16 deletions batch/batch/driver/templates/job_private.html
Original file line number Diff line number Diff line change
Expand Up @@ -52,24 +52,35 @@ <h2>Status</h2>
</div>
<table class="data-table" id="status">
<thead>
<tr>
<th>Pending</th>
<th>Active</th>
<th>Inactive</th>
<th>Deleted</th>
<th>Live Total Cores</th>
<th>Live Free Cores</th>
</tr>
<tr>
<th colspan="4">Instances</th>
<th></th>
<th colspan="4">Cores</th>
</tr>
<tr>
<th>Pending</th>
<th>Active</th>
<th>Inactive</th>
<th>Deleted</th>
<th></th>
<th>Pending</th>
<th>Active</th>
<th>Inactive</th>
<th>Deleted</th>
</tr>
</thead>
<tbody>
<tr>
<td class="numeric-cell">{{ jpim.current_worker_version_stats.n_instances_by_state['pending'] }}</td>
<td class="numeric-cell">{{ jpim.current_worker_version_stats.n_instances_by_state['active'] }}</td>
<td class="numeric-cell">{{ jpim.current_worker_version_stats.n_instances_by_state['inactive'] }}</td>
<td class="numeric-cell">{{ jpim.current_worker_version_stats.n_instances_by_state['deleted'] }}</td>
<td class="numeric-cell">{{ jpim.current_worker_version_stats.live_total_cores_mcpu / 1000 }}</td>
<td class="numeric-cell">{{ jpim.current_worker_version_stats.live_free_cores_mcpu / 1000 }}</td>
</tr>
<tr>
<td class="numeric-cell">{{ jpim.all_versions_instances_by_state['pending'] }}</td>
<td class="numeric-cell">{{ jpim.all_versions_instances_by_state['active'] }}</td>
<td class="numeric-cell">{{ jpim.all_versions_instances_by_state['inactive'] }}</td>
<td class="numeric-cell">{{ jpim.all_versions_instances_by_state['deleted'] }}</td>
<td></td>
<td class="numeric-cell">{{ jpim.all_versions_cores_mcpu_by_state['pending'] / 1000 }}</td>
<td class="numeric-cell">{{ jpim.all_versions_cores_mcpu_by_state['active'] / 1000 }}</td>
<td class="numeric-cell">{{ jpim.all_versions_cores_mcpu_by_state['inactive'] / 1000 }}</td>
<td class="numeric-cell">{{ jpim.all_versions_cores_mcpu_by_state['deleted'] / 1000 }}</td>
</tr>
</tbody>
</table>

Expand Down
Loading

0 comments on commit 39d2e4c

Please sign in to comment.