Skip to content

Commit

Permalink
prov/shm: Add unmap_region function
Browse files Browse the repository at this point in the history
This function mainly covers the niche case in smr_progress_connreq where
a peer is added to the map with its region still needing to be mapped, and
only after mapping it do we discover that the newly mapped peer's process
has died. In this case we need to unmap the peer's region and free any
resources that were opened for communicating with that peer.

Signed-off-by: Zach Dworkin <[email protected]>
  • Loading branch information
zachdworkin committed Sep 4, 2024
1 parent ad3a40c commit dfc9e23
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 26 deletions.
7 changes: 1 addition & 6 deletions prov/shm/src/smr_progress.c
Original file line number Diff line number Diff line change
Expand Up @@ -891,13 +891,8 @@ static void smr_progress_connreq(struct smr_ep *ep, struct smr_cmd *cmd)
if (peer_smr->pid != (int) cmd->msg.hdr.data) {
/* TODO track and update/complete in error any transfers
* to or from old mapping
*
* TODO create smr_unmap_region
* this needs to close peer_smr->map->peers[idx].pid_fd
* This case will also return an unmapped region because the idx
* is valid but the region was unmapped
*/
munmap(peer_smr, peer_smr->total_size);
smr_unmap_region(&smr_prov, ep->region->map, idx);
smr_map_to_region(&smr_prov, ep->region->map, idx);
peer_smr = smr_peer_region(ep->region, idx);
}
Expand Down
57 changes: 37 additions & 20 deletions prov/shm/src/smr_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,42 @@ void smr_map_to_endpoint(struct smr_region *region, int64_t id)
return;
}

/*
 * Tear down the mapping of a single peer's shared-memory region.
 *
 * With map->lock held for the whole operation, this:
 *   1. calls smr_unmap_from_endpoint() for every endpoint bound to the AV
 *      that embeds `map`,
 *   2. when SMR_FLAG_HMEM_ENABLED is set, unregisters the region from HMEM
 *      and closes the peer's pid_fd (if one is open),
 *   3. munmap()s the region and clears map->peers[peer_id].region.
 *
 * If the peer has no region mapped the call is a no-op. This matters because
 * smr_map_del() invokes this function unconditionally, including for peers
 * that were inserted but never mapped.
 *
 * @param prov     provider handle, used only for logging
 * @param map      peer map that owns the peer entry
 * @param peer_id  index of the peer in map->peers[]
 *
 * The function acquires map->lock itself; callers must not already hold it.
 */
void smr_unmap_region(const struct fi_provider *prov, struct smr_map *map,
		      int64_t peer_id)
{
	struct smr_region *peer_region;
	struct smr_peer *peer;
	struct util_ep *util_ep;
	struct smr_ep *smr_ep;
	struct smr_av *av;
	int ret;

	ofi_spin_lock(&map->lock);

	/* Sample the region pointer only once the lock is held so that a
	 * concurrent map/unmap of the same peer cannot hand us a stale value.
	 */
	peer = &map->peers[peer_id];
	peer_region = peer->region;
	if (!peer_region) {
		/* NOTE(review): assumes a peer without a mapped region has
		 * nothing attached to any endpoint either -- confirm against
		 * smr_unmap_from_endpoint().
		 */
		goto unlock;
	}

	av = container_of(map, struct smr_av, smr_map);
	dlist_foreach_container(&av->util_av.ep_list, struct util_ep, util_ep,
				av_entry) {
		smr_ep = container_of(util_ep, struct smr_ep, util_ep);
		smr_unmap_from_endpoint(smr_ep->region, peer_id);
	}

	if (map->flags & SMR_FLAG_HMEM_ENABLED) {
		/* A failed unregister is logged but does not stop the
		 * teardown; the mapping is released regardless.
		 */
		ret = ofi_hmem_host_unregister(peer_region);
		if (ret)
			FI_WARN(prov, FI_LOG_EP_CTRL,
				"unable to unregister shm with iface\n");

		if (peer->pid_fd != -1) {
			close(peer->pid_fd);
			peer->pid_fd = -1;
		}
	}

	munmap(peer_region, peer_region->total_size);
	peer->region = NULL;

unlock:
	ofi_spin_unlock(&map->lock);
}

void smr_unmap_from_endpoint(struct smr_region *region, int64_t id)
{
struct smr_region *peer_smr;
Expand Down Expand Up @@ -550,33 +586,14 @@ void smr_map_del(struct smr_map *map, int64_t id)

assert(id >= 0 && id < SMR_MAX_PEERS);

pthread_mutex_lock(&ep_list_lock);
entry = dlist_find_first_match(&ep_name_list, smr_match_name,
smr_no_prefix(map->peers[id].peer.name));
pthread_mutex_unlock(&ep_list_lock);

smr_unmap_region(&smr_prov, map, id);
ofi_spin_lock(&map->lock);
(void) ofi_rbmap_find_delete(&map->rbmap,
(void *) map->peers[id].peer.name);

map->peers[id].fiaddr = FI_ADDR_NOTAVAIL;
map->peers[id].peer.id = -1;
map->num_peers--;

if (!map->peers[id].region)
goto unlock;

if (!entry) {
if (map->flags & SMR_FLAG_HMEM_ENABLED) {
if (map->peers[id].pid_fd != -1)
close(map->peers[id].pid_fd);

(void) ofi_hmem_host_unregister(map->peers[id].region);
}
munmap(map->peers[id].region, map->peers[id].region->total_size);
map->peers[id].region = NULL;
}
unlock:
ofi_spin_unlock(&map->lock);
}

Expand Down
2 changes: 2 additions & 0 deletions prov/shm/src/smr_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,8 @@ void smr_cleanup(void);
int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map,
int64_t id);
void smr_map_to_endpoint(struct smr_region *region, int64_t id);
/*
 * Tear down the mapping of peer `id`: detach the peer from every endpoint
 * bound to the AV that embeds `map`, then unmap the peer's shared-memory
 * region and clear it from the map. `prov` is used only for logging.
 * The function takes map->lock internally.
 */
void smr_unmap_region(const struct fi_provider *prov, struct smr_map *map,
int64_t id);
void smr_unmap_from_endpoint(struct smr_region *region, int64_t id);
void smr_exchange_all_peers(struct smr_region *region);
int smr_map_add(const struct fi_provider *prov, struct smr_map *map,
Expand Down

0 comments on commit dfc9e23

Please sign in to comment.