From 80fff0771512182fa3fd48ea361d293aba04bca4 Mon Sep 17 00:00:00 2001 From: Doug Stephen Date: Tue, 5 Mar 2024 11:10:19 -0600 Subject: [PATCH] fix(hue): Address various nil field index issues The majority of these fixes are concurrency races: Due to Hue's heavy usage of additional Cosock coroutines, it can be the case that functions that have been passed references to device records will be operating on a device record for a deleted device. This can cause unexpected nil index errors and crashes. We attempt to minimize that here with heuristics; in particular, a device that has been deleted will return `nil` for *any* of its table keys. So we check a field that we *know* a device record should have, its `.id` field. We wrap all handlers in a closure that does a sanity check on the status of the device record using this heuristic, and we wrap the handler itself in a pcall. We also address some one-off locations where nil field indexes can occur for other reasons, and introduce some additional defensiveness around other places where we've observed nil index crashes in crash metrics. --- drivers/SmartThings/philips-hue/src/disco.lua | 2 +- .../SmartThings/philips-hue/src/hue/api.lua | 19 ++- drivers/SmartThings/philips-hue/src/init.lua | 140 +++++++++++------- 3 files changed, 104 insertions(+), 57 deletions(-) diff --git a/drivers/SmartThings/philips-hue/src/disco.lua b/drivers/SmartThings/philips-hue/src/disco.lua index 99c9a87519..a433e77c6e 100644 --- a/drivers/SmartThings/philips-hue/src/disco.lua +++ b/drivers/SmartThings/philips-hue/src/disco.lua @@ -262,7 +262,7 @@ function HueDiscovery.search_bridge_for_supported_devices(driver, bridge_id, api local parent_device_id = device.parent_device_id or device:get_field(Fields.PARENT_DEVICE_ID) or "" local parent_bridge_device = driver:get_device_info(parent_device_id) local is_child_of_bridge = parent_bridge_device and (parent_bridge_device:get_field(Fields.BRIDGE_ID) == bridge_id) - if parent_bridge_device and is_child_of_bridge and not not_known_to_bridge then + if parent_bridge_device and is_child_of_bridge and not not_known_to_bridge and device.id then device.log.info(string.format("Device is no longer joined to Hue Bridge %q, deleting", parent_bridge_device.label)) driver:do_hue_light_delete(device) end diff --git a/drivers/SmartThings/philips-hue/src/hue/api.lua b/drivers/SmartThings/philips-hue/src/hue/api.lua index 4116715879..388b7d0bb0 100644 --- a/drivers/SmartThings/philips-hue/src/hue/api.lua +++ b/drivers/SmartThings/philips-hue/src/hue/api.lua @@ -29,9 +29,20 @@ local ControlMessageBuilders = { end } +local function try_send(instance, message) + if not instance._ctrl_tx then + log.error(st_utils.stringify_table(message, "Couldn't send the followings due to closed transmit channel", false)) + end + + local success, err = pcall(instance._ctrl_tx:send(message)) + if not success then + log.error(string.format("Failed to transmit Hue Control Message: %s", err)) + end +end + local function do_shutdown(instance) if instance._running then - instance._ctrl_tx:send(ControlMessageBuilders.Shutdown()) + try_send(instance, ControlMessageBuilders.Shutdown()) instance._running = false end end @@ -178,7 +189,7 @@ end function PhilipsHueApi:update_connection(hub_base_url, api_key) local msg = ControlMessageBuilders.Update(hub_base_url, api_key) - self._ctrl_tx:send(msg) + try_send(self, msg) end ---@return table|nil response REST response, nil if error @@ -187,7 +198,7 @@ local function do_get(instance, path) local reply_tx, reply_rx = channel.new() reply_rx:settimeout(10) local msg = ControlMessageBuilders.Get(path, reply_tx); - instance._ctrl_tx:send(msg) + try_send(instance, msg) local recv, err = reply_rx:receive() if err ~= nil then instance.client:close_socket() @@ -202,7 +213,7 @@ local function do_put(instance, path, payload) local reply_tx, reply_rx = channel.new() reply_rx:settimeout(10) local msg = ControlMessageBuilders.Put(path, payload, reply_tx); - instance._ctrl_tx:send(msg) + try_send(instance, msg) local recv, err = reply_rx:receive() if err ~= nil then instance.client:close_socket() diff --git a/drivers/SmartThings/philips-hue/src/init.lua b/drivers/SmartThings/philips-hue/src/init.lua index 9be833cd0e..23bca6c320 100644 --- a/drivers/SmartThings/philips-hue/src/init.lua +++ b/drivers/SmartThings/philips-hue/src/init.lua @@ -51,6 +51,31 @@ local DEFAULT_MIREK = 153 -- "forward declare" some functions local _initialize, bridge_added, light_added, init_light, init_bridge +local function safe_wrap_handler(handler) + return function(driver, device, ...) + if device == nil or (device and device.id == nil) then + log.warn("Tried to handle capability command for device that has been deleted") + return + end + local success, result = pcall(handler, driver, device, ...) + if not success then + log.error_with({ hub_logs = true }, string.format("Failed to invoke capability command handler. Reason: %s", result)) + end + return result + end +end + +local disco = Discovery.discover +local added = safe_wrap_handler(_initialize) +local init = safe_wrap_handler(_initialize) + +local refresh_handler = safe_wrap_handler(handlers.refresh_handler) +local switch_on_handler = safe_wrap_handler(handlers.switch_on_handler) +local switch_off_handler = safe_wrap_handler(handlers.switch_off_handler) +local switch_level_handler = safe_wrap_handler(handlers.switch_level_handler) +local set_color_handler = safe_wrap_handler(handlers.set_color_handler) +local set_color_temp_handler = safe_wrap_handler(handlers.set_color_temp_handler) + ---@param light_device HueChildDevice ---@param light table local function emit_light_status_events(light_device, light) @@ -260,7 +285,8 @@ local function spawn_bridge_add_api_key_task(driver, device) repeat local time_remaining = math.max(0, timeout_time - cosock.socket.gettime()) if time_remaining == 0 then - device.log.error_with({ hub_logs = true }, + local _log = device.log or log + _log.error_with({ hub_logs = true }, string.format( "Link button not pressed or API key not received for bridge \"%s\" after 30 seconds, sleeping then trying again in a few minutes.", device.label @@ -694,7 +720,7 @@ light_added = function(driver, device, parent_device_id, resource_id) driver.light_id_to_device[device_light_resource_id] = device -- the refresh handler adds lights that don't have a fully initialized bridge to a queue. - handlers.refresh_handler(driver, device) + refresh_handler(driver, device) end local function do_bridge_network_init(driver, bridge_device, bridge_url, api_key) @@ -726,7 +752,8 @@ local function do_bridge_network_init(driver, bridge_device, bridge_url, api_key end local child_device_map = {} local children = bridge_device:get_child_list() - bridge_device.log.debug(string.format("Scanning connectivity of %s child devices", #children)) + local _log = bridge_device.log or log + _log.debug(string.format("Scanning connectivity of %s child devices", #children)) for _, device_record in ipairs(children) do local hue_device_id = device_record:get_field(Fields.HUE_DEVICE_ID) if hue_device_id ~= nil then @@ -762,24 +789,28 @@ local function do_bridge_network_init(driver, bridge_device, bridge_url, api_key if connectivity_status.data and #connectivity_status.data > 0 then scanned = true - end - - for _, status in ipairs(connectivity_status.data) do - local hue_device_id = (status.owner and status.owner.rid) or "" - log.trace(string.format("Checking connectivity status for device resource id %s", hue_device_id)) - local child_device = child_device_map[hue_device_id] - if child_device then - if status.status == "connected" then - child_device.log.info_with({hub_logs=true}, "Marking Online after SSE Reconnect") - child_device:online() - child_device:set_field(Fields.IS_ONLINE, true) - elseif status.status == "connectivity_issue" then - child_device.log.info_with({hub_logs=true}, "Marking Offline after SSE Reconnect") - child_device:set_field(Fields.IS_ONLINE, false) - child_device:offline() + for _, status in ipairs(connectivity_status.data) do + local hue_device_id = (status.owner and status.owner.rid) or "" + log.trace(string.format("Checking connectivity status for device resource id %s", hue_device_id)) + local child_device = child_device_map[hue_device_id] + if child_device then + if not child_device.id then + child_device_map[hue_device_id] = nil + else + if status.status == "connected" then + child_device.log.info_with({hub_logs=true}, "Marking Online after SSE Reconnect") + child_device:online() + child_device:set_field(Fields.IS_ONLINE, true) + elseif status.status == "connectivity_issue" then + child_device.log.info_with({hub_logs=true}, "Marking Offline after SSE Reconnect") + child_device:set_field(Fields.IS_ONLINE, false) + child_device:offline() + end + end end end end + ::continue:: end end, string.format("Hue Bridge %s Zigbee Scan Task", bridge_device.label)) @@ -825,7 +856,7 @@ local function do_bridge_network_init(driver, bridge_device, bridge_url, api_key light_resource_id = driver.device_rid_to_light_rid[update_data.owner.rid] end local light_device = driver.light_id_to_device[light_resource_id] - if light_device ~= nil then + if light_device ~= nil and light_device.id ~= nil then driver.emit_light_status_events(light_device, update_data) end end @@ -834,7 +865,7 @@ local function do_bridge_network_init(driver, bridge_device, bridge_url, api_key if delete_data.type == "light" then local light_resource_id = delete_data.id local light_device = driver.light_id_to_device[light_resource_id] - if light_device ~= nil then + if light_device ~= nil and light_device.id ~= nil then log.info( string.format( "Light device \"%s\" was deleted from hue bridge %s", @@ -860,7 +891,8 @@ local function do_bridge_network_init(driver, bridge_device, bridge_url, api_key cosock.spawn(function() local hue_api = bridge_device:get_field(Fields.BRIDGE_API) if hue_api == nil then - bridge_device.log.warn("No Hue API instance available for new light event.") + local _log = bridge_device.log or log + _log.warn("No Hue API instance available for new light event.") return end @@ -978,7 +1010,7 @@ local function do_bridge_network_init(driver, bridge_device, bridge_url, api_key local bridge_id = light_device.parent_device_id or bridge_device:get_field(Fields.PARENT_DEVICE_ID) if bridge_id == bridge_device.id then table.insert(ids_to_remove, id) - handlers.refresh_handler(driver, light_device) + refresh_handler(driver, light_device) end end for _, id in ipairs(ids_to_remove) do @@ -1110,7 +1142,7 @@ init_light = function(driver, device) end device:set_field(Fields._INIT, true, { persist = false }) if device:get_field(Fields._REFRESH_AFTER_INIT) then - handlers.refresh_handler(driver, device) + refresh_handler(driver, device) device:set_field(Fields._REFRESH_AFTER_INIT, false, { persist = true }) end end @@ -1291,23 +1323,25 @@ cosock.spawn(function() return end - for _, light in ipairs(light_resource.data or {}) do - local light_resource_description = { - hue_provided_name = device_data.metadata.name, - id = light.id, - on = light.on, - color = light.color, - dimming = light.dimming, - color_temperature = light.color_temperature, - mode = light.mode, - parent_device_id = bridge_device_uuid, - hue_device_id = light.owner.rid, - hue_device_data = device_data - } - if not Discovery.light_state_disco_cache[light.id] then - log.info(string.format("Caching previously unknown light service description for %s", - device_data.metadata.name)) - Discovery.light_state_disco_cache[light.id] = light_resource_description + if light_resource.data and #light_resource.data > 0 then + for _, light in ipairs(light_resource.data) do + local light_resource_description = { + hue_provided_name = device_data.metadata.name, + id = light.id, + on = light.on, + color = light.color, + dimming = light.dimming, + color_temperature = light.color_temperature, + mode = light.mode, + parent_device_id = bridge_device_uuid, + hue_device_id = light.owner.rid, + hue_device_data = device_data + } + if not Discovery.light_state_disco_cache[light.id] then + log.info(string.format("Caching previously unknown light service description for %s", + device_data.metadata.name)) + Discovery.light_state_disco_cache[light.id] = light_resource_description + end end end @@ -1401,15 +1435,6 @@ cosock.spawn(function() end end, "Stray Hue Bulb Resolution Task") -local disco = Discovery.discover -local added = _initialize -local init = _initialize - -local refresh_handler = handlers.refresh_handler -local switch_on_handler = handlers.switch_on_handler -local switch_off_handler = handlers.switch_off_handler -local switch_level_handler = handlers.switch_level_handler -local set_color_temp_handler = handlers.set_color_temp_handler local function remove(driver, device) driver.datastore.dni_to_device_id[device.device_network_id] = nil @@ -1483,7 +1508,7 @@ local hue = Driver("hue", [capabilities.switchLevel.commands.setLevel.NAME] = switch_level_handler, }, [capabilities.colorControl.ID] = { - [capabilities.colorControl.commands.setColor.NAME] = handlers.set_color_handler, + [capabilities.colorControl.commands.setColor.NAME] = set_color_handler, }, [capabilities.colorTemperature.ID] = { [capabilities.colorTemperature.commands.setColorTemperature.NAME] = set_color_temp_handler, @@ -1499,10 +1524,21 @@ local hue = Driver("hue", api_key_to_bridge_id = {}, stray_bulb_tx = stray_bulb_tx, _lights_pending_refresh = {}, - emit_light_status_events = emit_light_status_events, + emit_light_status_events = function(light_device, light_table) + if light_device == nil or (light_device and light_device.id == nil) then + log.warn("Tried to emit light status event for device that has been deleted") + return + end + local success, result = pcall(emit_light_status_events, light_device, light_table) + if not success then + log.error_with({ hub_logs = true }, string.format("Failed to invoke emit light status handler. Reason: %s", result)) + end + return result + end, do_hue_light_delete = function(driver, device) if type(driver.try_delete_device) ~= "function" then - device.log.warn("Requesting device delete on API version that doesn't support it. Marking device offline.") + local _log = device.log or log + _log.warn("Requesting device delete on API version that doesn't support it. Marking device offline.") device:offline() return end