From d9f14f5cf96918ff7f8350be53884d6d8385b923 Mon Sep 17 00:00:00 2001 From: John Haddon Date: Thu, 7 Dec 2023 15:18:36 +0000 Subject: [PATCH] Switch : Fix context used when managing internal connection This came to light due to a CI failure on `main` : ``` Error: FAIL: testInvalidColumns (GafferImageUITest.CatalogueUITest.CatalogueUITest) ---------------------------------------------------------------------- Traceback (most recent call last): File "/__w/gaffer/gaffer/build/python/GafferImageUITest/CatalogueUITest.py", line 58, in tearDown GafferUITest.TestCase.tearDown( self ) File "/__w/gaffer/gaffer/build/python/GafferUITest/TestCase.py", line 88, in tearDown GafferTest.TestCase.tearDown( self ) File "/__w/gaffer/gaffer/build/python/GafferTest/TestCase.py", line 119, in tearDown self.fail( f"Unexpected message : {message}" ) AssertionError: Unexpected message : ERROR : Emitting signal : Unknown error ``` Investigation showed that the "Unknown error" was in fact an `IECore::Cancelled` exception, and that the signal in question was `plugInputChangedSignal()` on a Switch node inside a Text node inside an InternalImage node inside a Catalogue node. The Catalogue node was being destroyed, and as the Text node was torn down, its internal connections were modified and `Switch::inputChanged()` was called on the internal Switch node. This was all happening in a background thread whose identity I havent ascertained, and where the current context was cancelled. The destruction was actually being triggered from `IECorePython::WrapperGarbageCollector::collect()`, which is why I believe the problem only occurred on `main` - collection needs to occur at precisely the right time for the problem to show, and is highly sensitive to how many wrappers have been created previously. So what was the problem exactly (other than that garbage collection is a horribly unpredictable compromise we'd rather avoid)? In `Switch::inputChanged()` we make a shortcutting internal connection across the Switch if we can determine that the index is not context-sensitive, which we check using `PlugAlgo::dependsOnCompute()`. When the index isn't context sensitive, we call `indexPlug()->getValue()` to get its current value and then make the appropriate connection. But in this case, the index plug (which is an IntPlug) is driven by a BoolPlug, which while still not context-sensitive, _will_ require type conversion work in `getValue()`. That work is done by ValuePlug inside a Process, and the Process checks for cancellation as a matter of course. It's a bit hard to know exactly where to point the finger of blame here. It's arguable that we don't need to wrap the type conversion in a Process. But we currently do because we think it's possible that some conversions might be expensive and therefore warrant monitoring (and caching). It's also arguable that `dependsOnCompute()` should return `True` when a type conversion is involved, and that what we really should mean by "Compute" is "something involving a Process". But that's hard to explain, and would defeat the Switch's internal optimisation in cases like this where it is completely valid. Instead I've blamed the poor old `Switch` node, and fixed it by scoping a default context in `updateInternalConnection()`. This is much more limited in scope than other potential fixes, and therefore much less likely to have unintended side effects. But I think the _real_ culprit is the garbage collection. It's occuring at times completely out of our control, disposing of nodes on threads we don't want them disposed on, and emitting signals on threads we don't want them emitted on. But fixing that is far far _far_ harder. --- python/GafferTest/SwitchTest.py | 34 +++++++++++++++++++++++++++++++++ src/Gaffer/Switch.cpp | 10 +++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/python/GafferTest/SwitchTest.py b/python/GafferTest/SwitchTest.py index 76a6d0a5c31..9e14bdc7974 100644 --- a/python/GafferTest/SwitchTest.py +++ b/python/GafferTest/SwitchTest.py @@ -528,6 +528,40 @@ def testNestedPlugs( self ) : self.assertEqual( s.correspondingInput( s["out"]["value"]["x"] ), s["in"][0]["value"]["x"] ) self.assertEqual( s.activeInPlug( s["out"]["value"]["y"] ), s["in"][0]["value"]["y"] ) + def testInternalConnectionWithTypeConversionAndCanceller( self ) : + + # Make a switch with 2 inputs. + + switch = Gaffer.Switch() + switch.setup( Gaffer.IntPlug() ) + switch["in"][0].setInput( self.intPlug( 0 ) ) + switch["in"][1].setInput( self.intPlug( 1 ) ) + + # Drive the index with a BoolPlug. This means that `indexPlug()->getValue()` + # will perform a type conversion using `ValuePlug::setFrom()`, performed + # inside a `Process` which will check for cancellation in its constructor. + + boolPlug = Gaffer.BoolPlug() + switch["index"].setInput( boolPlug ) + + # Change the index by setting the value of `boolPlug`, but do this in a + # Context in which cancellation has been requested. This models a bizarre + # condition in which garbage collection destroys a `GafferImage.Shape` node + # from within a cancelled background task, and `Switch::plugInputChanged()` + # is called as the Shape is destroyed (Shape nodes have a bool->index connection + # internally). + # + # We do not want this to throw `IECore::Cancelled` because the graph authoring + # API (here, `setValue()`) is not context-sensitive, so it would be surprising + # if it considered the canceller. + + canceller = IECore.Canceller() + canceller.cancel() + with Gaffer.Context( Gaffer.Context(), canceller ) : + for index in ( 0, 1 ) : + boolPlug.setValue( index ) + self.assertTrue( switch["out"].getInput().isSame( switch["in"][index] ) ) + def setUp( self ) : GafferTest.TestCase.setUp( self ) diff --git a/src/Gaffer/Switch.cpp b/src/Gaffer/Switch.cpp index 7df75bff5f9..f2d874e5c13 100644 --- a/src/Gaffer/Switch.cpp +++ b/src/Gaffer/Switch.cpp @@ -55,6 +55,8 @@ namespace const IECore::InternedString g_inPlugsName( "in" ); const IECore::InternedString g_outPlugName( "out" ); +ConstContextPtr g_defaultContext = new Context; + } // namespace GAFFER_NODE_DEFINE_TYPE( Switch ); @@ -423,6 +425,12 @@ void Switch::updateInternalConnection() return; } - Plug *in = const_cast( oppositePlug( out, Context::current() ) ); + // The context is irrelevant since we've already checked that `indexPlug()` + // and `enabledPlug()` aren't computed. But we need to pass _something_ non-null + // because that is how we tell `oppositePlug()` we want it to take the index + // into account. And we need to scope this default context too - see + // `SwitchTest.testInternalConnectionWithTypeConversionAndCanceller()`. + Context::Scope scope( g_defaultContext.get() ); + Plug *in = const_cast( oppositePlug( out, g_defaultContext.get() ) ); out->setInput( in ); }