Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DYN-6666] Improvements to workspace checksum that is needed for Dynamo ML data pipeline. #15010

Merged
merged 3 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions src/DynamoCore/Configuration/GraphChecksumItem.cs
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
using System;
using System.Collections.ObjectModel;
using Dynamo.Core;
using Dynamo.Properties;
using System.Collections.Generic;

namespace Dynamo.Configuration
{
/// <summary>
/// Represents the stringified version of the nodes connections from a graph,
/// stored as a single checksum string keyed by the graph's id.
/// </summary>
[Obsolete("This property is not needed anymore in the preference settings and can be removed in a future version of Dynamo.")]
public class GraphChecksumItem
{
/// <summary>
/// Id of the graph this item belongs to (compared against the workspace guid rendered as a string).
/// </summary>
public string GraphId { get; set; }

/// <summary>
/// The single checksum string stored for the graph.
/// </summary>
public string Checksum { get; set; }
}

/// <summary>
/// Pairs a graph id with the list of checksums saved for that graph.
/// This is the element type that gets XML-serialized for the Dynamo ML data pipeline.
/// </summary>
public class GraphChecksumPair
reddyashish marked this conversation as resolved.
Show resolved Hide resolved
{
/// <summary>
/// Id of the graph (the workspace guid rendered as a string).
/// </summary>
public string GraphId { get; set; }

/// <summary>
/// All checksums recorded for the graph; despite the singular name this is a list.
/// </summary>
public List<string> Checksum { get; set; }
}
}
1 change: 1 addition & 0 deletions src/DynamoCore/Configuration/IPreferences.cs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ public interface IPreferences
/// <param name="value">Active state to set</param>
void SetIsBackgroundPreviewActive(string name, bool value);

[Obsolete("This property is not needed anymore in the preference settings and can be removed in a future version of Dynamo.")]
/// <summary>
/// Return a list of GraphChecksumItems
/// </summary>
Expand Down
1 change: 1 addition & 0 deletions src/DynamoCore/Configuration/PreferenceSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,7 @@ public bool DisableTrustWarnings
/// <summary>
/// Gets or sets the list of GraphChecksumItems (one graph id / checksum entry each).
/// Marked obsolete: the preference settings no longer need to carry this list.
/// </summary>
[Obsolete("This property is not needed anymore in the preference settings and can be removed in a future version of Dynamo.")]
public List<GraphChecksumItem> GraphChecksumItemsList { get; set; }

// This function is used to deserialize the trusted locations manually
Expand Down
15 changes: 15 additions & 0 deletions src/DynamoCore/Models/DynamoModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,17 @@ internal LuceneSearchUtility LuceneUtility
}
}

/// <summary>
/// Gets or sets the dictionary of saved workspace checksums.
/// The key is the workspace guid and the value is the list of saved checksums (sha256 hashes) for that workspace.
/// </summary>
internal Dictionary<string, List<string>> GraphChecksumDictionary { get; set; }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you explain why we need both of these data structures?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, seems like if you used JSON you could just serialize this directly and skip the list.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes using JSON serialization now. #15116


/// <summary>
/// Gets or sets the list of GraphChecksumPair entries, each holding a graph id and the checksums saved for it.
/// This list is the object passed to the XML serializer when the checksums are written to disk.
/// </summary>
public List<GraphChecksumPair> GraphChecksumList { get; set; }

#endregion

#region static properties
Expand Down Expand Up @@ -979,6 +990,10 @@ protected DynamoModel(IStartConfiguration config)
{
LuceneUtility.DisposeWriter();
}

GraphChecksumList = new List<GraphChecksumPair>();
GraphChecksumDictionary = new Dictionary<string, List<string>>();

// This event should only be raised at the end of this method.
DynamoReady(new ReadyParams(this));
}
Expand Down
8 changes: 8 additions & 0 deletions src/DynamoCore/PublicAPI.Unshipped.txt
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ Dynamo.Configuration.GraphChecksumItem.Checksum.set -> void
Dynamo.Configuration.GraphChecksumItem.GraphChecksumItem() -> void
Dynamo.Configuration.GraphChecksumItem.GraphId.get -> string
Dynamo.Configuration.GraphChecksumItem.GraphId.set -> void
Dynamo.Configuration.GraphChecksumPair
Dynamo.Configuration.GraphChecksumPair.Checksum.get -> System.Collections.Generic.List<string>
Dynamo.Configuration.GraphChecksumPair.Checksum.set -> void
Dynamo.Configuration.GraphChecksumPair.GraphChecksumPair() -> void
Dynamo.Configuration.GraphChecksumPair.GraphId.get -> string
Dynamo.Configuration.GraphChecksumPair.GraphId.set -> void
Dynamo.Configuration.GroupStyleItem
Dynamo.Configuration.GroupStyleItem.GroupStyleItem() -> void
Dynamo.Configuration.PreferenceSettings
Expand Down Expand Up @@ -1864,6 +1870,8 @@ Dynamo.Models.DynamoModel.ExtensionManager.get -> Dynamo.Extensions.IExtensionMa
Dynamo.Models.DynamoModel.ForceRun() -> void
Dynamo.Models.DynamoModel.ForceRunCancelCommand
Dynamo.Models.DynamoModel.ForceRunCancelCommand.ForceRunCancelCommand(bool showErrors, bool cancelRun) -> void
Dynamo.Models.DynamoModel.GraphChecksumList.get -> System.Collections.Generic.List<Dynamo.Configuration.GraphChecksumPair>
Dynamo.Models.DynamoModel.GraphChecksumList.set -> void
Dynamo.Models.DynamoModel.HostVersion.get -> string
Dynamo.Models.DynamoModel.HostVersion.set -> void
Dynamo.Models.DynamoModel.InsertFileCommand
Expand Down
85 changes: 66 additions & 19 deletions src/DynamoCoreWpf/ViewModels/Core/DynamoViewModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
using System.Windows.Forms;
using System.Windows.Media;
using System.Windows.Threading;
using System.Xml;
using System.Xml.Serialization;
using Dynamo.Configuration;
using Dynamo.Core;
using Dynamo.Engine;
Expand Down Expand Up @@ -67,6 +69,7 @@ public partial class DynamoViewModel : ViewModelBase, IDynamoViewModel
private Point transformOrigin;
private bool showStartPage = false;
private PreferencesViewModel preferencesViewModel;
private string dynamoMLDataPath = string.Empty;

// Can the user run the graph
private bool CanRunGraph => HomeSpace.RunSettings.RunEnabled && !HomeSpace.GraphRunInProgress;
Expand Down Expand Up @@ -768,11 +771,26 @@ protected DynamoViewModel(StartConfiguration startConfiguration)
model.ComputeModelDeserialized += model_ComputeModelDeserialized;
model.RequestNotification += model_RequestNotification;

preferencesViewModel = new PreferencesViewModel(this);
preferencesViewModel = new PreferencesViewModel(this);

dynamoMLDataPath = Path.Combine(Model.PathManager.UserDataDirectory, "DynamoMLDataPipeline.xml");

if (!DynamoModel.IsTestMode && !DynamoModel.IsHeadless)
{
model.State = DynamoModel.DynamoModelState.StartedUI;

// Deserialize the workspace checksum hashes that are used for the Dynamo ML data pipeline.
var checksums = new List<GraphChecksumPair>();
var serializer = new XmlSerializer(Model.GraphChecksumList.GetType());

if (File.Exists(dynamoMLDataPath))
{
using (var reader = XmlReader.Create(dynamoMLDataPath))
{
checksums = (List<GraphChecksumPair>)serializer.Deserialize(reader);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe this should be wrapped in a try catch, what if the data is corrupt?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added exception handling. Addressed here: #15116

}
Model.GraphChecksumDictionary = checksums.ToDictionary(x => x.GraphId, x => x.Checksum);
}
}

FileTrustViewModel = new FileTrustWarningViewModel();
Expand Down Expand Up @@ -2192,30 +2210,58 @@ internal bool CanSaveAs(object parameters)
}

/// <summary>
/// Indicates if the graph has been changed substantially bearing in mind the connections of its nodes and store the checksum value of the graph in the preferences to later comparison
/// Indicates whether the workspace has changed, based on its node connections, and stores the checksum value of the graph.
/// </summary>
/// <returns></returns>
private bool HasSubstantialCheckSum()
private bool HasDifferentialCheckSum()
{
bool substantialChecksum = false;
bool differentialChecksum = false;
string graphId = Model.CurrentWorkspace.Guid.ToString();

GraphChecksumItem checksumItem = PreferenceSettings.GraphChecksumItemsList.Where(i => i.GraphId == graphId).FirstOrDefault();
if (checksumItem != null)

Model.GraphChecksumDictionary.TryGetValue(graphId, out List<string> checksums);

// compare the current checksum with previous hash values.
if (checksums != null)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How large can this list get?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are only serializing unique checksums of saved workspaces (on closing the workspace). I think the idea is to turn this feature off once we have enough data to train the ML model. If the feature is off, the checksums are not calculated or serialized.

{
if (checksumItem.Checksum != currentWorkspaceViewModel.Checksum)
if (!checksums.Contains(currentWorkspaceViewModel.CurrentCheckSum))
{
PreferenceSettings.GraphChecksumItemsList.Remove(checksumItem);
PreferenceSettings.GraphChecksumItemsList.Add(new GraphChecksumItem() { GraphId = graphId, Checksum = currentWorkspaceViewModel.Checksum });
substantialChecksum = true;
checksums.Add(currentWorkspaceViewModel.CurrentCheckSum);
Model.GraphChecksumDictionary.Remove(graphId);
Model.GraphChecksumDictionary.Add(graphId, checksums);
differentialChecksum = true;
}
}
else
{
PreferenceSettings.GraphChecksumItemsList.Add(new GraphChecksumItem() { GraphId = graphId, Checksum = currentWorkspaceViewModel.Checksum });
substantialChecksum = true;
Model.GraphChecksumDictionary.Add(graphId, new List<string>() { currentWorkspaceViewModel.CurrentCheckSum });
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there ever more than one item in this list?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, each workspace can have a list of unique checksums.

differentialChecksum = true;
}

// if the checksum is different from previous hashes, serialize this new info.
if (differentialChecksum)
{
var graphChecksums = new List<GraphChecksumPair>();
foreach (KeyValuePair<string, List<string>> entry in Model.GraphChecksumDictionary)
{
var item = new GraphChecksumPair
{
GraphId = entry.Key,
Checksum = entry.Value
};

graphChecksums.Add(item);
}

var serializer = new XmlSerializer(Model.GraphChecksumList.GetType());
reddyashish marked this conversation as resolved.
Show resolved Hide resolved
using (var writer = XmlWriter.Create(dynamoMLDataPath))
{
Model.GraphChecksumList = graphChecksums;
serializer.Serialize(writer, Model.GraphChecksumList);
}
}
return substantialChecksum;

return differentialChecksum;
}

private void InternalSaveAs(string path, SaveContext saveContext, bool isBackup = false)
Expand All @@ -2239,13 +2285,14 @@ private void InternalSaveAs(string path, SaveContext saveContext, bool isBackup
{
AddToRecentFiles(path);

if ((currentWorkspaceViewModel?.IsHomeSpace ?? true) && HomeSpace.HasRunWithoutCrash && Model.CurrentWorkspace.IsValidForFDX && IsMLDataIngestionPipelineinBeta && currentWorkspaceViewModel.Checksum != string.Empty)
if ((currentWorkspaceViewModel?.IsHomeSpace ?? true) && HomeSpace.HasRunWithoutCrash &&
Model.CurrentWorkspace.IsValidForFDX && !IsMLDataIngestionPipelineinBeta && currentWorkspaceViewModel.Checksum != string.Empty)
{
Model.Logger.Log("The Workspace is valid for FDX");
Model.Logger.Log("The Workspace id is : " + currentWorkspaceViewModel.Model.Guid.ToString());
Model.Logger.Log("The Workspace checksum is : " + currentWorkspaceViewModel.Checksum);
Model.Logger.Log("The Workspace has Substantial checksum, so is ready to send to FDX : " + HasSubstantialCheckSum().ToString());
MLDataPipelineExtension.DynamoMLDataPipeline.DataExchange(path);
if (HasDifferentialCheckSum())
{
Model.Logger.Log("This Workspace is shared to train the Dynamo Machine Learning model.");
MLDataPipelineExtension.DynamoMLDataPipeline.DataExchange(path);
}
}
}
}
Expand Down
60 changes: 22 additions & 38 deletions src/DynamoCoreWpf/ViewModels/Core/WorkspaceViewModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
using System.Collections.Specialized;
using System.ComponentModel;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Windows;
Expand Down Expand Up @@ -363,57 +362,42 @@ public bool IsHomeSpace
[JsonIgnore]
internal JObject JsonRepresentation { get; set; }

[JsonIgnore]
internal string CurrentCheckSum { get; set; }

/// <summary>
/// Returns the stringified representation of the connected nodes
/// Returns the stringified representation of the node connections in the workspace.
/// </summary>
[JsonIgnore]
public string Checksum
{
get
{
List<string> nodeInfoConnections = new List<string>();
JObject jsonWorkspace = JsonRepresentation;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you get rid of JsonRepresentation? - I think it may have been added just for this method...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is used here:

JsonRepresentation = JObject.Parse(saveContent);

var nodes = jsonWorkspace["Nodes"];
var connectors = Connectors;
reddyashish marked this conversation as resolved.
Show resolved Hide resolved

List<string> nodeIds = new List<string>();
foreach (JObject node in nodes)
foreach (var connector in Connectors)
{
var nodeProperties = node.Children<JProperty>();
JProperty id = nodeProperties.FirstOrDefault(x => x.Name == "Id");
nodeIds.Add(id.Value.ToString());
}

nodeIds.Sort();

foreach (string nodeId in nodeIds)
{
List<string> outputIds = new List<string>();
var node = jsonWorkspace["Nodes"].Where(t => t.Value<string>("Id") == nodeId).Select(t => t).FirstOrDefault();
var outputsProperty = node.Children<JProperty>().FirstOrDefault(x => x.Name == "Outputs");
var outputs = (JArray)outputsProperty.Value;
int outputIndex = 1;

foreach (JObject output in outputs)
{
var outputProperties = output.Children<JProperty>();
JProperty outputId = outputProperties.FirstOrDefault(x => x.Name == "Id");
outputIds.Add(outputId.Value.ToString());
var connectorModel = connector.ConnectorModel;

var connectorsProperty = jsonWorkspace["Connectors"].Where(t => t.Value<string>("Start") == outputId.Value.ToString());
var startingPort= connectorModel.Start;
var endingPort = connectorModel.End;

foreach (var connector in connectorsProperty)
{
var connectorProperties = connector.Children<JProperty>();
JProperty endProperty = connectorProperties.FirstOrDefault(x => x.Name == "End");
string inputId = (String)endProperty.Value;
// Each node connection gets a unique id in the format: startnodeid[startportindex]endnodeid[endportindex].
nodeInfoConnections.Add(startingPort.Owner.AstIdentifierGuid + "[" + startingPort.Index.ToString() + "]" + endingPort.Owner.AstIdentifierGuid + "[" + endingPort.Index.ToString() + "]");
Copy link
Member

@mjkkirschner mjkkirschner Apr 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is still bizarre to me - I guess you want to base your hash data only on the connections of the graph and no other properties, and in this case sha256 is a good hash to choose, as any small change in the input will create a big change in the output.

What I was imagining originally was that you could just use a hash that generates collisions - like SHA1 or MD5 - then just use the current JSON of the entire graph, but that is much less controllable than the solution you have here - depends on what you need exactly.

}

var outputConnectedNode = GetNodeByInputId(inputId, jsonWorkspace);
nodeInfoConnections.Add(nodeId + "|[" + outputIndex.ToString() + "|" + outputConnectedNode.Item1 + "|" + outputConnectedNode.Item2.ToString() + "]");
}
outputIndex++;
}
if (nodeInfoConnections.Count > 0)
{
var checksumhash = Hash.ToSha256String(String.Join(",", nodeInfoConnections));
CurrentCheckSum = checksumhash;
return checksumhash;
}
else
{
CurrentCheckSum = string.Empty;
return string.Empty;
}
return nodeInfoConnections.Count > 0 ? string.Join(",", nodeInfoConnections) : string.Empty;
}
}

Expand Down
16 changes: 16 additions & 0 deletions src/DynamoUtilities/Hash.cs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,22 @@ internal static string ToBase32String(byte[] input, bool addPadding = false)

return result;
}


/// <summary>
/// Converts the string into a SHA-256 hash, returned as hexadecimal text.
/// </summary>
/// <param name="s">The text to hash; it is encoded as UTF-8 bytes before hashing.</param>
/// <returns>The hash bytes, each formatted as two lowercase hex digits and concatenated.</returns>
internal static string ToSha256String(string s)
{
using var mySHA256 = SHA256.Create();
reddyashish marked this conversation as resolved.
Show resolved Hide resolved

byte[] bytes = mySHA256.ComputeHash(Encoding.UTF8.GetBytes(s));
var sb = new StringBuilder();

for (int i = 0; i < bytes.Length; i++)
{
// "x2" renders each byte as exactly two lowercase hex characters.
sb.Append(bytes[i].ToString("x2"));
}
return sb.ToString();
}
}
}

Expand Down
22 changes: 5 additions & 17 deletions test/DynamoCoreTests/Configuration/PreferenceSettingsTests.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
using System;
using System.Collections.Generic;
using System.IO;
using Dynamo.Configuration;
using Dynamo.Models;
using NUnit.Framework;
using System.Linq;
using System;
using Dynamo.Interfaces;
using System.Reflection;
using Dynamo.Configuration;
using Dynamo.Interfaces;
using Dynamo.Models;
using Dynamo.Utilities;
using NUnit.Framework;

namespace Dynamo.Tests.Configuration
{
Expand Down Expand Up @@ -358,18 +358,6 @@ PreferencesComparison comparePrefenceSettings(PreferenceSettings defaultSettings
propertiesWithDifferentValue.Add(destinationPi.Name);
}
}
else if (destinationPi.PropertyType == typeof(List<GraphChecksumItem>))
{
if (((List<GraphChecksumItem>)sourcePi.GetValue(newGeneralSettings, null)).Count ==
((List<GraphChecksumItem>)destinationPi.GetValue(defaultSettings, null)).Count)
{
propertiesWithSameValue.Add(destinationPi.Name);
}
else
{
propertiesWithDifferentValue.Add(destinationPi.Name);
}
}
else
{
if (newValue?.ToString() == oldValue?.ToString())
Expand Down
Loading
Loading