Skip to content

Commit

Permalink
Merge pull request #510 from paillave/v
Browse files Browse the repository at this point in the history
redo xml parser
  • Loading branch information
paillave authored Nov 24, 2024
2 parents 0a78e97 + ec0ba1c commit 89fa010
Show file tree
Hide file tree
Showing 12 changed files with 534 additions and 148 deletions.
10 changes: 10 additions & 0 deletions src/Paillave.Etl.XmlFile/Core/IXmlObjectReader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
using System;
using System.IO;
using System.Threading;

namespace Paillave.Etl.XmlFile.Core;

/// <summary>
/// Contract for a reader that consumes XML content supplied as a stream.
/// </summary>
public interface IXmlObjectReader
{
/// <summary>
/// Reads the content of <paramref name="fileStream"/>.
/// The method returns nothing; how results are delivered is left to the implementation.
/// </summary>
/// <param name="fileStream">Stream holding the content to read.</param>
/// <param name="cancellationToken">Token supplied by the caller; presumably used to interrupt the read (how it is honored depends on the implementation).</param>
void Read(Stream fileStream, CancellationToken cancellationToken);
}
2 changes: 0 additions & 2 deletions src/Paillave.Etl.XmlFile/Core/Mapping/XmlFieldDefinition.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Xml;

namespace Paillave.Etl.XmlFile.Core.Mapping
Expand Down
2 changes: 0 additions & 2 deletions src/Paillave.Etl.XmlFile/Core/XmlFileDefinition.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
using System;
using System.Collections.Generic;
using System.Linq.Expressions;
using System.Text;
using System.Xml;
using Paillave.Etl.XmlFile.Core.Mapping;

namespace Paillave.Etl.XmlFile.Core
Expand Down
4 changes: 2 additions & 2 deletions src/Paillave.Etl.XmlFile/Core/XmlNodeDefinition.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ private void SetFieldDefinition(XmlFieldDefinition xmlFieldDefinition)
var existingFieldDefinition = _xmlFieldDefinitions.FirstOrDefault(i => i.TargetPropertyInfo.Name == xmlFieldDefinition.TargetPropertyInfo.Name);
if (existingFieldDefinition == null)
_xmlFieldDefinitions.Add(xmlFieldDefinition);
else
if (xmlFieldDefinition.NodePath != null) existingFieldDefinition.NodePath = xmlFieldDefinition.NodePath;
else if (xmlFieldDefinition.NodePath != null)
existingFieldDefinition.NodePath = xmlFieldDefinition.NodePath;
}
// public XmlNodeDefinition<T> MapXPathToProperty<TField>(string valueXPathQuery, Expression<Func<T, TField>> memberLambda)
// {
Expand Down
22 changes: 16 additions & 6 deletions src/Paillave.Etl.XmlFile/Core/XmlNodeParsed.cs
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;

namespace Paillave.Etl.XmlFile.Core
{
public class XmlNodeParsed
{
public string SourceName { get; internal set; }
public string NodeDefinitionName { get; internal set; }
public string NodePath { get; internal set; }
public Type Type { get; internal set; }
public object Value { get; internal set; }
/// <summary>
/// Creates the description of one parsed XML node.
/// </summary>
/// <param name="sourceName">Value exposed through <see cref="SourceName"/>.</param>
/// <param name="nodeDefinitionName">Value exposed through <see cref="NodeDefinitionName"/>.</param>
/// <param name="nodePath">Value exposed through <see cref="NodePath"/>.</param>
/// <param name="type">Value exposed through <see cref="Type"/>.</param>
/// <param name="value">Value exposed through <see cref="Value"/>.</param>
/// <param name="correlationKeys">
/// Dictionary exposed read-only through <see cref="CorrelationKeys"/>.
/// It is wrapped, not copied, so later changes made by the caller to this dictionary stay visible.
/// </param>
public XmlNodeParsed(string sourceName, string nodeDefinitionName, string nodePath, Type type, object value, IDictionary<Type, Guid> correlationKeys)
{
    (this.SourceName, this.NodeDefinitionName, this.NodePath, this.Type, this.Value) =
        (sourceName, nodeDefinitionName, nodePath, type, value);
    this.CorrelationKeys = new ReadOnlyDictionary<Type, Guid>(correlationKeys);
}
public string SourceName { get; }
public string NodeDefinitionName { get; }
public string NodePath { get; }
public Type Type { get; }
public object Value { get; }
public T GetValue<T>() => (T)Value;
// public object[] ParentValues { get; internal set; }
// public T GetValue<T>(int level = 0) => (T)(level == 0 ? Value : ParentValues[level - 1]);
public HashSet<Guid> CorrelationKeys { get; set; } = new HashSet<Guid>();
public ReadOnlyDictionary<Type, Guid> CorrelationKeys { get; }
}
}
261 changes: 128 additions & 133 deletions src/Paillave.Etl.XmlFile/Core/XmlObjectReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,162 +7,157 @@
using System.Threading;
using System.Xml;

namespace Paillave.Etl.XmlFile.Core
namespace Paillave.Etl.XmlFile.Core;
[Obsolete]
public class XmlObjectReader : IXmlObjectReader
{
public class XmlObjectReader
private class XmlReadField
{
private class XmlReadField
{
public XmlFieldDefinition Definition { get; set; }
public IXmlNodeDefinition NodeDefinition { get; set; }
public int Depth { get; set; }
public object Value { get; set; }
}
public XmlFieldDefinition Definition { get; set; }
public IXmlNodeDefinition NodeDefinition { get; set; }
public int Depth { get; set; }
public object Value { get; set; }
}

private HashSet<string> _xmlFieldsDefinitionSearch;
private HashSet<string> _xmlNodesDefinitionSearch;
private HashSet<string> _xmlFieldsDefinitionSearch;
private HashSet<string> _xmlNodesDefinitionSearch;

private readonly List<XmlReadField> _inScopeReadFields = new List<XmlReadField>();
private readonly XmlFileDefinition _xmlFileDefinition;
private readonly List<XmlReadField> _inScopeReadFields = new List<XmlReadField>();
private readonly XmlFileDefinition _xmlFileDefinition;
private readonly string _sourceName;
private readonly Action<XmlNodeParsed> _pushResult;

public XmlObjectReader(XmlFileDefinition xmlFileDefinition)
{
_xmlFileDefinition = xmlFileDefinition;
_xmlNodesDefinitionSearch = new HashSet<string>(xmlFileDefinition.XmlNodeDefinitions.Select(i => i.NodePath).Distinct());
_xmlFieldsDefinitionSearch = new HashSet<string>(xmlFileDefinition.XmlNodeDefinitions.SelectMany(nd => nd.GetXmlFieldDefinitions().Select(fd => fd.NodePath)).Distinct());
}
private bool XmlReadFieldShouldBeCleanedUp(XmlReadField xmlReadField, int depth)
{
var depthScope = xmlReadField.Definition.DepthScope;
int depthLimit;
if (depthScope > 0)
depthLimit = depthScope;
else
depthLimit = xmlReadField.Depth + depthScope;
return depth < depthLimit;
}
private void ProcessEndOfAnyNode(Stack<NodeLevel> nodes)
{
foreach (var item in _inScopeReadFields.Where(i => XmlReadFieldShouldBeCleanedUp(i, nodes.Count - 1)).ToList())
_inScopeReadFields.Remove(item);
}
private void ProcessAttributeValue(string key, Stack<NodeLevel> nodes, string stringContent)
/// <summary>
/// Creates a reader bound to a file definition, a source name and a result callback.
/// </summary>
/// <param name="xmlFileDefinition">Definition whose node and field paths are indexed for lookup while reading.</param>
/// <param name="sourceName">Source name stored for use when nodes are produced.</param>
/// <param name="pushResult">Callback stored for use when a parsed node is produced.</param>
public XmlObjectReader(XmlFileDefinition xmlFileDefinition, string sourceName, Action<XmlNodeParsed> pushResult)
{
    _xmlFileDefinition = xmlFileDefinition;
    _sourceName = sourceName;
    _pushResult = pushResult;

    // Paths of the node definitions themselves.
    var nodePaths = xmlFileDefinition.XmlNodeDefinitions
        .Select(nodeDefinition => nodeDefinition.NodePath)
        .Distinct();
    _xmlNodesDefinitionSearch = new HashSet<string>(nodePaths);

    // Paths of every field declared by any node definition.
    var fieldPaths = xmlFileDefinition.XmlNodeDefinitions
        .SelectMany(nodeDefinition => nodeDefinition.GetXmlFieldDefinitions().Select(fieldDefinition => fieldDefinition.NodePath))
        .Distinct();
    _xmlFieldsDefinitionSearch = new HashSet<string>(fieldPaths);
}
/// <summary>
/// Tells whether a previously read field is out of scope at the given depth.
/// </summary>
/// <param name="xmlReadField">Field read earlier, with the depth at which it was read.</param>
/// <param name="depth">Depth to test against.</param>
/// <returns><c>true</c> when <paramref name="depth"/> is strictly below the field's depth limit.</returns>
private bool XmlReadFieldShouldBeCleanedUp(XmlReadField xmlReadField, int depth)
{
    var scope = xmlReadField.Definition.DepthScope;

    // A positive scope is used as the limit itself; a zero or negative scope
    // is an offset applied to the depth at which the field was read.
    int limit = scope > 0 ? scope : xmlReadField.Depth + scope;
    return depth < limit;
}
/// <summary>
/// Drops every in-scope read field that <see cref="XmlReadFieldShouldBeCleanedUp"/> reports
/// as out of scope for the depth <c>nodes.Count - 1</c>.
/// </summary>
/// <param name="nodes">Current stack of node levels; only its count is used.</param>
private void ProcessEndOfAnyNode(Stack<NodeLevel> nodes)
{
    // The depth is constant for the whole pass, so compute it once.
    int depth = nodes.Count - 1;

    // Single in-place pass instead of snapshotting the matches and removing them one by one.
    _inScopeReadFields.RemoveAll(readField => XmlReadFieldShouldBeCleanedUp(readField, depth));
}
private void ProcessAttributeValue(string key, Stack<NodeLevel> nodes, string stringContent)
{
// string key = $"/{string.Join("/", nodes.Reverse())}";
if (!_xmlFieldsDefinitionSearch.Contains(key)) return;
var fds = _xmlFileDefinition.XmlNodeDefinitions.SelectMany(nd => nd.GetXmlFieldDefinitions().Select(fd => new { Fd = fd, Nd = nd })).Where(i => i.Fd.NodePath == key).ToList();
if (string.IsNullOrWhiteSpace(stringContent))
{
// string key = $"/{string.Join("/", nodes.Reverse())}";
if (!_xmlFieldsDefinitionSearch.Contains(key)) return;
var fds = _xmlFileDefinition.XmlNodeDefinitions.SelectMany(nd => nd.GetXmlFieldDefinitions().Select(fd => new { Fd = fd, Nd = nd })).Where(i => i.Fd.NodePath == key).ToList();
if (string.IsNullOrWhiteSpace(stringContent))
foreach (var fd in fds)
{
foreach (var fd in fds)
_inScopeReadFields.Add(new XmlReadField
{
_inScopeReadFields.Add(new XmlReadField
{
Depth = nodes.Count - 1,
Definition = fd.Fd,
NodeDefinition = fd.Nd,
Value = null
});
}
}
else
{
foreach (var fd in fds)
{
_inScopeReadFields.Add(new XmlReadField
{
Depth = nodes.Count - 1,
Definition = fd.Fd,
NodeDefinition = fd.Nd,
Value = fd.Fd.Convert(stringContent)
});
}
Depth = nodes.Count - 1,
Definition = fd.Fd,
NodeDefinition = fd.Nd,
Value = null
});
}
}
private string ComputeKey(Stack<NodeLevel> nodes) => $"/{string.Join("/", nodes.Select(i => i.Name).Reverse())}";
private void ProcessEndOfNode(Stack<NodeLevel> nodes, string text, Action<XmlNodeParsed> pushResult, string sourceName)
else
{
string key = ComputeKey(nodes);
if (_xmlFieldsDefinitionSearch.Contains(key))
foreach (var fd in fds)
{
ProcessAttributeValue(key, nodes, text);
}
else if (_xmlNodesDefinitionSearch.Contains(key))
{
var (value, nd) = CreateValue(sourceName, key);
pushResult(new XmlNodeParsed
_inScopeReadFields.Add(new XmlReadField
{
NodeDefinitionName = nd.Name,
SourceName = sourceName,
NodePath = nd.NodePath,
Type = nd.Type,
Value = value,
CorrelationKeys = nodes.Select(i => i.Guid).Where(i => i.HasValue).Select(i => i.Value).ToHashSet()
Depth = nodes.Count - 1,
Definition = fd.Fd,
NodeDefinition = fd.Nd,
Value = fd.Fd.Convert(stringContent)
});
}
ProcessEndOfAnyNode(nodes);
}

private (object value, IXmlNodeDefinition nd) CreateValue(string sourceName, string key)
}
/// <summary>
/// Builds the slash-separated path of the given node stack, prefixed with a leading "/".
/// </summary>
/// <param name="nodes">Stack of node levels, innermost level on top.</param>
/// <returns>The level names joined with "/", outermost level first.</returns>
private string ComputeKey(Stack<NodeLevel> nodes)
{
    // A stack enumerates from the top down, so reverse to list the outermost level first.
    var outermostFirst = nodes.Select(level => level.Name).Reverse();
    return "/" + string.Join("/", outermostFirst);
}
private void ProcessEndOfNode(Stack<NodeLevel> nodes, string text, Action<XmlNodeParsed> pushResult, string sourceName)
{
string key = ComputeKey(nodes);
if (_xmlFieldsDefinitionSearch.Contains(key))
{
var nd = _xmlFileDefinition.XmlNodeDefinitions.FirstOrDefault(i => i.NodePath == key);
var objectBuilder = new ObjectBuilder(nd.Type);
foreach (var inScopeReadField in _inScopeReadFields.Where(rf => rf.NodeDefinition.NodePath == key))
objectBuilder.Values[inScopeReadField.Definition.TargetPropertyInfo.Name] = inScopeReadField.Value;
foreach (var propName in nd.GetXmlFieldDefinitions().Where(i => i.ForRowGuid).Select(i => i.TargetPropertyInfo.Name).ToList())
objectBuilder.Values[propName] = Guid.NewGuid();
foreach (var propName in nd.GetXmlFieldDefinitions().Where(i => i.ForSourceName).Select(i => i.TargetPropertyInfo.Name).ToList())
objectBuilder.Values[propName] = sourceName;
return (objectBuilder.CreateInstance(), nd);
ProcessAttributeValue(key, nodes, text);
}

public void Read(Stream fileStream, string sourceName, Action<XmlNodeParsed> pushResult, CancellationToken cancellationToken)
else if (_xmlNodesDefinitionSearch.Contains(key))
{
XmlReaderSettings xrs = new XmlReaderSettings();
foreach (var item in _xmlFileDefinition.PrefixToUriNameSpacesDictionary)
xrs.Schemas.Add(item.Key, item.Value);
xrs.IgnoreWhitespace = true;
xrs.IgnoreComments = true;
xrs.IgnoreProcessingInstructions = true;
var (value, nd) = CreateValue(sourceName, key);
pushResult(new XmlNodeParsed(sourceName, nd.Name, nd.NodePath, nd.Type, value, new Dictionary<Type, Guid>()));
}
ProcessEndOfAnyNode(nodes);
}

/// <summary>
/// Builds the object for the node definition whose path equals <paramref name="key"/>.
/// </summary>
/// <param name="sourceName">Value assigned to every property flagged <c>ForSourceName</c>.</param>
/// <param name="key">Node path used to pick the node definition and the in-scope fields.</param>
/// <returns>The created instance together with the node definition it was built from.</returns>
/// <remarks>
/// The definition lookup result is used without a null check, so a key that matches
/// no node definition fails on the first member access.
/// </remarks>
private (object value, IXmlNodeDefinition nd) CreateValue(string sourceName, string key)
{
    var nodeDefinition = _xmlFileDefinition.XmlNodeDefinitions.FirstOrDefault(i => i.NodePath == key);
    var builder = new ObjectBuilder(nodeDefinition.Type);

    // Values read so far for fields that belong to this node definition.
    foreach (var readField in _inScopeReadFields)
    {
        if (readField.NodeDefinition.NodePath != key)
        {
            continue;
        }
        builder.Values[readField.Definition.TargetPropertyInfo.Name] = readField.Value;
    }

    // Each row-guid property gets its own freshly generated guid.
    var rowGuidProperties =
        (from fieldDefinition in nodeDefinition.GetXmlFieldDefinitions()
         where fieldDefinition.ForRowGuid
         select fieldDefinition.TargetPropertyInfo.Name).ToList();
    foreach (var propertyName in rowGuidProperties)
    {
        builder.Values[propertyName] = Guid.NewGuid();
    }

    // Source-name properties are written last, so they win over the values set above.
    var sourceNameProperties =
        (from fieldDefinition in nodeDefinition.GetXmlFieldDefinitions()
         where fieldDefinition.ForSourceName
         select fieldDefinition.TargetPropertyInfo.Name).ToList();
    foreach (var propertyName in sourceNameProperties)
    {
        builder.Values[propertyName] = sourceName;
    }

    return (builder.CreateInstance(), nodeDefinition);
}

var xmlReader = XmlReader.Create(fileStream, xrs);
Stack<NodeLevel> nodes = new Stack<NodeLevel>();
string lastTextValue = null;
while (xmlReader.Read())
public void Read(Stream fileStream, CancellationToken cancellationToken)
{
XmlReaderSettings xrs = new XmlReaderSettings();
foreach (var item in _xmlFileDefinition.PrefixToUriNameSpacesDictionary)
xrs.Schemas.Add(item.Key, item.Value);
xrs.IgnoreWhitespace = true;
xrs.IgnoreComments = true;
xrs.IgnoreProcessingInstructions = true;

var xmlReader = XmlReader.Create(fileStream, xrs);
Stack<NodeLevel> nodes = new Stack<NodeLevel>();
string lastTextValue = null;
while (xmlReader.Read())
{
if (cancellationToken.IsCancellationRequested) break;
switch (xmlReader.NodeType)
{
if (cancellationToken.IsCancellationRequested) break;
switch (xmlReader.NodeType)
{
case XmlNodeType.Element:
bool isEmptyElement = xmlReader.IsEmptyElement;
lastTextValue = null;
nodes.Push(new NodeLevel { Name = xmlReader.Name, Guid = Guid.NewGuid() });
while (xmlReader.MoveToNextAttribute())
{
nodes.Push(new NodeLevel { Name = $"@{xmlReader.Name}", Guid = null });
ProcessAttributeValue(ComputeKey(nodes), nodes, xmlReader.Value);
nodes.Pop();
}
if (isEmptyElement)
{
ProcessEndOfNode(nodes, null, pushResult, sourceName);
nodes.Pop();
}
break;
case XmlNodeType.EndElement:
ProcessEndOfNode(nodes, lastTextValue, pushResult, sourceName);
lastTextValue = null;
case XmlNodeType.Element:
bool isEmptyElement = xmlReader.IsEmptyElement;
lastTextValue = null;
nodes.Push(new NodeLevel { Name = xmlReader.Name, Guid = Guid.NewGuid() });
while (xmlReader.MoveToNextAttribute())
{
nodes.Push(new NodeLevel { Name = $"@{xmlReader.Name}", Guid = null });
ProcessAttributeValue(ComputeKey(nodes), nodes, xmlReader.Value);
nodes.Pop();
break;
case XmlNodeType.Text:
lastTextValue = xmlReader.Value;
break;
}
}
if (isEmptyElement)
{
ProcessEndOfNode(nodes, null, _pushResult, _sourceName);
nodes.Pop();
}
break;
case XmlNodeType.EndElement:
ProcessEndOfNode(nodes, lastTextValue, _pushResult, _sourceName);
lastTextValue = null;
nodes.Pop();
break;
case XmlNodeType.Text:
lastTextValue = xmlReader.Value;
break;
}
}
private struct NodeLevel
{
public string Name { get; set; }
public Guid? Guid { get; set; }
}
}
// One entry of the node stack kept while reading.
private struct NodeLevel
{
// Name of the level; the reader pushes element names as-is and attribute names prefixed with "@".
public string Name { get; set; }
// Identifier of the level; the reader sets a new Guid for elements and null for attributes.
public Guid? Guid { get; set; }
}
}
Loading

0 comments on commit 89fa010

Please sign in to comment.