Skip to content

Commit

Permalink
Refactor extraction planner to improve Collection support
Browse files Browse the repository at this point in the history
This was a major overhaul of the extraction planner, designed to improve
support for Initializable Collections. The majority of the work consisted
of moving knowledge of what it means to be a Collection into a new kind of
`ExtractionPlanNode`, namely a `CollectionInitializerNode`.

RegExtract previously had a fairly unnatural notion of List<T>, blending
the List-ness of a node with the underlying type T. Even weirder,
ListOfListNode was a whole other animal that handled nesting of containers.

There is now better support for list patterns of either of the following
forms:
1. @"((\d+),? ?)+", or
2. @"(?:(\d+),? ?)+"

Previously, only the latter type of pattern (with a non-capturing group) was
supported if you needed to select out a substring from within the pattern
repeated inside the list.
  • Loading branch information
sblom committed Dec 15, 2023
1 parent 588c7fb commit 29e721f
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 155 deletions.
125 changes: 67 additions & 58 deletions RegExtract.Test/Usage.cs
Original file line number Diff line number Diff line change
@@ -1,60 +1,99 @@
using System;
using System.Linq;

using Xunit;

using RegExtract;
using System.Text.RegularExpressions;
using System.Collections.Generic;
using Xunit.Abstractions;

namespace RegExtract.Test
{
public class Usage
{
private readonly ITestOutputHelper output;

// Stores the supplied output helper so individual tests can write diagnostic text through it.
public Usage(ITestOutputHelper output) => this.output = output;

const string data = "123456789";
const string pattern = "(.)(.)(.)(.)(.)(.)(.)(.)(.)";
const string pattern_nested = "(((.)(.)(.)(.)(.)(.)(.)(.)(.)))";
const string pattern_named = "(?<n>(?<s>(?<a>.)(?<b>.)(?<c>.)(?<d>.)(?<e>.)(?<f>.)(?<g>.)(?<h>.)(?<i>.)))";

[Fact]
public void can_parse_lookbehind()
public void a001()
{
data.Extract<string>(@"(?<=(12))");
var str = ExtractionPlan<List<(char, char)>>.CreatePlan(new Regex(@"((\w)(\w))+")).ToString("x");
output.WriteLine(str);
}

[Fact]
public void can_extract_to_tuple()
public void a002()
{
var (a, b, c, d, e, f, g, h, i) = data.Extract<(int, char, string, int, char, string, int, char, string)>(pattern);
var str = ExtractionPlan<List<int>>.CreatePlan(new Regex(@"((\d+) ?)+")).ToString("x");
output.WriteLine(str);
}

Assert.IsType<int>(a);
Assert.IsType<char>(b);
Assert.IsType<string>(c);
Assert.IsType<int>(d);
Assert.IsType<char>(e);
Assert.IsType<string>(f);
Assert.IsType<int>(g);
Assert.IsType<char>(h);
Assert.IsType<string>(i);
record game(int id, List<draw> draws);
record draw(List<(int count, string color)> colors);

Assert.Equal(1, a);
Assert.Equal('2', b);
Assert.Equal("3", c);
Assert.Equal(4, d);
Assert.Equal('5', e);
Assert.Equal("6", f);
Assert.Equal(7, g);
Assert.Equal('8', h);
Assert.Equal("9", i);
// Builds an extraction plan for the nested `game` record (an id plus a list of draws,
// each draw being a list of (count, color) pairs), writes the plan's "x"-formatted
// text to the test output, then runs the plan over one sample line.
// There are no assertions: the test passes as long as nothing throws.
[Fact]
public void a003()
{
    var regex = new Regex(@"Game (\d+): (((\d+) (\w+),? ?)+;? ?)+");
    var plan = ExtractionPlan<game>.CreatePlan(regex);

    output.WriteLine(plan.ToString("x"));

    const string input = "Game 31: 9 blue, 6 red, 7 green; 20 red, 1 green, 15 blue; 6 blue, 7 green, 17 red; 2 blue, 3 green, 6 red; 1 red, 3 blue, 2 green; 5 green, 18 red, 6 blue";
    var result = plan.Extract(input);
}

// Plans extraction of a List<(char, int)> from a repeated capturing group,
// writes the plan's "x"-formatted text to the test output, then runs the plan
// over a short comma-separated sample.
// There are no assertions: the test passes as long as nothing throws.
[Fact]
public void a004()
{
    var regex = new Regex(@"(([RL])(\d+),? ?)+");
    var plan = ExtractionPlan<List<(char, int)>>.CreatePlan(regex);

    output.WriteLine(plan.ToString("x"));

    const string input = "R8, R4, L4, R8";
    var result = plan.Extract(input);
}


// Plans extraction of a Dictionary<string, (string left, string right)> from a
// repeated "KEY = (LEFT, RIGHT)" group, writes the plan's "x"-formatted text to
// the test output, then runs the plan over a two-entry sample.
// There are no assertions: the test passes as long as nothing throws.
[Fact]
public void a005()
{
    var regex = new Regex(@"((...) = \(((...), (...))\);? ?)+");
    var plan = ExtractionPlan<Dictionary<string, (string left, string right)>>.CreatePlan(regex);

    output.WriteLine(plan.ToString("x"));

    const string input = @"AAA = (BBB, CCC); BBB = (DDD, EEE)";
    var result = plan.Extract(input);
}

// Creates a List<int> plan for a list pattern whose repeated group is capturing,
// and writes the plan's "x"-formatted text to the test output.
// NOTE(review): this uses the same target type and pattern as a002.
[Fact]
public void a006()
{
    var regex = new Regex(@"((\d+) ?)+");
    var plan = ExtractionPlan<List<int>>.CreatePlan(regex);

    output.WriteLine(plan.ToString("x"));
}

[Fact]
public void can_extract_to_tuple_nested()
public void a007()
{
var (n, s, a, b, c, d, e, f, g, h, i) = data.Extract<(long, string, int, char, string, int, char, string, int, char, string)>(pattern_nested);
var str = ExtractionPlan<List<int>>.CreatePlan(new Regex(@"(?:(\d+) ?)+")).ToString("x");
output.WriteLine(str);
}

Assert.IsType<long>(n);
Assert.IsType<string>(s);
// Smoke test: a pattern consisting of a lookbehind that contains a capture group
// must be accepted by Extract without throwing. The extracted value is not checked.
[Fact]
public void can_parse_lookbehind()
{
    const string lookbehindPattern = @"(?<=(12))";

    data.Extract<string>(lookbehindPattern);
}

[Fact]
public void can_extract_to_tuple()
{
var (a, b, c, d, e, f, g, h, i) = data.Extract<(int, char, string, int, char, string, int, char, string)>(pattern);

Assert.IsType<int>(a);
Assert.IsType<char>(b);
Expand All @@ -66,9 +105,6 @@ public void can_extract_to_tuple_nested()
Assert.IsType<char>(h);
Assert.IsType<string>(i);

Assert.Equal(123456789, n);
Assert.Equal("123456789", s);

Assert.Equal(1, a);
Assert.Equal('2', b);
Assert.Equal("3", c);
Expand All @@ -80,12 +116,6 @@ public void can_extract_to_tuple_nested()
Assert.Equal("9", i);
}

// pattern_nested contains eleven capture groups (two outer wrappers plus nine
// single-character groups), while the requested tuple has only nine slots.
// The mismatch is expected to surface as an ArgumentException.
[Fact]
public void fails_when_tuple_is_wrong_arity()
    => Assert.Throws<ArgumentException>(
        () => data.Extract<(int, char, string, int, char, string, int, char, string)>(pattern_nested));

record PositionalRecord(int a, char b, string c, int d, char e, string f, int g, char h, string i);

[Fact]
Expand Down Expand Up @@ -267,12 +297,6 @@ public void can_extract_to_string_constructor()
var result = "https://www.google.com/ 12345".Extract<(Uri,int)>(@"(.*) (\d+)");
}

// Extracts a (Uri, string, int) tuple from a pattern whose first part is wrapped in
// several nested capture groups. There are no assertions: the test passes as long
// as the extraction does not throw.
[Fact]
public void can_extract_nested_to_string_constructor()
{
    const string input = "https://www.google.com/ 12345";
    const string nestedPattern = @"(((.*))) (\d+)";

    var result = input.Extract<(Uri, string, int)>(nestedPattern);
}

[Fact]
public void regex_does_not_match()
{
Expand Down Expand Up @@ -366,16 +390,6 @@ record bagdescription
}
record includedbags(int? num, string name);

// Debugging aid: only checks that a plan can be created for a list of nullable
// tuples that themselves contain a nullable nested tuple. Nothing is extracted
// or asserted.
[Fact]
public void debug2()
{
    // Earlier experiment, kept disabled:
    //var plan = RegexExtractionPlan.CreatePlan<List<List<char>>>(@"(?:((\w)+) ?)+");
    //var result = plan.Execute(Regex.Match("The quick brown fox jumps over the lazy dog", @"(?:((\w)+) ?)+"));

    var plan = ExtractionPlan<List<(string, (int?, int?)?, char, string)?>>.CreatePlan(
        new Regex(@"(((\d+)-(\d+)) (.): (.*))"));
}

[Fact]
public void CreateTreePlan()
{
Expand All @@ -387,11 +401,6 @@ public void CreateTreePlan()
var plan2 = ExtractionPlan<List<List<char>>>.CreatePlan(regex);

result = plan2.Extract(regex.Match("The quick brown fox jumps over the lazy dog"));

regex = new Regex(@"(((\d+)-(\d+)) (.): (.*))+");
var plan3 = ExtractionPlan<List<(string, (int?, int?)?, char, string)?>>.CreatePlan(regex);

result = plan3.Extract(regex.Match("2-12 c: abcdefgji"));
}

[Fact]
Expand Down
95 changes: 48 additions & 47 deletions RegExtract/ExtractionPlan.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Text.RegularExpressions;

using RegExtract.ExtractionPlanNodeTypes;
Expand Down Expand Up @@ -44,17 +44,21 @@ internal void InitializePlan(Regex regex)
_tree = new RegexCaptureGroupTree(regex);
Type type = typeof(T);

Plan = AssignTypesToTree_0(_tree.Tree, type);
Plan = AssignTypesToTree(_tree.Tree, type);
}


protected const string VALUETUPLE_TYPENAME = "System.ValueTuple`";
protected const string NULLABLE_TYPENAME = "System.Nullable`";

protected bool IsCollection(Type type)
// We use C#'s definition of an initializable collection, which is any type that implements IEnumerable and has a public Add() method.
// In our case, we also require that the Add() method has parameters of the same type as the collection's generic parameters.
protected bool IsInitializableCollection(Type type)
{
return type.GetInterfaces()
.Any(i => i.IsGenericType && i.GetGenericTypeDefinition() == typeof(ICollection<>));
var genericParameters = type.GetGenericArguments();
var addMethod = type.GetMethod("Add", BindingFlags.Public | BindingFlags.Instance, null, genericParameters, null);

return type.GetInterfaces().Any(i => i == typeof(IEnumerable)) && addMethod != null;
}

protected bool IsTuple(Type type)
Expand Down Expand Up @@ -82,12 +86,6 @@ protected bool IsDirectlyConstructable(Type type)
return true;
}

if (IsCollection(type))
{
type = type.GetGenericArguments().Single();
return !IsCollection(type) && IsDirectlyConstructable(type);
}

if (IsNullable(type))
{
type = type.GetGenericArguments().Single();
Expand Down Expand Up @@ -131,24 +129,6 @@ protected Type[] GetTupleArgumentsList(Type type)
}
}

// Entry point for typing the root of the capture-group tree.
//   - A root with no children is bound as a leaf named "0".
//   - If the target type (after removing a collection wrapper and then a nullable
//     wrapper) is not a tuple, is not a container sized to the numbered groups, and
//     the root has no named groups, the root's single child is typed on its own and
//     wrapped in a VirtualUnaryTupleNode.
//   - Otherwise the root is handed straight to AssignTypesToTree_Recursive.
private ExtractionPlanNode AssignTypesToTree_0(RegexCaptureGroupNode tree, Type type)
{
    // Unwrap in the same order as before: collection element first, then Nullable<>.
    Type coreType = type;
    if (IsCollection(coreType))
    {
        coreType = coreType.GetGenericArguments().Single();
    }
    if (IsNullable(coreType))
    {
        coreType = coreType.GetGenericArguments().Single();
    }

    if (!tree.children.Any())
    {
        return ExtractionPlanNode.BindLeaf("0", type, new ExtractionPlanNode[0], new ExtractionPlanNode[0]);
    }

    // Short-circuits in the same order as the original chain of negated checks.
    bool rootMapsDirectly =
        IsTuple(coreType)
        || IsContainerOfSize(coreType, tree.NumberedGroups.Count())
        || tree.NamedGroups.Any();

    if (rootMapsDirectly)
    {
        return AssignTypesToTree_Recursive(tree, type);
    }

    var onlyChildPlan = AssignTypesToTree_Recursive(tree.children.Single(), type);
    return new VirtualUnaryTupleNode(tree.name, type, new ExtractionPlanNode[] { onlyChildPlan }, new ExtractionPlanNode[0]);
}

ExtractionPlanNode BindPropertyPlan(RegexCaptureGroupNode tree, Type type, string name)
{
if (IsNullable(type))
Expand All @@ -163,16 +143,11 @@ ExtractionPlanNode BindPropertyPlan(RegexCaptureGroupNode tree, Type type, strin

type = property.PropertyType;

return AssignTypesToTree_Recursive(tree, type);
return AssignTypesToTree(tree, type);
}

ExtractionPlanNode BindConstructorPlan(RegexCaptureGroupNode tree, Type type, int paramNum, int paramCount, Stack<RegexCaptureGroupNode>? stack)
{
if (IsCollection(type))
{
type = type.GetGenericArguments().Single();
}

if (IsNullable(type))
{
type = type.GetGenericArguments().Single();
Expand All @@ -181,7 +156,18 @@ ExtractionPlanNode BindConstructorPlan(RegexCaptureGroupNode tree, Type type, in
var constructors = type.GetConstructors()
.Where(cons => cons.GetParameters().Length == paramCount);

if (IsTuple(type))
if (IsInitializableCollection(type))
{
try
{
type = type.GetGenericArguments()[paramNum];
}
catch (IndexOutOfRangeException)
{
throw new ArgumentException($"Capture group '{tree.name}' represents too many parameters for collection {type.FullName}");
}
}
else if (IsTuple(type))
{
try
{
Expand All @@ -206,26 +192,21 @@ ExtractionPlanNode BindConstructorPlan(RegexCaptureGroupNode tree, Type type, in
}
}

return AssignTypesToTree_Recursive(tree, type, stack);
return AssignTypesToTree(tree, type, stack);
}

private ExtractionPlanNode AssignTypesToTree_Recursive(RegexCaptureGroupNode tree, Type type, Stack<RegexCaptureGroupNode>? stack = null)
private ExtractionPlanNode AssignTypesToTree(RegexCaptureGroupNode tree, Type type, Stack<RegexCaptureGroupNode>? stack = null)
{
var unwrappedType = IsCollection(type) ? type.GetGenericArguments().Single() : type;
unwrappedType = IsNullable(unwrappedType) ? unwrappedType.GetGenericArguments().Single() : unwrappedType;
var unwrappedType = IsNullable(type) ? type.GetGenericArguments().Single() : type;

List<ExtractionPlanNode> groups = new();
List<ExtractionPlanNode> namedgroups = new();

if (!tree.children.Any() || IsDirectlyConstructable(type))
if (IsDirectlyConstructable(type))
{
if (tree.children.Any())
{
if (stack == null) throw new ArgumentException("Leftover branch in Rx subtree but no tuple with extra slots to receive it.");
foreach (var child in tree.children.Reverse())
{
stack.Push(child);
}
return new VirtualUnaryTupleNode(tree.children.Single().name, type, new[] { AssignTypesToTree(tree.children.Single(), type) }, new ExtractionPlanNode[0]);
}
return ExtractionPlanNode.BindLeaf(tree.name, type, groups.ToArray(), namedgroups.ToArray());
}
Expand All @@ -249,6 +230,27 @@ private ExtractionPlanNode AssignTypesToTree_Recursive(RegexCaptureGroupNode tre
}
}
}
else if (IsInitializableCollection(type))
{
var typeParams = type.GetGenericArguments();

if (tree.name == "0")
{
return new VirtualUnaryTupleNode(tree.name, type, new[] { AssignTypesToTree(tree.children.Single(), type) }, new ExtractionPlanNode[0]);
}

if (typeParams.Length < 2)
{
return ExtractionPlanNode.Bind(tree.name, type, new[] { BindConstructorPlan(tree, type, 0, 1, stack) }, new ExtractionPlanNode[0]);
}

foreach (var node in tree.children)
{
var plan = BindConstructorPlan(node, type, groups.Count, tree.NumberedGroups.Count(), stack);
groups.Add(plan);
}
// TODO: assert that there are no named groups
}
else
{
foreach (var node in tree.children)
Expand All @@ -262,7 +264,6 @@ private ExtractionPlanNode AssignTypesToTree_Recursive(RegexCaptureGroupNode tre
{
namedgroups.Add(BindPropertyPlan(node, type, node.name));
}

}
}

Expand Down
Loading

0 comments on commit 29e721f

Please sign in to comment.