Skip to content

Commit

Permalink
add targeturlselector test cases
Browse files Browse the repository at this point in the history
  • Loading branch information
Lewis Zou committed Feb 15, 2017
1 parent 78a3630 commit 979555f
Show file tree
Hide file tree
Showing 6 changed files with 218 additions and 5 deletions.
33 changes: 33 additions & 0 deletions src/DotnetSpider2.Core/Processor/BasePageProcessor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using System.Text.RegularExpressions;
using DotnetSpider.Core.Selector;
using System.Linq;
using System;
#if !NET_CORE
using System.Web;
#else
Expand All @@ -16,6 +17,7 @@ public abstract class BasePageProcessor : IPageProcessor
private List<Regex> _excludeTargetUrlPatterns = new List<Regex>();
private Dictionary<ISelector, List<Regex>> _targetUrlExtractors { get; set; } = new Dictionary<ISelector, List<Regex>>();
private ISelector _imageSelector = Selectors.XPath(".//img/@src");

protected abstract void Handle(Page page);

public void Process(Page page)
Expand Down Expand Up @@ -225,6 +227,37 @@ protected virtual void AddExcludeTargetUrlPattern(params string[] patterns)
}
}

/// <summary>
/// Looks up the URL patterns stored for a region. Only used for test.
/// </summary>
/// <param name="region">XPath expression of the region; it is converted to a selector with <c>Selectors.XPath</c> and that selector is the lookup key.</param>
/// <returns>The pattern list stored under that selector, or <c>null</c> when the dictionary has no such key.</returns>
[Obsolete]
public virtual List<Regex> GetTargetUrlPatterns(string region)
{
    var selector = Selectors.XPath(region);

    // One dictionary probe instead of ContainsKey followed by the indexer.
    // A miss yields null rather than an empty list so that callers can tell
    // "region not registered" apart from "registered without patterns".
    List<Regex> patterns;
    return _targetUrlExtractors.TryGetValue(selector, out patterns) ? patterns : null;
}

/// <summary>
/// Reports whether an entry exists for the given region. Only used for test.
/// </summary>
/// <param name="region">XPath expression of the region to probe.</param>
/// <returns><c>true</c> when the extractor dictionary contains a key equal to the XPath selector built from <paramref name="region"/>; otherwise <c>false</c>.</returns>
[Obsolete]
public virtual bool ContainsTargetUrlRegion(string region)
    => _targetUrlExtractors.ContainsKey(Selectors.XPath(region));

/// <summary>
/// Get the site settings
/// </summary>
Expand Down
1 change: 1 addition & 0 deletions src/DotnetSpider2.Extension.Test/EntitySpiderTest2.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
using DotnetSpider.Extension.Pipeline;
using MySql.Data.MySqlClient;
using Xunit;
using DotnetSpider.Extension.Processor;

namespace DotnetSpider.Extension.Test
{
Expand Down
178 changes: 178 additions & 0 deletions src/DotnetSpider2.Extension.Test/TargetUrlSelectorTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
using DotnetSpider.Core;
using DotnetSpider.Extension.Model;
using DotnetSpider.Extension.Model.Attribute;
using DotnetSpider.Extension.Processor;
using System;
using System.Reflection;
using Xunit;

namespace DotnetSpider.Extension.Test
{
/// <summary>
/// Checks how <c>TargetUrlsSelector</c> attributes declared on entity classes end up as
/// region/pattern entries that an <c>EntityProcessor</c> exposes through
/// <c>GetTargetUrlPatterns</c>.
/// </summary>
public class TargetUrlSelectorTest
{
    // One region, one pattern.
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
    public class Entity14 : ISpiderEntity
    {
    }

    // Two regions sharing one pattern.
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"1111\"]", "//*[@id=\"2222\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
    public class Entity16 : ISpiderEntity
    {
    }

    // One region with two patterns.
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&", @"&page=[0-1]+&" })]
    public class Entity17 : ISpiderEntity
    {
    }

    // Attribute present but neither XPaths nor Patterns supplied.
    [TargetUrlsSelector()]
    public class Entity15 : ISpiderEntity
    {
    }

    // Two regions, two patterns, in a single attribute.
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"1111\"]", "//*[@id=\"2222\"]" }, Patterns = new[] { @"&page=[0-9]+&", @"&page=[0-1]+&" })]
    public class Entity18 : ISpiderEntity
    {
    }

    // Two attributes: different regions, different patterns.
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"2222\"]" }, Patterns = new[] { @"&page=[0-1]+&" })]
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
    public class Entity19 : ISpiderEntity
    {
    }

    // Two attributes: same region, different patterns.
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-1]+&" })]
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
    public class Entity20 : ISpiderEntity
    {
    }

    // Two attributes: same region, same pattern.
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
    public class Entity21 : ISpiderEntity
    {
    }

    // Two attributes: different regions, same pattern.
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"2222\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
    [TargetUrlsSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
    public class Entity22 : ISpiderEntity
    {
    }

    /// <summary>
    /// Parses the entity metadata of <paramref name="entityType"/> and builds an
    /// <c>EntityProcessor</c> for it on a fresh <c>Site</c>.
    /// </summary>
    private static EntityProcessor CreateProcessor(Type entityType)
    {
        var metadata = EntitySpider.ParseEntityMetaData(entityType.GetTypeInfo());
        return new EntityProcessor(new Site(), metadata);
    }

    /// <summary>
    /// Asserts that <paramref name="region"/> is registered on the processor with exactly
    /// <paramref name="expectedPatterns"/>, in that order.
    /// </summary>
    private static void AssertRegionPatterns(EntityProcessor processor, string region, params string[] expectedPatterns)
    {
        // Fetch once; every assertion below inspects the same list.
        var actual = processor.GetTargetUrlPatterns(region);

        Assert.NotNull(actual);
        Assert.Equal(expectedPatterns.Length, actual.Count);
        for (var i = 0; i < expectedPatterns.Length; i++)
        {
            Assert.Equal(expectedPatterns[i], actual[i].ToString());
        }
    }

    [Fact]
    public void TargetUrlsSelector_1Region_1Pattern()
    {
        var processor = CreateProcessor(typeof(Entity14));

        AssertRegionPatterns(processor, "//*[@id=\"1111\"]", @"&page=[0-9]+&");
        Assert.Null(processor.GetTargetUrlPatterns("//*[@id=\"222\"]"));
    }

    [Fact]
    public void TargetUrlsSelector_2Region_1Pattern()
    {
        var processor = CreateProcessor(typeof(Entity16));

        AssertRegionPatterns(processor, "//*[@id=\"1111\"]", @"&page=[0-9]+&");
        AssertRegionPatterns(processor, "//*[@id=\"2222\"]", @"&page=[0-9]+&");
        Assert.Null(processor.GetTargetUrlPatterns("//*[@id=\"3333\"]"));
    }

    [Fact]
    public void TargetUrlsSelector_1Region_2Pattern()
    {
        var processor = CreateProcessor(typeof(Entity17));

        AssertRegionPatterns(processor, "//*[@id=\"1111\"]", @"&page=[0-9]+&", @"&page=[0-1]+&");
        Assert.Null(processor.GetTargetUrlPatterns("//*[@id=\"3333\"]"));
    }

    [Fact]
    public void TargetUrlsSelector_Null()
    {
        // Assert.ThrowsAny fails the test when nothing is thrown. The earlier try/catch form
        // only asserted inside the catch block, so it passed silently if no exception occurred.
        var exception = Assert.ThrowsAny<Exception>(() => { CreateProcessor(typeof(Entity15)); });

        Assert.Equal("Region xpath and patterns should not be null both.", exception.Message);
    }

    [Fact]
    public void TargetUrlsSelector_2Region_2Pattern()
    {
        var processor = CreateProcessor(typeof(Entity18));

        AssertRegionPatterns(processor, "//*[@id=\"1111\"]", @"&page=[0-9]+&", @"&page=[0-1]+&");
        AssertRegionPatterns(processor, "//*[@id=\"2222\"]", @"&page=[0-9]+&", @"&page=[0-1]+&");
        Assert.Null(processor.GetTargetUrlPatterns("//*[@id=\"3333\"]"));
    }

    [Fact]
    public void TargetUrlsSelector_Multi_2Region_2Pattern()
    {
        var processor = CreateProcessor(typeof(Entity19));

        AssertRegionPatterns(processor, "//*[@id=\"1111\"]", @"&page=[0-9]+&");
        AssertRegionPatterns(processor, "//*[@id=\"2222\"]", @"&page=[0-1]+&");
        Assert.Null(processor.GetTargetUrlPatterns("//*[@id=\"3333\"]"));
    }

    [Fact]
    public void TargetUrlsSelector_Multi_2SameRegion_2Pattern()
    {
        var processor = CreateProcessor(typeof(Entity20));

        AssertRegionPatterns(processor, "//*[@id=\"1111\"]", @"&page=[0-1]+&", @"&page=[0-9]+&");
        Assert.Null(processor.GetTargetUrlPatterns("//*[@id=\"3333\"]"));
    }

    [Fact]
    public void TargetUrlsSelector_Multi_2SameRegion_2SamePattern()
    {
        var processor = CreateProcessor(typeof(Entity21));

        AssertRegionPatterns(processor, "//*[@id=\"1111\"]", @"&page=[0-9]+&");
        Assert.Null(processor.GetTargetUrlPatterns("//*[@id=\"3333\"]"));
    }

    [Fact]
    public void TargetUrlsSelector_Multi_2Region_2SamePattern()
    {
        var processor = CreateProcessor(typeof(Entity22));

        AssertRegionPatterns(processor, "//*[@id=\"1111\"]", @"&page=[0-9]+&");
        AssertRegionPatterns(processor, "//*[@id=\"2222\"]", @"&page=[0-9]+&");
        Assert.Null(processor.GetTargetUrlPatterns("//*[@id=\"3333\"]"));
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@
<Compile Include="..\Scheduler\RedisSchedulerTest.cs">
<Link>Scheduler\RedisSchedulerTest.cs</Link>
</Compile>
<Compile Include="..\TargetUrlSelectorTest.cs">
<Link>TargetUrlSelectorTest.cs</Link>
</Compile>
</ItemGroup>
<ItemGroup>
<Folder Include="Properties\" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

namespace DotnetSpider.Extension.Model.Attribute
{
// NOTE(review): this listing appears to interleave the removed and the added lines of a diff
// (two AttributeUsage lines, each property declared twice) — confirm against the actual file.
/// <summary>
/// Class-level attribute carrying two string arrays, <c>XPaths</c> and <c>Patterns</c>.
/// </summary>
[AttributeUsage(AttributeTargets.Class)]
// AllowMultiple = true permits several instances of this attribute on the same class.
[AttributeUsage(AttributeTargets.Class, AllowMultiple = true)]
public class TargetUrlsSelector : System.Attribute
{
// Variant whose properties default to empty arrays.
public string[] XPaths { get; set; } = new string[0];
public string[] Patterns { get; set; } = new string[0];
// Variant without initializers: both properties stay null unless the attribute usage sets them.
// Presumably XPaths name page regions and Patterns are URL regular expressions — verify against the consuming processor.
public string[] XPaths { get; set; }
public string[] Patterns { get; set; }
}
}
2 changes: 0 additions & 2 deletions src/DotnetSpider2.Extension/Processor/EntityProcessor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
using DotnetSpider.Extension.Model;
using Site = DotnetSpider.Core.Site;
using Newtonsoft.Json.Linq;
using DotnetSpider.Core.Selector;
using System.Linq;
using System.Text.RegularExpressions;

namespace DotnetSpider.Extension.Processor
{
Expand Down

0 comments on commit 979555f

Please sign in to comment.