Skip to content

Commit

Permalink
Adjust span capturing so that it includes span contents even for non-color styles (such as fonts), but otherwise ignores said styles.
Browse files Browse the repository at this point in the history
Make color attribute extraction regex-based, so that it no longer fails when
the span's style value does not begin with "color:".
Update all adapters.
  • Loading branch information
Kinematics committed May 12, 2015
1 parent 13fc0c9 commit 497fa39
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 12 deletions.
13 changes: 10 additions & 3 deletions TallyCore/Adapters/XenForoAdapter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ private string GetBaseSite(string site)
// Bad characters we want to remove
// \u200b = Zero width space (8203 decimal/html). Trim() does not remove this character.
readonly Regex badCharactersRegex = new Regex("\u200b");
// Extracts the color value from a span's inline style attribute.
// Matches "color" at a word boundary, optional whitespace, ':', optional whitespace,
// then captures the following run of word characters in the named group "color".
// Matching is case-insensitive.
// NOTE(review): \b also matches right after '-', so "background-color: x" is matched too,
// and \w+ cannot start with '#', so hex values such as "#ff0000" yield no match — confirm intended.
readonly Regex spanColorRegex = new Regex(@"\bcolor\s*:\s*(?<color>\w+)", RegexOptions.IgnoreCase);


#region Public interface functions
Expand Down Expand Up @@ -635,16 +637,21 @@ private string ExtractNodeText(HtmlNode node)
sb.Append("[/u]");
break;
case "span":
// Keep any COLOR styles; ignore anything else, but keep the content
string spanStyle = childNode.GetAttributeValue("style", "");
if (spanStyle.StartsWith("color:", StringComparison.OrdinalIgnoreCase))
Match m = spanColorRegex.Match(spanStyle);
if (m.Success)
{
string spanColor = spanStyle.Substring("color:".Length).Trim();
sb.Append("[color=");
sb.Append(spanColor);
sb.Append(m.Groups["color"].Value);
sb.Append("]");
sb.Append(childNode.InnerText);
sb.Append("[/color]");
}
else
{
sb.Append(childNode.InnerText);
}
break;
case "a":
sb.Append("[url=\"");
Expand Down
13 changes: 10 additions & 3 deletions TallyCore/Adapters/vBulletinAdapter3.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ public vBulletinAdapter3(string baseSiteName)
// Bad characters we want to remove
// \u200b = Zero width space (8203 decimal/html). Trim() does not remove this character.
readonly Regex badCharactersRegex = new Regex("\u200b");
// Extracts the color value from a span's inline style attribute.
// Matches "color" at a word boundary, optional whitespace, ':', optional whitespace,
// then captures the following run of word characters in the named group "color".
// Matching is case-insensitive.
// NOTE(review): \b also matches right after '-', so "background-color: x" is matched too,
// and \w+ cannot start with '#', so hex values such as "#ff0000" yield no match — confirm intended.
readonly Regex spanColorRegex = new Regex(@"\bcolor\s*:\s*(?<color>\w+)", RegexOptions.IgnoreCase);


#region Public interface functions
Expand Down Expand Up @@ -421,16 +423,21 @@ private string ExtractNodeText(HtmlNode node)
sb.Append("[/u]");
break;
case "span":
// Keep any COLOR styles; ignore anything else, but keep the content
string spanStyle = childNode.GetAttributeValue("style", "");
if (spanStyle.StartsWith("color:", StringComparison.OrdinalIgnoreCase))
Match m = spanColorRegex.Match(spanStyle);
if (m.Success)
{
string spanColor = spanStyle.Substring("color:".Length).Trim();
sb.Append("[color=");
sb.Append(spanColor);
sb.Append(m.Groups["color"].Value);
sb.Append("]");
sb.Append(childNode.InnerText);
sb.Append("[/color]");
}
else
{
sb.Append(childNode.InnerText);
}
break;
case "a":
sb.Append("[url=\"");
Expand Down
13 changes: 10 additions & 3 deletions TallyCore/Adapters/vBulletinAdapter4.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ public vBulletinAdapter4(string site)
// Bad characters we want to remove
// \u200b = Zero width space (8203 decimal/html). Trim() does not remove this character.
readonly Regex badCharactersRegex = new Regex("\u200b");
// Extracts the color value from a span's inline style attribute.
// Matches "color" at a word boundary, optional whitespace, ':', optional whitespace,
// then captures the following run of word characters in the named group "color".
// Matching is case-insensitive.
// NOTE(review): \b also matches right after '-', so "background-color: x" is matched too,
// and \w+ cannot start with '#', so hex values such as "#ff0000" yield no match — confirm intended.
readonly Regex spanColorRegex = new Regex(@"\bcolor\s*:\s*(?<color>\w+)", RegexOptions.IgnoreCase);


#region Public interface functions
Expand Down Expand Up @@ -427,16 +429,21 @@ private string ExtractNodeText(HtmlNode node)
sb.Append("[/u]");
break;
case "span":
// Keep any COLOR styles; ignore anything else, but keep the content
string spanStyle = childNode.GetAttributeValue("style", "");
if (spanStyle.StartsWith("color:", StringComparison.OrdinalIgnoreCase))
Match m = spanColorRegex.Match(spanStyle);
if (m.Success)
{
string spanColor = spanStyle.Substring("color:".Length).Trim();
sb.Append("[color=");
sb.Append(spanColor);
sb.Append(m.Groups["color"].Value);
sb.Append("]");
sb.Append(childNode.InnerText);
sb.Append("[/color]");
}
else
{
sb.Append(childNode.InnerText);
}
break;
case "a":
sb.Append("[url=\"");
Expand Down
13 changes: 10 additions & 3 deletions TallyCore/Adapters/vBulletinAdapter5.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ public vBulletinAdapter5(string site)
// Bad characters we want to remove
// \u200b = Zero width space (8203 decimal/html). Trim() does not remove this character.
readonly Regex badCharactersRegex = new Regex("\u200b");
// Extracts the color value from a span's inline style attribute.
// Matches "color" at a word boundary, optional whitespace, ':', optional whitespace,
// then captures the following run of word characters in the named group "color".
// Matching is case-insensitive.
// NOTE(review): \b also matches right after '-', so "background-color: x" is matched too,
// and \w+ cannot start with '#', so hex values such as "#ff0000" yield no match — confirm intended.
readonly Regex spanColorRegex = new Regex(@"\bcolor\s*:\s*(?<color>\w+)", RegexOptions.IgnoreCase);


#region Public interface functions
Expand Down Expand Up @@ -397,16 +399,21 @@ private string ExtractNodeText(HtmlNode node)
sb.Append("[/u]");
break;
case "span":
// Keep any COLOR styles; ignore anything else, but keep the content
string spanStyle = childNode.GetAttributeValue("style", "");
if (spanStyle.StartsWith("color:", StringComparison.OrdinalIgnoreCase))
Match m = spanColorRegex.Match(spanStyle);
if (m.Success)
{
string spanColor = spanStyle.Substring("color:".Length).Trim();
sb.Append("[color=");
sb.Append(spanColor);
sb.Append(m.Groups["color"].Value);
sb.Append("]");
sb.Append(childNode.InnerText);
sb.Append("[/color]");
}
else
{
sb.Append(childNode.InnerText);
}
break;
case "a":
sb.Append("[url=\"");
Expand Down

0 comments on commit 497fa39

Please sign in to comment.