-
Notifications
You must be signed in to change notification settings - Fork 384
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #247 from ddosify/develop
implement html extraction feature
- Loading branch information
Showing
11 changed files
with
273 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
package extraction | ||
|
||
import ( | ||
"bytes" | ||
"fmt" | ||
|
||
"github.com/antchfx/htmlquery" | ||
) | ||
|
||
// htmlExtractor pulls values out of HTML documents by evaluating XPath
// expressions against them. It has no fields, so the zero value is ready
// to use.
type htmlExtractor struct{}
|
||
func (xe htmlExtractor) extractFromByteSlice(source []byte, xPath string) (interface{}, error) { | ||
reader := bytes.NewBuffer(source) | ||
rootNode, err := htmlquery.Parse(reader) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
// returns the first matched element | ||
foundNode, err := htmlquery.Query(rootNode, xPath) | ||
if foundNode == nil || err != nil { | ||
return nil, fmt.Errorf("no match for the xPath_html: %s", xPath) | ||
} | ||
|
||
return foundNode.FirstChild.Data, nil | ||
} | ||
|
||
func (xe htmlExtractor) extractFromString(source string, xPath string) (interface{}, error) { | ||
reader := bytes.NewBufferString(source) | ||
rootNode, err := htmlquery.Parse(reader) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
// returns the first matched element | ||
foundNode, err := htmlquery.Query(rootNode, xPath) | ||
if foundNode == nil || err != nil { | ||
return nil, fmt.Errorf("no match for this xpath_html") | ||
} | ||
|
||
return foundNode.FirstChild.Data, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
package extraction | ||
|
||
import ( | ||
"fmt" | ||
"strings" | ||
"testing" | ||
) | ||
|
||
func TestHtmlExtraction(t *testing.T) { | ||
expected := "Html Title" | ||
HtmlSource := fmt.Sprintf(`<!DOCTYPE html> | ||
<html> | ||
<body> | ||
<h1>%s</h1> | ||
<p>My first paragraph.</p> | ||
</body> | ||
</html>`, expected) | ||
|
||
xe := htmlExtractor{} | ||
xpath := "//body/h1" | ||
val, err := xe.extractFromByteSlice([]byte(HtmlSource), xpath) | ||
|
||
if err != nil { | ||
t.Errorf("TestHtmlExtraction %v", err) | ||
} | ||
|
||
if !strings.EqualFold(val.(string), expected) { | ||
t.Errorf("TestHtmlExtraction expected: %s, got: %s", expected, val) | ||
} | ||
} | ||
|
||
func TestHtmlExtractionSeveralNode(t *testing.T) { | ||
//should extract only the first one | ||
expected := "Html Title" | ||
HtmlSource := fmt.Sprintf(`<!DOCTYPE html> | ||
<html> | ||
<body> | ||
<h1>%s</h1> | ||
<h1>another node</h1> | ||
<p>My first paragraph.</p> | ||
</body> | ||
</html>`, expected) | ||
|
||
xe := htmlExtractor{} | ||
xpath := "//h1" | ||
val, err := xe.extractFromByteSlice([]byte(HtmlSource), xpath) | ||
|
||
if err != nil { | ||
t.Errorf("TestHtmlExtraction %v", err) | ||
} | ||
|
||
if !strings.EqualFold(val.(string), expected) { | ||
t.Errorf("TestHtmlExtraction expected: %s, got: %s", expected, val) | ||
} | ||
} | ||
|
||
func TestHtmlExtraction_PathNotFound(t *testing.T) { | ||
expected := "XML Title" | ||
xmlSource := fmt.Sprintf(`<!DOCTYPE html> | ||
<html> | ||
<body> | ||
<h1>%s</h1> | ||
<h1>another node</h1> | ||
<p>My first paragraph.</p> | ||
</body> | ||
</html>`, expected) | ||
|
||
xe := htmlExtractor{} | ||
xpath := "//h2" | ||
_, err := xe.extractFromByteSlice([]byte(xmlSource), xpath) | ||
|
||
if err == nil { | ||
t.Errorf("TestHtmlExtraction_PathNotFound, should be err, got :%v", err) | ||
} | ||
} | ||
|
||
func TestInvalidHtml(t *testing.T) { | ||
xmlSource := `invalid html source` | ||
|
||
xe := htmlExtractor{} | ||
xpath := "//input" | ||
_, err := xe.extractFromByteSlice([]byte(xmlSource), xpath) | ||
|
||
if err == nil { | ||
t.Errorf("TestInvalidXml, should be err, got :%v", err) | ||
} | ||
} | ||
|
||
func TestHtmlComplexExtraction(t *testing.T) { | ||
expected := "Html Title" | ||
HtmlSource := fmt.Sprintf(`<!DOCTYPE html> | ||
<html> | ||
<body> | ||
<script> | ||
if (typeof resourceLoadedSuccessfully === "function") { | ||
resourceLoadedSuccessfully(); | ||
} | ||
$(() => { | ||
typeof cssVars === "function" && cssVars({onlyLegacy: true}); | ||
}) | ||
var trackGeoLocation = false; | ||
alert('#@=$*€'); | ||
</script> | ||
<h1>%s</h1> | ||
<p>My first paragraph.</p> | ||
</body> | ||
</html>`, expected) | ||
|
||
xe := htmlExtractor{} | ||
xpath := "//body/h1" | ||
val, err := xe.extractFromByteSlice([]byte(HtmlSource), xpath) | ||
|
||
if err != nil { | ||
t.Errorf("TestHtmlExtraction %v", err) | ||
} | ||
|
||
if !strings.EqualFold(val.(string), expected) { | ||
t.Errorf("TestHtmlExtraction expected: %s, got: %s", expected, val) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.