Skip to content

Commit

Permalink
feat(words): add methods to split text into words
Browse files Browse the repository at this point in the history
  • Loading branch information
joeycumines committed Aug 19, 2019
1 parent 1ecbe3e commit c5c5000
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 18 deletions.
2 changes: 2 additions & 0 deletions README.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ As of v1.0.0 the API is stable and used in multiple (personal) projects. Unless

## Change Log

**2019-08-20** v1.2.0 words methods

**2019-02-14** v1.1.0 classes methods

**2019-02-11** v1.0.0 initial release
30 changes: 28 additions & 2 deletions htmlutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
*/

// Package htmlutil implements a wrapper for Golang's html5 tokeniser / parser implementation, making it much easier to
// find and extract information, aiming to be powerful and intuitive while remaining a minimal and logical extension.
Expand Down Expand Up @@ -153,7 +153,14 @@ func (n Node) OuterHTML() string {

// OuterText builds a string from the data of all text nodes in the sub-tree, starting from and including `n`; the
// pieces gathered from child nodes are appended back-to-back with no separator (contrast with `OuterWords`, which
// joins words using single spaces)
func (n Node) OuterText() string {
return encodeText(n.Data)
return string(encodeText(n.Data))
}

// OuterWords returns the words of the sub-tree rooted at (and including) `n` as one space-separated string. Words
// are the whitespace-separated pieces of each text node's data. Text that is split across multiple elements counts
// as multiple words, and words coming from non-empty sibling elements are separated by a single space.
func (n Node) OuterWords() string {
	words := encodeWords(n.Data)
	return string(words)
}

// InnerHTML builds a string using the outer html of all children matching all filters (see the `FindNode` method)
Expand Down Expand Up @@ -182,6 +189,25 @@ func (n Node) InnerText(filters ...func(node Node) bool) string {
return string(b)
}

// InnerWords joins, using single spaces, the `OuterWords` of every child matching all filters (see the `FindNode`
// method and the `OuterWords` method). Children whose `OuterWords` is empty contribute nothing, so the result never
// contains leading, trailing, or doubled separators.
func (n Node) InnerWords(filters ...func(node Node) bool) string {
	var buf []byte
	// collect is handed to Range; it always returns true, as the original callback did.
	collect := func(_ int, child Node) bool {
		words := child.OuterWords()
		if words == `` {
			return true
		}
		// only separate from previously collected words, never lead with a space
		if len(buf) > 0 {
			buf = append(buf, ' ')
		}
		buf = append(buf, words...)
		return true
	}
	n.Range(collect, filters...)
	return string(buf)
}

// SiblingIndex returns the total number of previous siblings matching any filters (see the `FindNode` method)
func (n Node) SiblingIndex(filters ...func(node Node) bool) int {
return siblingIndex(n, filters...)
Expand Down
34 changes: 32 additions & 2 deletions htmlutil_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
*/

package htmlutil

Expand Down Expand Up @@ -339,11 +339,27 @@ func TestEncodeHTML_panic(t *testing.T) {
}

// TestEncodeText_nil checks what encodeText returns for a nil node, failing the test with the returned value when it
// is not the expected empty / nil result.
func TestEncodeText_nil(t *testing.T) {
if v := encodeText(nil); v != "" {
if v := encodeText(nil); v != nil {
t.Fatal(v)
}
}

// TestEncodeWords_nil verifies that encodeWords yields a nil slice when handed a nil node.
func TestEncodeWords_nil(t *testing.T) {
	got := encodeWords(nil)
	if got != nil {
		t.Fatal(got)
	}
}

// TestEncodeWords_siblings verifies that words found in sibling elements are joined by exactly one space, and that
// empty elements (including nested empty ones) do not introduce extra separators.
func TestEncodeWords_siblings(t *testing.T) {
	const (
		input = `<div>one</div><div>two</div><div><div><div></div></div></div><div></div><div><div></div><div>three</div></div><div>four</div>`
		want  = `one two three four`
	)
	node, err := Parse(strings.NewReader(input))
	if err != nil {
		t.Fatal(err)
	}
	got := string(encodeWords(node.Data))
	if got != want {
		t.Error(got)
	}
}

func TestParse_eof(t *testing.T) {
reader, _ := io.Pipe()
_ = reader.Close()
Expand Down Expand Up @@ -425,6 +441,20 @@ FOUR !
` {
t.Fatal(v)
}
if v := node.InnerWords(); v != `ONE TWO THREE FOUR !` {
t.Fatal(v)
}
if v := node.InnerWords(func(node Node) bool {
return node.Offset() == 0 &&
node.Type() == html.TextNode
}); v != `FOUR !` {
t.Fatal(v)
}
if v := node.InnerWords(func(node Node) bool {
return node.Offset() == 100
}); v != `` {
t.Fatal(v)
}
}

func TestNode_GetAttr_caseInsensitive(t *testing.T) {
Expand Down
46 changes: 33 additions & 13 deletions internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
*/

package htmlutil

Expand Down Expand Up @@ -154,7 +154,18 @@ func getNode(node Node, filters ...func(node Node) bool) Node {
return result
}

func encodeTextBytes(node *html.Node) []byte {
// encodeHTML returns the output of html.Render for node as a string, or the empty string when node is nil. It
// panics with the error reported by html.Render if rendering fails.
func encodeHTML(node *html.Node) string {
	if node == nil {
		return ""
	}
	var out bytes.Buffer
	err := html.Render(&out, node)
	if err != nil {
		panic(err)
	}
	return out.String()
}

func encodeText(node *html.Node) []byte {
if node == nil {
return nil
}
Expand All @@ -163,24 +174,33 @@ func encodeTextBytes(node *html.Node) []byte {
}
var b []byte
for node := node.FirstChild; node != nil; node = node.NextSibling {
b = append(b, encodeTextBytes(node)...)
b = append(b, encodeText(node)...)
}
return b
}

// encodeText converts the bytes produced by encodeTextBytes for node into a string.
func encodeText(node *html.Node) string {
	text := encodeTextBytes(node)
	return string(text)
}

func encodeHTML(node *html.Node) string {
// encodeWords returns the words found in the sub-tree rooted at node, joined by single spaces; a nil node yields a
// nil result. For a text node the words are the `strings.Fields` of its data; for any other node the non-empty
// results of its children are joined in sibling order.
func encodeWords(node *html.Node) (b []byte) {
if node == nil {
return ""
return
}
buffer := new(bytes.Buffer)
if err := html.Render(buffer, node); err != nil {
panic(err)
if node.Type == html.TextNode {
for _, word := range strings.Fields(node.Data) {
// only insert a separator once something has been written, so the result never starts with a space
if len(b) != 0 {
b = append(b, ' ')
}
b = append(b, []byte(word)...)
}
return
}
return buffer.String()
for node := node.FirstChild; node != nil; node = node.NextSibling {
// children that produced no words are skipped entirely, so empty elements never add a separator
if words := encodeWords(node); len(words) != 0 {
if len(b) != 0 {
b = append(b, ' ')
}
b = append(b, words...)
}
}
return
}

func getAttr(namespace string, key string, attributes ...html.Attribute) (html.Attribute, bool) {
Expand Down
2 changes: 1 addition & 1 deletion internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
*/

package htmlutil

Expand Down

0 comments on commit c5c5000

Please sign in to comment.