From 6c2387aa2ea98bffd1a9625c54ec00905ab77f90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C7=8Eili=C3=A0ng=20W=C3=A1ng?= Date: Mon, 20 Jan 2014 14:54:22 +0800 Subject: [PATCH] initial commit. --- LICENSE | 22 ++ README.md | 101 +++++ auto_chain.go | 937 ++++++++++++++++++++++++++++++++++++++++++++++ chain.go | 84 +++++ examples/main.go | 54 +++ expr/auto_expr.go | 247 ++++++++++++ expr/bfs.go | 85 +++++ expr/checker.go | 221 +++++++++++ expr/dfs.go | 116 ++++++ expr/getter.go | 90 +++++ expr/util.go | 56 +++ gen/gen.go | 180 +++++++++ gen/main.go | 26 ++ gen/spec.go | 150 ++++++++ gen/util.go | 19 + node.go | 269 +++++++++++++ pretty.go | 365 ++++++++++++++++++ util.go | 13 + 18 files changed, 3035 insertions(+) create mode 100644 LICENSE create mode 100644 README.md create mode 100644 auto_chain.go create mode 100644 chain.go create mode 100644 examples/main.go create mode 100644 expr/auto_expr.go create mode 100644 expr/bfs.go create mode 100644 expr/checker.go create mode 100644 expr/dfs.go create mode 100644 expr/getter.go create mode 100644 expr/util.go create mode 100644 gen/gen.go create mode 100644 gen/main.go create mode 100644 gen/spec.go create mode 100644 gen/util.go create mode 100644 node.go create mode 100644 pretty.go create mode 100644 util.go diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ec936a6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2014, Hǎiliàng Wáng. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..37e3e1a --- /dev/null +++ b/README.md @@ -0,0 +1,101 @@ +html-query: A fluent and functional approach to querying HTML +============================================================= + +html-query is a Go package that provides a fluent and functional interface for +querying HTML. It is based on code.google.com/p/go.net/html. + +Examples +======== +1. A simple example (under "examples" directory) + + r := get(`http://blog.golang.org/index`) + defer r.Close() + root, err := query.Parse(r) + checkError(err) + root.Div(Id("content")).Children(Class("blogtitle")).For(func(item *query.Node) { + href := item.Ahref().Href() + date := item.Span(Class("date")).Text() + tags := item.Span(Class("tags")).Text() + if href != nil { + pn(*href) + } + if date != nil { + pn(*date) + } + if tags != nil { + p(*tags) + } + }) + +2. Generator of html-query (under "gen" directory) +...A large part of html-query is automatically generated from HTML spec. The +...spec is in HTML format. So the generator parses it using html-query itself. + +Design +====== +Here is a simple explanation of the design of html-query. 
+### Functional query expressions +All functional definitions are defined in the html-query/expr package. + +1. Checker and checker composition +...A checker is a function that accepts and conditionally returns a *html.Node. + + type Checker func(*html.Node) *html.Node + +...Here are some checker examples: + + Id("id1") + Class("c1") + Div + Abbr + H1 + H2 + +...Checkers can be combined as boolean expressions: + + And(Id("id1"), Class("c1")) + Or(Class("c1"), Class("c2")) + And(Class("c1"), Not(Class("c2"))) + +2. Checker builder +...A checker builder is a function that returns a checker. "Id", "Class", "And", +..."Or", "Not" shown above are all checker builders. There are also some checker +...builder builders (functions that return a checker builder) defined in +...html-query where needed. + +### Fluent interface +The fluent interface (http://en.wikipedia.org/wiki/Fluent_interface) is defined in +the html-query package. + +1. Root node +...Function Parse returns the root node of an HTML document. + +2. Node finder +...Method Node.Find implements a BFS search for a node, e.g. + + node.Find(Div, Class("id1")) + +...But usually you can write the short form: + + node.Div(Class("id1")) + +3. Attribute getter +...Method Node.Attr can be used to get the value (or a regular expression +...submatch of the value) of a node attribute, e.g. + + node.Attr("id") + node.Attr("href", "\(.*)") + +...But usually you can write the short form: + + node.Id() + node.Href("\(.*)") + +4. Node iterator +...Method Node.Children and Node.Descendants each return a node iterator +...(NodeIter). Method NodeIter.For can be used to loop through these nodes. + +Alternative +=========== +If you prefer a jQuery-like DSL to the functional approach, you might want to +try goquery: https://github.com/PuerkitoBio/goquery. diff --git a/auto_chain.go b/auto_chain.go new file mode 100644 index 0000000..33aa068 --- /dev/null +++ b/auto_chain.go @@ -0,0 +1,937 @@ +// Copyright 2014, Hǎiliàng Wáng. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package query + +import ( + . "github.com/hailiang/html-query/expr" +) + +func (n *Node) A(cs ...Checker) *Node { + return n.find(A, cs) +} + +func (n *Node) Abbr(cs ...Checker) *Node { + return n.find(Abbr, cs) +} + +func (n *Node) Address(cs ...Checker) *Node { + return n.find(Address, cs) +} + +func (n *Node) Area(cs ...Checker) *Node { + return n.find(Area, cs) +} + +func (n *Node) Article(cs ...Checker) *Node { + return n.find(Article, cs) +} + +func (n *Node) Aside(cs ...Checker) *Node { + return n.find(Aside, cs) +} + +func (n *Node) Audio(cs ...Checker) *Node { + return n.find(Audio, cs) +} + +func (n *Node) B(cs ...Checker) *Node { + return n.find(B, cs) +} + +func (n *Node) Base(cs ...Checker) *Node { + return n.find(Base, cs) +} + +func (n *Node) Bdi(cs ...Checker) *Node { + return n.find(Bdi, cs) +} + +func (n *Node) Bdo(cs ...Checker) *Node { + return n.find(Bdo, cs) +} + +func (n *Node) Blockquote(cs ...Checker) *Node { + return n.find(Blockquote, cs) +} + +func (n *Node) Body(cs ...Checker) *Node { + return n.find(Body, cs) +} + +func (n *Node) Br(cs ...Checker) *Node { + return n.find(Br, cs) +} + +func (n *Node) Button(cs ...Checker) *Node { + return n.find(Button, cs) +} + +func (n *Node) Canvas(cs ...Checker) *Node { + return n.find(Canvas, cs) +} + +func (n *Node) Caption(cs ...Checker) *Node { + return n.find(Caption, cs) +} + +func (n *Node) Cite(cs ...Checker) *Node { + return n.find(Cite, cs) +} + +func (n *Node) Code(cs ...Checker) *Node { + return n.find(Code, cs) +} + +func (n *Node) Col(cs ...Checker) *Node { + return n.find(Col, cs) +} + +func (n *Node) Colgroup(cs ...Checker) *Node { + return n.find(Colgroup, cs) +} + +func (n *Node) Data(cs ...Checker) *Node { + return n.find(Data, cs) +} + +func (n *Node) Datalist(cs ...Checker) *Node { + return n.find(Datalist, cs) +} + +func (n *Node) Dd(cs ...Checker) *Node { + return 
n.find(Dd, cs) +} + +func (n *Node) Del(cs ...Checker) *Node { + return n.find(Del, cs) +} + +func (n *Node) Details(cs ...Checker) *Node { + return n.find(Details, cs) +} + +func (n *Node) Dfn(cs ...Checker) *Node { + return n.find(Dfn, cs) +} + +func (n *Node) Dialog(cs ...Checker) *Node { + return n.find(Dialog, cs) +} + +func (n *Node) Div(cs ...Checker) *Node { + return n.find(Div, cs) +} + +func (n *Node) Dl(cs ...Checker) *Node { + return n.find(Dl, cs) +} + +func (n *Node) Dt(cs ...Checker) *Node { + return n.find(Dt, cs) +} + +func (n *Node) Em(cs ...Checker) *Node { + return n.find(Em, cs) +} + +func (n *Node) Embed(cs ...Checker) *Node { + return n.find(Embed, cs) +} + +func (n *Node) Fieldset(cs ...Checker) *Node { + return n.find(Fieldset, cs) +} + +func (n *Node) Figcaption(cs ...Checker) *Node { + return n.find(Figcaption, cs) +} + +func (n *Node) Figure(cs ...Checker) *Node { + return n.find(Figure, cs) +} + +func (n *Node) Footer(cs ...Checker) *Node { + return n.find(Footer, cs) +} + +func (n *Node) Form(cs ...Checker) *Node { + return n.find(Form, cs) +} + +func (n *Node) H1(cs ...Checker) *Node { + return n.find(H1, cs) +} + +func (n *Node) H2(cs ...Checker) *Node { + return n.find(H2, cs) +} + +func (n *Node) H3(cs ...Checker) *Node { + return n.find(H3, cs) +} + +func (n *Node) H4(cs ...Checker) *Node { + return n.find(H4, cs) +} + +func (n *Node) H5(cs ...Checker) *Node { + return n.find(H5, cs) +} + +func (n *Node) H6(cs ...Checker) *Node { + return n.find(H6, cs) +} + +func (n *Node) Head(cs ...Checker) *Node { + return n.find(Head, cs) +} + +func (n *Node) Header(cs ...Checker) *Node { + return n.find(Header, cs) +} + +func (n *Node) Hgroup(cs ...Checker) *Node { + return n.find(Hgroup, cs) +} + +func (n *Node) Hr(cs ...Checker) *Node { + return n.find(Hr, cs) +} + +func (n *Node) Html(cs ...Checker) *Node { + return n.find(Html, cs) +} + +func (n *Node) I(cs ...Checker) *Node { + return n.find(I, cs) +} + +func (n *Node) Iframe(cs 
...Checker) *Node { + return n.find(Iframe, cs) +} + +func (n *Node) Img(cs ...Checker) *Node { + return n.find(Img, cs) +} + +func (n *Node) Input(cs ...Checker) *Node { + return n.find(Input, cs) +} + +func (n *Node) Ins(cs ...Checker) *Node { + return n.find(Ins, cs) +} + +func (n *Node) Kbd(cs ...Checker) *Node { + return n.find(Kbd, cs) +} + +func (n *Node) Keygen(cs ...Checker) *Node { + return n.find(Keygen, cs) +} + +func (n *Node) Label(cs ...Checker) *Node { + return n.find(Label, cs) +} + +func (n *Node) Legend(cs ...Checker) *Node { + return n.find(Legend, cs) +} + +func (n *Node) Li(cs ...Checker) *Node { + return n.find(Li, cs) +} + +func (n *Node) Link(cs ...Checker) *Node { + return n.find(Link, cs) +} + +func (n *Node) Map(cs ...Checker) *Node { + return n.find(Map, cs) +} + +func (n *Node) Mark(cs ...Checker) *Node { + return n.find(Mark, cs) +} + +func (n *Node) Menu(cs ...Checker) *Node { + return n.find(Menu, cs) +} + +func (n *Node) Meta(cs ...Checker) *Node { + return n.find(Meta, cs) +} + +func (n *Node) Meter(cs ...Checker) *Node { + return n.find(Meter, cs) +} + +func (n *Node) Nav(cs ...Checker) *Node { + return n.find(Nav, cs) +} + +func (n *Node) Noscript(cs ...Checker) *Node { + return n.find(Noscript, cs) +} + +func (n *Node) Object(cs ...Checker) *Node { + return n.find(Object, cs) +} + +func (n *Node) Ol(cs ...Checker) *Node { + return n.find(Ol, cs) +} + +func (n *Node) Optgroup(cs ...Checker) *Node { + return n.find(Optgroup, cs) +} + +func (n *Node) Option(cs ...Checker) *Node { + return n.find(Option, cs) +} + +func (n *Node) Output(cs ...Checker) *Node { + return n.find(Output, cs) +} + +func (n *Node) P(cs ...Checker) *Node { + return n.find(P, cs) +} + +func (n *Node) Param(cs ...Checker) *Node { + return n.find(Param, cs) +} + +func (n *Node) Pre(cs ...Checker) *Node { + return n.find(Pre, cs) +} + +func (n *Node) Progress(cs ...Checker) *Node { + return n.find(Progress, cs) +} + +func (n *Node) Q(cs ...Checker) *Node { + 
return n.find(Q, cs) +} + +func (n *Node) Rp(cs ...Checker) *Node { + return n.find(Rp, cs) +} + +func (n *Node) Rt(cs ...Checker) *Node { + return n.find(Rt, cs) +} + +func (n *Node) Ruby(cs ...Checker) *Node { + return n.find(Ruby, cs) +} + +func (n *Node) S(cs ...Checker) *Node { + return n.find(S, cs) +} + +func (n *Node) Samp(cs ...Checker) *Node { + return n.find(Samp, cs) +} + +func (n *Node) Script(cs ...Checker) *Node { + return n.find(Script, cs) +} + +func (n *Node) Section(cs ...Checker) *Node { + return n.find(Section, cs) +} + +func (n *Node) Select(cs ...Checker) *Node { + return n.find(Select, cs) +} + +func (n *Node) Small(cs ...Checker) *Node { + return n.find(Small, cs) +} + +func (n *Node) Source(cs ...Checker) *Node { + return n.find(Source, cs) +} + +func (n *Node) Span(cs ...Checker) *Node { + return n.find(Span, cs) +} + +func (n *Node) Strong(cs ...Checker) *Node { + return n.find(Strong, cs) +} + +func (n *Node) Style(cs ...Checker) *Node { + return n.find(Style, cs) +} + +func (n *Node) Sub(cs ...Checker) *Node { + return n.find(Sub, cs) +} + +func (n *Node) Summary(cs ...Checker) *Node { + return n.find(Summary, cs) +} + +func (n *Node) Sup(cs ...Checker) *Node { + return n.find(Sup, cs) +} + +func (n *Node) Table(cs ...Checker) *Node { + return n.find(Table, cs) +} + +func (n *Node) Tbody(cs ...Checker) *Node { + return n.find(Tbody, cs) +} + +func (n *Node) Td(cs ...Checker) *Node { + return n.find(Td, cs) +} + +func (n *Node) Textarea(cs ...Checker) *Node { + return n.find(Textarea, cs) +} + +func (n *Node) Tfoot(cs ...Checker) *Node { + return n.find(Tfoot, cs) +} + +func (n *Node) Th(cs ...Checker) *Node { + return n.find(Th, cs) +} + +func (n *Node) Thead(cs ...Checker) *Node { + return n.find(Thead, cs) +} + +func (n *Node) Time(cs ...Checker) *Node { + return n.find(Time, cs) +} + +func (n *Node) Title(cs ...Checker) *Node { + return n.find(Title, cs) +} + +func (n *Node) Tr(cs ...Checker) *Node { + return n.find(Tr, cs) +} + 
+func (n *Node) Track(cs ...Checker) *Node { + return n.find(Track, cs) +} + +func (n *Node) U(cs ...Checker) *Node { + return n.find(U, cs) +} + +func (n *Node) Ul(cs ...Checker) *Node { + return n.find(Ul, cs) +} + +func (n *Node) Var(cs ...Checker) *Node { + return n.find(Var, cs) +} + +func (n *Node) Video(cs ...Checker) *Node { + return n.find(Video, cs) +} + +func (n *Node) Wbr(cs ...Checker) *Node { + return n.find(Wbr, cs) +} + +func (n *Node) Abbr_(pat ...string) *string { + return n.Attr("abbr", pat...) +} + +func (n *Node) Accept(pat ...string) *string { + return n.Attr("accept", pat...) +} + +func (n *Node) AcceptCharset(pat ...string) *string { + return n.Attr("accept-charset", pat...) +} + +func (n *Node) Accesskey(pat ...string) *string { + return n.Attr("accesskey", pat...) +} + +func (n *Node) Action(pat ...string) *string { + return n.Attr("action", pat...) +} + +func (n *Node) Allowfullscreen(pat ...string) *string { + return n.Attr("allowfullscreen", pat...) +} + +func (n *Node) Alt(pat ...string) *string { + return n.Attr("alt", pat...) +} + +func (n *Node) Async(pat ...string) *string { + return n.Attr("async", pat...) +} + +func (n *Node) Autocomplete(pat ...string) *string { + return n.Attr("autocomplete", pat...) +} + +func (n *Node) Autofocus(pat ...string) *string { + return n.Attr("autofocus", pat...) +} + +func (n *Node) Autoplay(pat ...string) *string { + return n.Attr("autoplay", pat...) +} + +func (n *Node) Challenge(pat ...string) *string { + return n.Attr("challenge", pat...) +} + +func (n *Node) Charset(pat ...string) *string { + return n.Attr("charset", pat...) +} + +func (n *Node) Checked(pat ...string) *string { + return n.Attr("checked", pat...) +} + +func (n *Node) Cite_(pat ...string) *string { + return n.Attr("cite", pat...) +} + +func (n *Node) Class(pat ...string) *string { + return n.Attr("class", pat...) +} + +func (n *Node) Cols(pat ...string) *string { + return n.Attr("cols", pat...) 
+} + +func (n *Node) Colspan(pat ...string) *string { + return n.Attr("colspan", pat...) +} + +func (n *Node) Command(pat ...string) *string { + return n.Attr("command", pat...) +} + +func (n *Node) Content(pat ...string) *string { + return n.Attr("content", pat...) +} + +func (n *Node) Contenteditable(pat ...string) *string { + return n.Attr("contenteditable", pat...) +} + +func (n *Node) Contextmenu(pat ...string) *string { + return n.Attr("contextmenu", pat...) +} + +func (n *Node) Controls(pat ...string) *string { + return n.Attr("controls", pat...) +} + +func (n *Node) Coords(pat ...string) *string { + return n.Attr("coords", pat...) +} + +func (n *Node) Crossorigin(pat ...string) *string { + return n.Attr("crossorigin", pat...) +} + +func (n *Node) Data_(pat ...string) *string { + return n.Attr("data", pat...) +} + +func (n *Node) Datetime(pat ...string) *string { + return n.Attr("datetime", pat...) +} + +func (n *Node) Default(pat ...string) *string { + return n.Attr("default", pat...) +} + +func (n *Node) Defer(pat ...string) *string { + return n.Attr("defer", pat...) +} + +func (n *Node) Dir(pat ...string) *string { + return n.Attr("dir", pat...) +} + +func (n *Node) Dirname(pat ...string) *string { + return n.Attr("dirname", pat...) +} + +func (n *Node) Disabled(pat ...string) *string { + return n.Attr("disabled", pat...) +} + +func (n *Node) Download(pat ...string) *string { + return n.Attr("download", pat...) +} + +func (n *Node) Draggable(pat ...string) *string { + return n.Attr("draggable", pat...) +} + +func (n *Node) Dropzone(pat ...string) *string { + return n.Attr("dropzone", pat...) +} + +func (n *Node) Enctype(pat ...string) *string { + return n.Attr("enctype", pat...) +} + +func (n *Node) For(pat ...string) *string { + return n.Attr("for", pat...) +} + +func (n *Node) Form_(pat ...string) *string { + return n.Attr("form", pat...) +} + +func (n *Node) Formaction(pat ...string) *string { + return n.Attr("formaction", pat...) 
+} + +func (n *Node) Formenctype(pat ...string) *string { + return n.Attr("formenctype", pat...) +} + +func (n *Node) Formmethod(pat ...string) *string { + return n.Attr("formmethod", pat...) +} + +func (n *Node) Formnovalidate(pat ...string) *string { + return n.Attr("formnovalidate", pat...) +} + +func (n *Node) Formtarget(pat ...string) *string { + return n.Attr("formtarget", pat...) +} + +func (n *Node) Headers(pat ...string) *string { + return n.Attr("headers", pat...) +} + +func (n *Node) Height(pat ...string) *string { + return n.Attr("height", pat...) +} + +func (n *Node) Hidden(pat ...string) *string { + return n.Attr("hidden", pat...) +} + +func (n *Node) High(pat ...string) *string { + return n.Attr("high", pat...) +} + +func (n *Node) Href(pat ...string) *string { + return n.Attr("href", pat...) +} + +func (n *Node) Hreflang(pat ...string) *string { + return n.Attr("hreflang", pat...) +} + +func (n *Node) HttpEquiv(pat ...string) *string { + return n.Attr("http-equiv", pat...) +} + +func (n *Node) Icon(pat ...string) *string { + return n.Attr("icon", pat...) +} + +func (n *Node) Id(pat ...string) *string { + return n.Attr("id", pat...) +} + +func (n *Node) Inert(pat ...string) *string { + return n.Attr("inert", pat...) +} + +func (n *Node) Inputmode(pat ...string) *string { + return n.Attr("inputmode", pat...) +} + +func (n *Node) Ismap(pat ...string) *string { + return n.Attr("ismap", pat...) +} + +func (n *Node) Itemid(pat ...string) *string { + return n.Attr("itemid", pat...) +} + +func (n *Node) Itemprop(pat ...string) *string { + return n.Attr("itemprop", pat...) +} + +func (n *Node) Itemref(pat ...string) *string { + return n.Attr("itemref", pat...) +} + +func (n *Node) Itemscope(pat ...string) *string { + return n.Attr("itemscope", pat...) +} + +func (n *Node) Itemtype(pat ...string) *string { + return n.Attr("itemtype", pat...) +} + +func (n *Node) Keytype(pat ...string) *string { + return n.Attr("keytype", pat...) 
+} + +func (n *Node) Kind(pat ...string) *string { + return n.Attr("kind", pat...) +} + +func (n *Node) Label_(pat ...string) *string { + return n.Attr("label", pat...) +} + +func (n *Node) Lang(pat ...string) *string { + return n.Attr("lang", pat...) +} + +func (n *Node) List(pat ...string) *string { + return n.Attr("list", pat...) +} + +func (n *Node) Loop(pat ...string) *string { + return n.Attr("loop", pat...) +} + +func (n *Node) Low(pat ...string) *string { + return n.Attr("low", pat...) +} + +func (n *Node) Manifest(pat ...string) *string { + return n.Attr("manifest", pat...) +} + +func (n *Node) Max(pat ...string) *string { + return n.Attr("max", pat...) +} + +func (n *Node) Maxlength(pat ...string) *string { + return n.Attr("maxlength", pat...) +} + +func (n *Node) Media(pat ...string) *string { + return n.Attr("media", pat...) +} + +func (n *Node) Mediagroup(pat ...string) *string { + return n.Attr("mediagroup", pat...) +} + +func (n *Node) Menu_(pat ...string) *string { + return n.Attr("menu", pat...) +} + +func (n *Node) Method(pat ...string) *string { + return n.Attr("method", pat...) +} + +func (n *Node) Min(pat ...string) *string { + return n.Attr("min", pat...) +} + +func (n *Node) Minlength(pat ...string) *string { + return n.Attr("minlength", pat...) +} + +func (n *Node) Multiple(pat ...string) *string { + return n.Attr("multiple", pat...) +} + +func (n *Node) Muted(pat ...string) *string { + return n.Attr("muted", pat...) +} + +func (n *Node) Name(pat ...string) *string { + return n.Attr("name", pat...) +} + +func (n *Node) Novalidate(pat ...string) *string { + return n.Attr("novalidate", pat...) +} + +func (n *Node) Open(pat ...string) *string { + return n.Attr("open", pat...) +} + +func (n *Node) Optimum(pat ...string) *string { + return n.Attr("optimum", pat...) +} + +func (n *Node) Pattern(pat ...string) *string { + return n.Attr("pattern", pat...) +} + +func (n *Node) Ping(pat ...string) *string { + return n.Attr("ping", pat...) 
+} + +func (n *Node) Placeholder(pat ...string) *string { + return n.Attr("placeholder", pat...) +} + +func (n *Node) Poster(pat ...string) *string { + return n.Attr("poster", pat...) +} + +func (n *Node) Preload(pat ...string) *string { + return n.Attr("preload", pat...) +} + +func (n *Node) Radiogroup(pat ...string) *string { + return n.Attr("radiogroup", pat...) +} + +func (n *Node) Readonly(pat ...string) *string { + return n.Attr("readonly", pat...) +} + +func (n *Node) Rel(pat ...string) *string { + return n.Attr("rel", pat...) +} + +func (n *Node) Required(pat ...string) *string { + return n.Attr("required", pat...) +} + +func (n *Node) Reversed(pat ...string) *string { + return n.Attr("reversed", pat...) +} + +func (n *Node) Rows(pat ...string) *string { + return n.Attr("rows", pat...) +} + +func (n *Node) Rowspan(pat ...string) *string { + return n.Attr("rowspan", pat...) +} + +func (n *Node) Sandbox(pat ...string) *string { + return n.Attr("sandbox", pat...) +} + +func (n *Node) Scope(pat ...string) *string { + return n.Attr("scope", pat...) +} + +func (n *Node) Scoped(pat ...string) *string { + return n.Attr("scoped", pat...) +} + +func (n *Node) Seamless(pat ...string) *string { + return n.Attr("seamless", pat...) +} + +func (n *Node) Selected(pat ...string) *string { + return n.Attr("selected", pat...) +} + +func (n *Node) Shape(pat ...string) *string { + return n.Attr("shape", pat...) +} + +func (n *Node) Size(pat ...string) *string { + return n.Attr("size", pat...) +} + +func (n *Node) Sizes(pat ...string) *string { + return n.Attr("sizes", pat...) +} + +func (n *Node) Sortable(pat ...string) *string { + return n.Attr("sortable", pat...) +} + +func (n *Node) Sorted(pat ...string) *string { + return n.Attr("sorted", pat...) +} + +func (n *Node) Span_(pat ...string) *string { + return n.Attr("span", pat...) +} + +func (n *Node) Spellcheck(pat ...string) *string { + return n.Attr("spellcheck", pat...) 
+}
+
+func (n *Node) Src(pat ...string) *string {
+	return n.Attr("src", pat...)
+}
+
+func (n *Node) Srcdoc(pat ...string) *string {
+	return n.Attr("srcdoc", pat...)
+}
+
+func (n *Node) Srclang(pat ...string) *string {
+	return n.Attr("srclang", pat...)
+}
+
+func (n *Node) Srcset(pat ...string) *string {
+	return n.Attr("srcset", pat...)
+}
+
+func (n *Node) Start(pat ...string) *string {
+	return n.Attr("start", pat...)
+}
+
+func (n *Node) Step(pat ...string) *string {
+	return n.Attr("step", pat...)
+}
+
+func (n *Node) Style_(pat ...string) *string {
+	return n.Attr("style", pat...)
+}
+
+func (n *Node) Tabindex(pat ...string) *string {
+	return n.Attr("tabindex", pat...)
+}
+
+func (n *Node) Target(pat ...string) *string {
+	return n.Attr("target", pat...)
+}
+
+func (n *Node) Title_(pat ...string) *string {
+	return n.Attr("title", pat...)
+}
+
+func (n *Node) Translate(pat ...string) *string {
+	return n.Attr("translate", pat...)
+}
+
+func (n *Node) Type(pat ...string) *string {
+	return n.Attr("type", pat...)
+}
+
+func (n *Node) Typemustmatch(pat ...string) *string {
+	return n.Attr("typemustmatch", pat...)
+}
+
+func (n *Node) Usemap(pat ...string) *string {
+	return n.Attr("usemap", pat...)
+}
+
+func (n *Node) Value(pat ...string) *string {
+	return n.Attr("value", pat...)
+}
+
+func (n *Node) Width(pat ...string) *string {
+	return n.Attr("width", pat...)
+}
+
+func (n *Node) Wrap(pat ...string) *string {
+	return n.Attr("wrap", pat...)
+}
diff --git a/chain.go b/chain.go
new file mode 100644
index 0000000..ba4baf1
--- /dev/null
+++ b/chain.go
@@ -0,0 +1,117 @@
+// Copyright 2014, Hǎiliàng Wáng. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package query
+
+import (
+	. "github.com/hailiang/html-query/expr"
+	"regexp"
+)
+
+// Is reports whether the conjunction And(cs...) returns a non-nil node when
+// applied to n's underlying node.
+func (n *Node) Is(cs ...Checker) bool {
+	return And(cs...)(n.InternalNode()) != nil
+}
+
+// Find runs the finder Find(cs...) on n and wraps its result with NewNode.
+// A nil receiver yields nil, as in the other chain methods, so calling Find
+// on the result of a failed lookup does not panic.
+func (n *Node) Find(cs ...Checker) *Node {
+	if n == nil {
+		return nil
+	}
+	return NewNode(Find(cs...)(&n.n))
+}
+
+// FindChild runs the finder FindChild(cs...) on n and wraps its result with
+// NewNode. A nil receiver yields nil.
+func (n *Node) FindChild(cs ...Checker) *Node {
+	if n == nil {
+		return nil
+	}
+	return NewNode(FindChild(cs...)(&n.n))
+}
+
+// find calls Find with c placed in front of cs. The element methods in
+// auto_chain.go (A, Div, Span, ...) are thin wrappers around it.
+func (n *Node) find(c Checker, cs []Checker) *Node {
+	if n == nil {
+		return nil
+	}
+	return n.Find(also(c, cs)...)
+}
+
+// NextSibling wraps the result of NextSibling(n) with NewNode; a nil
+// receiver yields nil.
+func (n *Node) NextSibling() *Node {
+	if n == nil {
+		return nil
+	}
+	return NewNode(NextSibling(&n.n))
+}
+
+// PrevSibling wraps the result of PrevSibling(n) with NewNode; a nil
+// receiver yields nil.
+func (n *Node) PrevSibling() *Node {
+	if n == nil {
+		return nil
+	}
+	return NewNode(PrevSibling(&n.n))
+}
+
+// Parent wraps the result of Parent(n) with NewNode; a nil receiver yields
+// nil.
+func (n *Node) Parent() *Node {
+	if n == nil {
+		return nil
+	}
+	return NewNode(Parent(&n.n))
+}
+
+// Children returns a NodeIter over Children(n, cs...). For a nil receiver
+// the returned NodeIter wraps nil.
+func (n *Node) Children(cs ...Checker) NodeIter {
+	if n == nil {
+		return NodeIter{nil}
+	}
+	return NodeIter{Children(&n.n, cs...)}
+}
+
+// Descendants returns a NodeIter over Descendants(n, cs...). For a nil
+// receiver the returned NodeIter wraps nil.
+func (n *Node) Descendants(cs ...Checker) NodeIter {
+	if n == nil {
+		return NodeIter{nil}
+	}
+	return NodeIter{Descendants(&n.n, cs...)}
+}
+
+// Ahref finds a node accepted by the Ahref checker and by cs.
+// NOTE(review): Ahref is defined in package expr; presumably it matches an
+// <a> element that has an href attribute — confirm there.
+func (n *Node) Ahref(cs ...Checker) *Node {
+	if n == nil {
+		return nil
+	}
+	return n.find(Ahref, cs)
+}
+
+// TextNode compiles pat, finds a node accepted by the TextNode and Text_(rx)
+// checkers and passes the result, with the compiled pattern, to
+// NewTextNodeNode. A nil receiver yields nil. It panics if pat is not a
+// valid regular expression (regexp.MustCompile).
+func (n *Node) TextNode(pat string) *TextNodeNode {
+	if n == nil {
+		return nil
+	}
+	rx := regexp.MustCompile(pat)
+	cs := []Checker{Text_(rx)}
+	return NewTextNodeNode(n.find(TextNode, cs), rx)
+}
+
+// also returns a new slice holding c followed by the elements of cs.
+func also(c Checker, cs []Checker) []Checker {
+	return append([]Checker{c}, cs...)
+}
diff --git a/examples/main.go b/examples/main.go
new file mode 100644
index 0000000..784f22c
--- /dev/null
+++ b/examples/main.go
@@ -0,0 +1,54 @@
+// Copyright 2014, Hǎiliàng Wáng. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"fmt"
+	"github.com/hailiang/html-query"
+	. 
"github.com/hailiang/html-query/expr"
+	"io"
+	"net/http"
+)
+
+func main() { // Fetches the Go blog index, parses it and prints href, date and tags of each matching item.
+	r := get(`http://blog.golang.org/index`)
+	defer r.Close()
+	root, err := query.Parse(r)
+	checkError(err)
+	root.Div(Id("content")).Children(Class("blogtitle")).For(func(item *query.Node) {
+		href := item.Ahref().Href() // *string; nil-checked below before it is dereferenced
+		date := item.Span(Class("date")).Text()
+		tags := item.Span(Class("tags")).Text()
+		if href != nil {
+			pn(*href)
+		}
+		if date != nil {
+			pn(*date)
+		}
+		if tags != nil {
+			p(*tags) // p ends the output line for this item
+		}
+	})
+}
+
+func get(url string) io.ReadCloser { // Returns the body of a GET on url (the caller closes it); panics via checkError on a request error. NOTE(review): resp.StatusCode is not checked.
+	resp, err := http.Get(url)
+	checkError(err)
+	return resp.Body
+}
+
+func checkError(err error) { // Panics with err when it is non-nil.
+	if err != nil {
+		panic(err)
+	}
+}
+
+func pn(v ...interface{}) { // Prints v with fmt.Print, i.e. WITHOUT a trailing newline.
+	fmt.Print(v...)
+}
+
+func p(v ...interface{}) { // Prints v with fmt.Println, i.e. followed by a newline.
+	fmt.Println(v...)
+}
diff --git a/expr/auto_expr.go b/expr/auto_expr.go
new file mode 100644
index 0000000..119eb59
--- /dev/null
+++ b/expr/auto_expr.go
@@ -0,0 +1,247 @@
+// Copyright 2014, Hǎiliàng Wáng. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+ +package expr + +import ( + "code.google.com/p/go.net/html/atom" +) + +var ( + A = ElementChecker(atom.A) + Abbr = ElementChecker(atom.Abbr) + Address = ElementChecker(atom.Address) + Area = ElementChecker(atom.Area) + Article = ElementChecker(atom.Article) + Aside = ElementChecker(atom.Aside) + Audio = ElementChecker(atom.Audio) + B = ElementChecker(atom.B) + Base = ElementChecker(atom.Base) + Bdi = ElementChecker(atom.Bdi) + Bdo = ElementChecker(atom.Bdo) + Blockquote = ElementChecker(atom.Blockquote) + Body = ElementChecker(atom.Body) + Br = ElementChecker(atom.Br) + Button = ElementChecker(atom.Button) + Canvas = ElementChecker(atom.Canvas) + Caption = ElementChecker(atom.Caption) + Cite = ElementChecker(atom.Cite) + Code = ElementChecker(atom.Code) + Col = ElementChecker(atom.Col) + Colgroup = ElementChecker(atom.Colgroup) + Data = ElementChecker(atom.Data) + Datalist = ElementChecker(atom.Datalist) + Dd = ElementChecker(atom.Dd) + Del = ElementChecker(atom.Del) + Details = ElementChecker(atom.Details) + Dfn = ElementChecker(atom.Dfn) + Dialog = ElementChecker(atom.Dialog) + Div = ElementChecker(atom.Div) + Dl = ElementChecker(atom.Dl) + Dt = ElementChecker(atom.Dt) + Em = ElementChecker(atom.Em) + Embed = ElementChecker(atom.Embed) + Fieldset = ElementChecker(atom.Fieldset) + Figcaption = ElementChecker(atom.Figcaption) + Figure = ElementChecker(atom.Figure) + Footer = ElementChecker(atom.Footer) + Form = ElementChecker(atom.Form) + H1 = ElementChecker(atom.H1) + H2 = ElementChecker(atom.H2) + H3 = ElementChecker(atom.H3) + H4 = ElementChecker(atom.H4) + H5 = ElementChecker(atom.H5) + H6 = ElementChecker(atom.H6) + Head = ElementChecker(atom.Head) + Header = ElementChecker(atom.Header) + Hgroup = ElementChecker(atom.Hgroup) + Hr = ElementChecker(atom.Hr) + Html = ElementChecker(atom.Html) + I = ElementChecker(atom.I) + Iframe = ElementChecker(atom.Iframe) + Img = ElementChecker(atom.Img) + Input = ElementChecker(atom.Input) + Ins = ElementChecker(atom.Ins) 
+ Kbd = ElementChecker(atom.Kbd) + Keygen = ElementChecker(atom.Keygen) + Label = ElementChecker(atom.Label) + Legend = ElementChecker(atom.Legend) + Li = ElementChecker(atom.Li) + Link = ElementChecker(atom.Link) + Map = ElementChecker(atom.Map) + Mark = ElementChecker(atom.Mark) + Menu = ElementChecker(atom.Menu) + Meta = ElementChecker(atom.Meta) + Meter = ElementChecker(atom.Meter) + Nav = ElementChecker(atom.Nav) + Noscript = ElementChecker(atom.Noscript) + Object = ElementChecker(atom.Object) + Ol = ElementChecker(atom.Ol) + Optgroup = ElementChecker(atom.Optgroup) + Option = ElementChecker(atom.Option) + Output = ElementChecker(atom.Output) + P = ElementChecker(atom.P) + Param = ElementChecker(atom.Param) + Pre = ElementChecker(atom.Pre) + Progress = ElementChecker(atom.Progress) + Q = ElementChecker(atom.Q) + Rp = ElementChecker(atom.Rp) + Rt = ElementChecker(atom.Rt) + Ruby = ElementChecker(atom.Ruby) + S = ElementChecker(atom.S) + Samp = ElementChecker(atom.Samp) + Script = ElementChecker(atom.Script) + Section = ElementChecker(atom.Section) + Select = ElementChecker(atom.Select) + Small = ElementChecker(atom.Small) + Source = ElementChecker(atom.Source) + Span = ElementChecker(atom.Span) + Strong = ElementChecker(atom.Strong) + Style = ElementChecker(atom.Style) + Sub = ElementChecker(atom.Sub) + Summary = ElementChecker(atom.Summary) + Sup = ElementChecker(atom.Sup) + Table = ElementChecker(atom.Table) + Tbody = ElementChecker(atom.Tbody) + Td = ElementChecker(atom.Td) + Textarea = ElementChecker(atom.Textarea) + Tfoot = ElementChecker(atom.Tfoot) + Th = ElementChecker(atom.Th) + Thead = ElementChecker(atom.Thead) + Time = ElementChecker(atom.Time) + Title = ElementChecker(atom.Title) + Tr = ElementChecker(atom.Tr) + Track = ElementChecker(atom.Track) + U = ElementChecker(atom.U) + Ul = ElementChecker(atom.Ul) + Var = ElementChecker(atom.Var) + Video = ElementChecker(atom.Video) + Wbr = ElementChecker(atom.Wbr) +) + +var ( + Abbr_ = AttrChecker("abbr") 
+ Accept = AttrChecker("accept") + AcceptCharset = SeperatedAttrChecker("accept-charset", ' ') + Accesskey = SeperatedAttrChecker("accesskey", ' ') + Action = AttrChecker("action") + Allowfullscreen = AttrChecker("allowfullscreen") + Alt = AttrChecker("alt") + Async = AttrChecker("async") + Autocomplete = AttrChecker("autocomplete") + Autofocus = AttrChecker("autofocus") + Autoplay = AttrChecker("autoplay") + Challenge = AttrChecker("challenge") + Charset = AttrChecker("charset") + Checked = AttrChecker("checked") + Cite_ = AttrChecker("cite") + Class = SeperatedAttrChecker("class", ' ') + Cols = AttrChecker("cols") + Colspan = AttrChecker("colspan") + Command = AttrChecker("command") + Content = AttrChecker("content") + Contenteditable = AttrChecker("contenteditable") + Contextmenu = AttrChecker("contextmenu") + Controls = AttrChecker("controls") + Coords = AttrChecker("coords") + Crossorigin = AttrChecker("crossorigin") + Data_ = AttrChecker("data") + Datetime = AttrChecker("datetime") + Default = AttrChecker("default") + Defer = AttrChecker("defer") + Dir = AttrChecker("dir") + Dirname = AttrChecker("dirname") + Disabled = AttrChecker("disabled") + Download = AttrChecker("download") + Draggable = AttrChecker("draggable") + Dropzone = SeperatedAttrChecker("dropzone", ' ') + Enctype = AttrChecker("enctype") + For = AttrChecker("for") + Form_ = AttrChecker("form") + Formaction = AttrChecker("formaction") + Formenctype = AttrChecker("formenctype") + Formmethod = AttrChecker("formmethod") + Formnovalidate = AttrChecker("formnovalidate") + Formtarget = AttrChecker("formtarget") + Headers = SeperatedAttrChecker("headers", ' ') + Height = AttrChecker("height") + Hidden = AttrChecker("hidden") + High = AttrChecker("high") + Href = AttrChecker("href") + Hreflang = AttrChecker("hreflang") + HttpEquiv = AttrChecker("http-equiv") + Icon = AttrChecker("icon") + Id = AttrChecker("id") + Inert = AttrChecker("inert") + Inputmode = AttrChecker("inputmode") + Ismap = 
AttrChecker("ismap") + Itemid = AttrChecker("itemid") + Itemprop = SeperatedAttrChecker("itemprop", ' ') + Itemref = SeperatedAttrChecker("itemref", ' ') + Itemscope = AttrChecker("itemscope") + Itemtype = SeperatedAttrChecker("itemtype", ' ') + Keytype = AttrChecker("keytype") + Kind = AttrChecker("kind") + Label_ = AttrChecker("label") + Lang = AttrChecker("lang") + List = AttrChecker("list") + Loop = AttrChecker("loop") + Low = AttrChecker("low") + Manifest = AttrChecker("manifest") + Max = AttrChecker("max") + Maxlength = AttrChecker("maxlength") + Media = AttrChecker("media") + Mediagroup = AttrChecker("mediagroup") + Menu_ = AttrChecker("menu") + Method = AttrChecker("method") + Min = AttrChecker("min") + Minlength = AttrChecker("minlength") + Multiple = AttrChecker("multiple") + Muted = AttrChecker("muted") + Name = AttrChecker("name") + Novalidate = AttrChecker("novalidate") + Open = AttrChecker("open") + Optimum = AttrChecker("optimum") + Pattern = AttrChecker("pattern") + Ping = SeperatedAttrChecker("ping", ' ') + Placeholder = AttrChecker("placeholder") + Poster = AttrChecker("poster") + Preload = AttrChecker("preload") + Radiogroup = AttrChecker("radiogroup") + Readonly = AttrChecker("readonly") + Rel = SeperatedAttrChecker("rel", ' ') + Required = AttrChecker("required") + Reversed = AttrChecker("reversed") + Rows = AttrChecker("rows") + Rowspan = AttrChecker("rowspan") + Sandbox = SeperatedAttrChecker("sandbox", ' ') + Scope = AttrChecker("scope") + Scoped = AttrChecker("scoped") + Seamless = AttrChecker("seamless") + Selected = AttrChecker("selected") + Shape = AttrChecker("shape") + Size = AttrChecker("size") + Sizes = SeperatedAttrChecker("sizes", ' ') + Sortable = AttrChecker("sortable") + Sorted = SeperatedAttrChecker("sorted", ' ') + Span_ = AttrChecker("span") + Spellcheck = AttrChecker("spellcheck") + Src = AttrChecker("src") + Srcdoc = AttrChecker("srcdoc") + Srclang = AttrChecker("srclang") + Srcset = AttrChecker("srcset") + Start = 
AttrChecker("start") + Step = AttrChecker("step") + Style_ = AttrChecker("style") + Tabindex = AttrChecker("tabindex") + Target = AttrChecker("target") + Title_ = AttrChecker("title") + Translate = AttrChecker("translate") + Type = AttrChecker("type") + Typemustmatch = AttrChecker("typemustmatch") + Usemap = AttrChecker("usemap") + Value = AttrChecker("value") + Width = AttrChecker("width") + Wrap = AttrChecker("wrap") +) diff --git a/expr/bfs.go b/expr/bfs.go new file mode 100644 index 0000000..297358b --- /dev/null +++ b/expr/bfs.go @@ -0,0 +1,85 @@ +// Copyright 2014, Hǎiliàng Wáng. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package expr + +import ( + "container/list" + "code.google.com/p/go.net/html" +) + +// Find returns a Checker that searches all descendants of a node breadth-first and yields the first one matching all of cs. +func Find(cs ...Checker) Checker { + c := And(cs...) + return func(n *html.Node) *html.Node { + q := NewQueue() + q.PushNodes(Children(n)) + for q.Len() > 0 { + t := q.Pop() + if c(t) != nil { + return t + } else { + q.PushNodes(Children(t)) + } + } + return nil + } +} + +// FindChild returns a Checker that yields the first direct child of a node matching all of cs. +func FindChild(cs ...Checker) Checker { + c := And(cs...) + return func(n *html.Node) *html.Node { + for child := FirstChild(n); child != nil; child = NextSibling(child) { + if c(child) != nil { + return child + } + } + return nil + } +} + +// FindSibling returns a Checker that yields the first following sibling of a node matching all of cs. +func FindSibling(cs ...Checker) Checker { + c := And(cs...) + return func(n *html.Node) *html.Node { + for sibling := NextSibling(n); sibling != nil; sibling = NextSibling(sibling) { + if c(sibling) != nil { + return sibling + } + } + return nil + } +} + +// FIFO queue.
+type Queue struct { + l *list.List +} + +func NewQueue() *Queue { + return &Queue{list.New()} +} + +func (q *Queue) Len() int { + return q.l.Len() +} + +func (q *Queue) Push(n *html.Node) { + q.l.PushBack(n) +} + +func (q *Queue) PushNodes(next Iter) { + for node := next(); node != nil; node = next() { + q.Push(node) + } +} + +func (q *Queue) Pop() *html.Node { + if q.l.Front() == nil { + return nil + } + return q.l.Remove(q.l.Front()).(*html.Node) +} + diff --git a/expr/checker.go b/expr/checker.go new file mode 100644 index 0000000..8b0f4f8 --- /dev/null +++ b/expr/checker.go @@ -0,0 +1,221 @@ +// Copyright 2014, Hǎiliàng Wáng. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package expr + +import ( + "code.google.com/p/go.net/html" + "code.google.com/p/go.net/html/atom" + "regexp" + "strings" +) + +type Checker func(*html.Node) *html.Node + +func Not(c Checker) Checker { + return func(n *html.Node) *html.Node { + if c(n) == nil { + return n + } + return nil + } +} + +func And(cs ...Checker) Checker { + return func(n *html.Node) *html.Node { + for _, c := range cs { + if c(n) == nil { + return nil + } + } + return n + } +} + +func Pipe(cs ...Checker) Checker { + return func(n *html.Node) *html.Node { + for _, c := range cs { + r := c(n) + if r == nil { + return nil + } else { + n = r + } + } + return n + } +} + +func Or(cs ...Checker) Checker { + return func(n *html.Node) *html.Node { + for _, c := range cs { + if c(n) != nil { + return n + } + } + return nil + } +} + +func FirstChild(n *html.Node) *html.Node { + if n == nil { + return nil + } + return n.FirstChild +} + +func Parent(n *html.Node) *html.Node { + if n == nil { + return nil + } + return n.Parent +} + +func NextSibling(n *html.Node) *html.Node { + if n == nil { + return nil + } + return n.NextSibling +} + +func PrevSibling(n *html.Node) *html.Node { + if n == nil { + return nil + } + return n.PrevSibling +} + +// Node 
Checkers +// ============= + +func TypeChecker(t html.NodeType) Checker { + return func(n *html.Node) *html.Node { + if n != nil && n.Type == t { + return n + } + return nil + } +} + +var ( + ErrorNode = TypeChecker(html.ErrorNode) + TextNode = TypeChecker(html.TextNode) + DocumentNode = TypeChecker(html.DocumentNode) + ElementNode = TypeChecker(html.ElementNode) + CommentNode = TypeChecker(html.CommentNode) + DoctypeNode = TypeChecker(html.DoctypeNode) +) + +func NonemptyTextNode (n *html.Node) *html.Node { + if n == nil { + return nil + } + if TextNode(n) != nil && strings.TrimSpace(n.Data) != "" { + return n + } + return nil +} + +func AtomChecker(a atom.Atom) Checker { + return func(n *html.Node) *html.Node { + if n != nil && n.DataAtom == a { + return n + } + return nil + } +} + +func ElementChecker(a atom.Atom) Checker { + return And(ElementNode, AtomChecker(a)) +} + +// Attribute Checkers +// ================== + +func AttributeCmpChecker(key string, cmp func(string) bool) Checker { + return func(n *html.Node) *html.Node { + attr := GetAttr(n, key) + if attr != nil && cmp(*attr) { + return n + } + return nil + } +} + +func Attr(key, pat string) Checker { + rx := regexp.MustCompile(pat) + return AttributeCmpChecker(key, func(val string) bool { + return rx.MatchString(val) + }) +} + +func AttrChecker(key string) func(string) Checker { + return func(pat string) Checker { + return Attr(key, pat) + } +} + +func HasAttr(key string) Checker { + return func(n *html.Node) *html.Node { + if GetAttr(n, key) != nil { + return n + } + return nil + } +} + +func NoAttr(key string) Checker { + return func(n *html.Node) *html.Node { + if GetAttr(n, key) != nil { + return nil + } + return n + } +} + +func fieldsToSet(val string, sep rune) map[string]bool { + m := make(map[string]bool) + fields := strings.FieldsFunc(val, func(r rune) bool { return r == sep }) + for _, field := range fields { + m[field] = true + } + return m +} + +func SeperatedAttrChecker(name string, sep rune)
func(...string) Checker { + return func(classes ...string) Checker { + return AttributeCmpChecker(name, func(val string) bool { + s := fieldsToSet(val, sep) + for _, class := range classes { + if !s[class] { + return false + } + } + return true + }) + } +} + +func Text_(rx *regexp.Regexp) Checker { + return func(n *html.Node) *html.Node { + if s := GetText(n); s != nil { + if rx.MatchString(*s) { + return n + } + } + return nil + } +} + +func Text(pat string) Checker { + return Text_(regexp.MustCompile(pat)) +} + +func CaptionText(pat string) Checker { + return Find(Caption, Text(pat)) +} + +var ( + Ahref = And(A, HasAttr("href")) +) diff --git a/expr/dfs.go b/expr/dfs.go new file mode 100644 index 0000000..e772a33 --- /dev/null +++ b/expr/dfs.go @@ -0,0 +1,116 @@ +// Copyright 2014, Hǎiliàng Wáng. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package expr + +import ( + "code.google.com/p/go.net/html" + "strconv" +) + +type Iter func() *html.Node + +// Pre-order depth first traversal in all descendants +func Descendants(n *html.Node, cs ...Checker) Iter { + c := And(cs...) + s := NewStack() + node := FirstChild(n) + return func() *html.Node { + for node != nil || s.Len() > 0 { + if node != nil { + s.Push(node) + r := node + node = FirstChild(node) + if c(r) != nil { + return r + } + } else { + node = s.Pop() + node = NextSibling(node) + } + } + return nil + } +} + +func IterIter(next Iter, cs ...Checker) Iter { + find := Find(cs...) + return func() *html.Node { + // Pull a fresh source node on every miss; re-testing the same node would never terminate. + for node := next(); node != nil; node = next() { + if n := find(node); n != nil { + return n + } + } + return nil + } +} + +func Children(n *html.Node, cs ...Checker) Iter { + c := And(cs...)
+ node := FirstChild(n) + return func() *html.Node { + for node != nil { + r := node + node = NextSibling(node) + if c(r) != nil { + return r + } + } + return nil + } +} + +func Strings(next Iter, f StringGetter, pat ...string) []string { + ss := []string{} + p := GetPat(pat) + + // TODO: I have met a bug here once that the program hangs at the next() + // function call, but I cannot find the data to reproduce it. So Just wait + // and see it happens again. + for node := next(); node != nil; node = next() { + if s := GetSubmatch(f(node), p); s != nil { + ss = append(ss, *s) + } + } + return ss +} + +func Integers(next Iter, f StringGetter) []int { + ss := []int{} + for node := next(); node != nil; node = next() { + s := f(node) + if s != nil { + if i, err := strconv.Atoi(*s); err == nil { + ss = append(ss, i) + } + } + } + return ss +} + +// FILO stack. +type Stack struct { + s []*html.Node +} + +func NewStack() *Stack { + return &Stack{} +} + +func (s *Stack) Len() int { + return len(s.s) +} + +func (s *Stack) Push(n *html.Node) { + s.s = append(s.s, n) +} + +func (s *Stack) Pop() (n *html.Node) { + if s.Len() == 0 { + return nil + } + n, s.s = s.s[len(s.s)-1], s.s[:len(s.s)-1] + return n +} diff --git a/expr/getter.go b/expr/getter.go new file mode 100644 index 0000000..6d600aa --- /dev/null +++ b/expr/getter.go @@ -0,0 +1,90 @@ +// Copyright 2014, Hǎiliàng Wáng. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
+ +package expr + +import ( + "code.google.com/p/go.net/html" + "regexp" +) + +type StringGetter func(*html.Node) *string + +func GetAttr(n *html.Node, key string) *string { + if n == nil { + return nil + } + for _, a := range n.Attr { + if a.Key == key { + return &a.Val + } + } + return nil +} + +func GetAttrSubmatch(n *html.Node, key, pat string) *string { + return GetSubmatch(GetAttr(n, key), pat) +} + +func GetSubmatch_(s *string, rx *regexp.Regexp) *string { + if s == nil { + return nil + } + m := rx.FindStringSubmatch(*s) + if m == nil || len(m) < 2 { + return nil + } + return &m[1] +} + +func GetSubmatch(s *string, pat string) *string { + if pat == "" { + return s + } + return GetSubmatch_(s, regexp.MustCompile(pat)) +} + +func GetTextNodeText(n *html.Node) *string { + if NonemptyTextNode(n) != nil { + return &n.Data + } + return nil +} + +func GetText(n *html.Node) *string { + if s := GetTextNodeText(n); s != nil { + return s + } + + for c := FirstChild(n); c != nil; c = NextSibling(c) { + if s := GetTextNodeText(c); s != nil { + return s + } + } + return nil +} + +func GetHref(n *html.Node) *string { + if n == nil { + return nil + } + return GetAttr(n, "href") +} + +func GetPat(pat []string) string { + if len(pat) > 1 { + panic("pat should be either ommited or only one string.") + } else if len(pat) == 0 { + return "" // empty string indicates that the whole string should be got. + } + return pat[0] +} + +/* +func AttrValueGetter(key string) StringGetter { + return func(n *html.Node) *string { + return GetAttrValue(n, key) + } +} +*/ diff --git a/expr/util.go b/expr/util.go new file mode 100644 index 0000000..7336239 --- /dev/null +++ b/expr/util.go @@ -0,0 +1,56 @@ +// Copyright 2014, Hǎiliàng Wáng. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package expr + +import ( + "strconv" + "strings" + "time" +) + +func ToInt(ps *string) *int { + if ps == nil { + return nil + } + i, err := strconv.Atoi(strings.TrimSpace(*ps)) + if err != nil { + return nil + } + return &i +} + +func ToFloat(ps *string) *float64 { + if ps == nil { + return nil + } + f, err := strconv.ParseFloat(strings.TrimSpace(*ps), 64) + if err != nil { + return nil + } + return &f +} + +func ToHex(ps *string) *int { + if ps == nil { + return nil + } + i64, err := strconv.ParseInt(strings.TrimSpace(*ps), 16, 64) + if err != nil { + return nil + } + i := int(i64) + return &i +} + +func ToTime(ps *string, layout string) *time.Time { + if ps == nil { + return nil + } + t, err := time.Parse(layout, *ps) + if err != nil { + return nil + } + return &t +} diff --git a/gen/gen.go b/gen/gen.go new file mode 100644 index 0000000..95d3f46 --- /dev/null +++ b/gen/gen.go @@ -0,0 +1,180 @@ +// Copyright 2014, Hǎiliàng Wáng. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package main + +import ( + "fmt" + "io" + "os" + "os/exec" + "strings" +) + +func (spec *Spec) GenerateExpr() { + elemTable, attrTable := spec.ElemTable, spec.AttrTable + file := "output/auto_expr.go" + f, err := os.Create(file) + c(err) + fp(f, "package expr") + fp(f, "import (") + fp(f, `"code.google.com/p/go.net/html/atom"`) + fp(f, ")") + + fp(f, "var (") + for _, elem := range elemTable.Elements() { + if elemTable.Skip[elem.Name] { + continue + } + nodeId := toid(elem.Name) + fp(f, nodeId, "=", "ElementChecker(atom.", toid(elem.Name), ")") + } + fp(f, ")") + fp(f, "") + + fp(f, "var (") + for _, attr := range attrTable.Attributes() { + attrId := toid(attr.Name) + if elemTable.Set[attr.Name] != nil { + attrId += "_" + } + if strings.Contains(attr.Type, "space-separated tokens") { + fp(f, attrId, ` = SeperatedAttrChecker("`, attr.Name, `", ' ')`) + } else { + fp(f, attrId, ` = AttrChecker("`, attr.Name, `")`) + } + } + fp(f, ")") + + f.Close() + format(file) +} + +func (spec *Spec) GenerateChain() { + file := "output/auto_chain.go" + f, err := os.Create(file) + c(err) + fp(f, "package query") + fp(f, "import (") + fp(f, `. 
"github.com/hailiang/html-query/expr"`) + fp(f, ")") + + spec.generateChainSmall(f) + + //spec.generateChainBloaded(f) + + fp(f, "") + f.Close() + format(file) +} + +func (spec *Spec) generateChainSmall(f io.Writer) { + elemTable, attrTable := spec.ElemTable, spec.AttrTable + for _, elem := range elemTable.Elements() { + if elemTable.Skip[elem.Name] { + continue + } + nodeId := toid(elem.Name) + nodeFinderSmall(f, nodeId) + } + + for _, attr := range attrTable.Attributes() { + attrId := toid(attr.Name) + if elemTable.Set[attr.Name] != nil { + attrId += "_" + } + nodeAttribute(f, attr.Name, attrId, "Node") + } +} + +func (spec *Spec) generateChainBloaded(f io.Writer) { + elemTable, attrTable := spec.ElemTable, spec.AttrTable + for _, elem := range elemTable.Elements() { + if elemTable.Skip[elem.Name] { + continue + } + nodeId := toid(elem.Name) + nodeType := nodeId + "Node" + fp(f, "") + fp(f, "// ", nodeId) + fp(f, "") + nodeDefinition(f, nodeType) + nodeConstructor(f, nodeType) + finderName := nodeId + if a, ok := attrTable.Set[elem.Name]; ok && a.IsGlobal { + finderName += "Node" + } + nodeFinder(f, finderName, nodeId, nodeType) + for _, attr := range elem.Attributes { + if !attr.IsGlobal { + attrId := toid(attr.Name) + nodeAttribute(f, attr.Name, attrId, nodeType) + } + } + fp(f, "") + } + + for _, attr := range attrTable.Attributes() { + if attr.IsGlobal { + attrId := toid(attr.Name) + nodeAttribute(f, attr.Name, attrId, "Node") + } + } +} + +func nodeDefinition(f io.Writer, name string) { + fp(f, "type ", name, " struct {") + fp(f, "Node") + fp(f, "}") + fp(f, "") +} + +func nodeConstructor(f io.Writer, name string) { + fp(f, "func New", name, "(n *Node) *", name, "{") + fp(f, "if n == nil {") + fp(f, "return nil") + fp(f, "}") + fp(f, "return &", name, "{*n}") + fp(f, "}") + fp(f, "") +} + +func nodeFinder(f io.Writer, finderName, nodeId, nodeType string) { + fp(f, "func (n *Node) ", finderName, "(cs ...Checker) *", nodeType, " {") + fp(f, "return New", 
nodeType, "(n.find(", nodeId, ", cs))") + fp(f, "}") + fp(f, "") +} + +func nodeFinderSmall(f io.Writer, nodeId string) { + fp(f, "func (n *Node) ", nodeId, "(cs ...Checker) *Node {") + fp(f, "return n.find(", nodeId, ", cs)") + fp(f, "}") + fp(f, "") +} + +func nodeAttribute(f io.Writer, attrName, attrId, nodeType string) { + fp(f, "func (n *", nodeType, ") ", attrId, "(pat ...string) *string {") + fp(f, `return n.Attr("`, attrName, `", pat...)`) + fp(f, "}") + fp(f, "") + +} + +func toid(s string) string { + return strings.Replace(strings.Title(s), "-", "", -1) +} + +func format(file string) { + cmd := exec.Command("go", "fmt", file) + err := cmd.Start() + c(err) + err = cmd.Wait() + c(err) +} + +func fp(w io.Writer, v ...interface{}) { + fmt.Fprint(w, v...) + fmt.Fprintln(w) +} diff --git a/gen/main.go b/gen/main.go new file mode 100644 index 0000000..deecb84 --- /dev/null +++ b/gen/main.go @@ -0,0 +1,26 @@ +// Copyright 2014, Hǎiliàng Wáng. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "os" +) + +func main() { + htmlSpec := "html_spec.htm" + f, err := os.Open(htmlSpec) + if err != nil { + DownloadSpec(htmlSpec) + f, err = os.Open(htmlSpec) + c(err) + } + defer f.Close() + + spec := parseSpec(f) + + os.Mkdir("output", 0755) + spec.GenerateExpr() + spec.GenerateChain() +} diff --git a/gen/spec.go b/gen/spec.go new file mode 100644 index 0000000..208cdd4 --- /dev/null +++ b/gen/spec.go @@ -0,0 +1,150 @@ +// Copyright 2014, Hǎiliàng Wáng. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "github.com/hailiang/html-query" + . 
"github.com/hailiang/html-query/expr" + "io" + "net/http" + "os" + "sort" + "strings" +) + +const ( + SpecUrl = `http://www.whatwg.org/specs/web-apps/current-work/multipage/section-index.html` + +// SPEC_URL = `http://www.w3.org/html/wg/drafts/html/master/single-page.html` +) + +// skip the two because they does not appear in exp/html +var ignoreList = map[string]bool{ + "main": true, + "menuitem": true, + "template": true, +} + +type Element struct { + Name string + Attributes []*Attribute +} + +type Attribute struct { + Name string + Type string + IsGlobal bool +} + +type AttributeTable struct { + Set map[string]*Attribute +} + +type ElementTable struct { + Set map[string]*Element + Skip map[string]bool +} + +type Spec struct { + ElemTable *ElementTable + AttrTable *AttributeTable +} + +func DownloadSpec(file string) { + resp, err := http.Get(SpecUrl) + c(err) + defer resp.Body.Close() + f, err := os.Create(file) + c(err) + defer f.Close() + io.Copy(f, resp.Body) +} + +func parseSpec(file io.Reader) *Spec { + root, err := query.Parse(file) + c(err) + attrTable := parseAttributeTable(root) + elemTable := parseElementTable(root, attrTable) + return &Spec{elemTable, attrTable} +} + +func (t *ElementTable) Elements() []*Element { + names := make([]string, len(t.Set)) + i := 0 + for k, _ := range t.Set { + names[i] = k + i++ + } + sort.Sort(sort.StringSlice(names)) + elements := make([]*Element, len(names)) + for i := range elements { + elements[i] = t.Set[names[i]] + } + return elements +} + +func (t *AttributeTable) Attributes() []*Attribute { + names := make([]string, len(t.Set)) + i := 0 + for k, _ := range t.Set { + names[i] = k + i++ + } + sort.Sort(sort.StringSlice(names)) + attrs := make([]*Attribute, len(names)) + for i := range attrs { + attrs[i] = t.Set[names[i]] + } + return attrs +} + +func parseAttributeTable(root *query.Node) *AttributeTable { + attrSet := make(map[string]*Attribute) + attrTable := root.Table(CaptionText("List of attributes")) + if 
attrTable == nil { + panic("Cannot find List of attributes") + } + for _, tr := range attrTable.Tbody().Children(Tr).All() { + name := *tr.Th().Code().Text() + attr := &Attribute{Name: name} + td := tr.Children(Td).All() + if elemName := td[0].A().Text(); elemName != nil { + if *elemName == "HTML elements" { + attr.IsGlobal = true + } + } + attr.Type = strings.Replace(td[2].PlainText(), "\n", "", -1) + // Attention: attribute may be duplicated, just choose the first one + // but set isglobal if one of it is global + if attrSet[name] == nil { + attrSet[name] = attr + } else if attr.IsGlobal { + attrSet[name].IsGlobal = true + } + } + return &AttributeTable{Set: attrSet} +} +func parseElementTable(root *query.Node, attrTable *AttributeTable) *ElementTable { + elemSet := make(map[string]*Element) + attrSet := attrTable.Set + elementTable := root.Table(CaptionText("List of elements")) + for _, tr := range elementTable.Tbody().Children(Tr).All() { + td := tr.Children(Td).All() + for _, elemLink := range tr.Th().Descendants(Ahref).All() { + elem := &Element{Name: *elemLink.Text()} + for _, attrLink := range td[4].Descendants(Ahref).All() { + attrName := *attrLink.Text() + if attr := attrSet[attrName]; attr != nil { + elem.Attributes = append(elem.Attributes, attr) + } + } + elemSet[elem.Name] = elem + } + } + return &ElementTable{ + Set: elemSet, + Skip: ignoreList, + } +} diff --git a/gen/util.go b/gen/util.go new file mode 100644 index 0000000..e0acc6d --- /dev/null +++ b/gen/util.go @@ -0,0 +1,19 @@ +// Copyright 2014, Hǎiliàng Wáng. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "log" +) + +func c(err error) { + if err != nil { + log.Fatal(err) + } +} + +func p(v ...interface{}) { + log.Println(v...) +} diff --git a/node.go b/node.go new file mode 100644 index 0000000..538a5e1 --- /dev/null +++ b/node.go @@ -0,0 +1,269 @@ +// Copyright 2014, Hǎiliàng Wáng. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package query + +import ( + "bytes" + "code.google.com/p/go.net/html" + . "github.com/hailiang/html-query/expr" + "io" + "regexp" + "strings" +) + +// Node represents a HTML node. +// Wrap html.Node so that chainable interface is possible +// Use pointer of it because we want to test with nil. +type Node struct { + n html.Node +} + +func NewNode(n *html.Node) *Node { + if n == nil { + return nil + } + return &Node{*n} +} + +func Parse(r io.Reader) (*Node, error) { + n, err := html.Parse(r) + if err != nil { + return nil, err + } + return NewNode(n), nil +} + +func (n *Node) InternalNode() *html.Node { + return &n.n +} + +func (n *Node) Attr(key string, pat ...string) *string { + if n == nil { + return nil + } + return GetAttrSubmatch(n.InternalNode(), key, GetPat(pat)) +} + +/* +func (n *Node) AttrSubmatch(key, pat string) *string { + if n == nil { + return nil + } + return GetAttrSubmatch(n.InternalNode(), key, pat) +} +*/ + +func (n *Node) getAttr(key string, pat ...string) *string { + return n.Attr(key, pat...) 
+} + +func (n *Node) Text(pat ...string) *string { + if n == nil { + return nil + } + return GetSubmatch(GetText(&n.n), GetPat(pat)) +} + +func (n *Node) AllText(pat ...string) *string { + ss := []string{} + for _, n := range n.Descendants(TextNode).All() { + if text := n.Text(pat...); text != nil && *text != "" { + ss = append(ss, *text) + } + } + s := strings.Join(ss, " ") + if s != "" { + return &s + } + return nil +} + +func (n *Node) PlainText() string { + if n == nil { + return "" + } + var buf bytes.Buffer + for _, s := range n.Descendants(TextNode).Strings(GetText) { + buf.WriteString(s) + } + return buf.String() +} + +func (n *Node) Render() *string { + if n == nil { + return nil + } + var b bytes.Buffer + err := html.Render(&b, &n.n) + if err != nil { + return nil + } + s := b.String() + return &s +} + +func (n *Node) RenderTagOnly() *string { + if n == nil { + return nil + } + var b bytes.Buffer + + if n.n.Type == html.ElementNode { + err := renderOpeningTag(&b, &n.n) + if err != nil { + return nil + } + } else { + err := renderSimpleNode(&b, &n.n) + if err != nil { + return nil + } + } + s := b.String() + return &s + +} + +func (n *Node) RenderChildren() *string { + if n == nil { + return nil + } + var b bytes.Buffer + node := FirstChild(&n.n) + for node != nil { + err := html.Render(&b, node) + if err != nil { + return nil + } + node = node.NextSibling + } + s := b.String() + return &s +} + +type TextNodeNode struct { + Node + rx *regexp.Regexp +} + +func NewTextNodeNode(n *Node, rx *regexp.Regexp) *TextNodeNode { + if n == nil { + return nil + } + return &TextNodeNode{*n, rx} +} + +func (n *TextNodeNode) Submatch() *string { + val := n.Text() + if val == nil { + return nil + } + m := n.rx.FindStringSubmatch(*val) + if m == nil || len(m) < 2 { + return nil + } + return &m[1] +} + +type NodeIter struct { + Iter +} + +func (i NodeIter) find(c Checker, cs []Checker) NodeIter { + return NodeIter{IterIter(i.Iter, also(c, cs)...)} +} + +func (i NodeIter) 
Find(cs ...Checker) NodeIter { + return NodeIter{IterIter(i.Iter, cs...)} +} + +func (i NodeIter) For(visit func(n *Node)) { + for n := i.Next(); n != nil; n = i.Next() { + visit(n) + } +} + +func (i NodeIter) Next() *Node { + next := i.Iter + if next == nil { + return nil + } + if node := next(); node != nil { + return NewNode(node) + } + return nil +} + +func (i NodeIter) All() (nodes []*Node) { + // Drain through Next so that a NodeIter with a nil Iter yields no nodes instead of panicking. + for n := i.Next(); n != nil; n = i.Next() { + nodes = append(nodes, n) + } + return +} + +func (i NodeIter) Strings(f StringGetter, pat ...string) []string { + if i.Iter == nil { + return nil + } + return Strings(i.Iter, f, pat...) +} + +func (i NodeIter) Integers(f StringGetter) []int { + if i.Iter == nil { + return nil + } + return Integers(i.Iter, f) +} + +type NodeStack struct { + *Stack +} + +/* +func (s NodeStack) All() (nodes []*Node) { + for _, node := range s.Stack.s { + nodes = append(nodes, NewNode(node)) + } + return +} +*/ + +// --------------------------------- +// If needed, autogenerate these routines + +// node methods + +func (i NodeIter) A(cs ...Checker) NodeIter { + return i.find(A, cs) +} + +func (i NodeIter) H2(cs ...Checker) NodeIter { + return i.find(H2, cs) +} + +func (i NodeIter) H3(cs ...Checker) NodeIter { + return i.find(H3, cs) +} +func (i NodeIter) H4(cs ...Checker) NodeIter { + return i.find(H4, cs) +} + +func (i NodeIter) Div(cs ...Checker) NodeIter { + return i.find(Div, cs) +} + +func (i NodeIter) Td(cs ...Checker) NodeIter { + return i.find(Td, cs) +} + +// attr methods + +func (i NodeIter) Href(pat ...string) []string { + if i.Iter == nil { + return nil + } + return Strings(i.Iter, GetHref, pat...) +} diff --git a/pretty.go b/pretty.go new file mode 100644 index 0000000..b9b60b7 --- /dev/null +++ b/pretty.go @@ -0,0 +1,365 @@ +// The code in this file is copied and modified from +// http://code.google.com/p/go.net. + +// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file under +// http://code.google.com/p/go.net. + +package query + +import ( + "bufio" + "code.google.com/p/go.net/html" + "errors" + "fmt" + "io" + "os" + "strings" +) + +type writer interface { + io.Writer + WriteByte(c byte) error // in Go 1.1, use io.ByteWriter + WriteString(string) (int, error) +} + +func (n *Node) PrettyPrint() { + n.PrettyRender(os.Stdout, 4) +} + +// PrettyRender renders prettily the parse tree n to the given writer, +// for easily viewing as plain text. +func (n *Node) PrettyRender(w io.Writer, indentSize int) error { + if n == nil { + return nil + } + if x, ok := w.(writer); ok { + return render(x, &n.n, indentSize) + } + buf := bufio.NewWriter(w) + if err := render(buf, &n.n, indentSize); err != nil { + return err + } + return buf.Flush() +} + +// plaintextAbort is returned from render1 when a plaintext element +// has been rendered. No more end tags should be rendered after that. +var plaintextAbort = errors.New("html: internal error (plaintext abort)") + +func render(w writer, n *html.Node, size int) error { + err := render1(w, n, -1, size) + if err == plaintextAbort { + err = nil + } + return err +} + +func render1(w writer, n *html.Node, level, size int) error { + if !isSpace(n) && n.Type != html.DocumentNode && n.Type != html.DoctypeNode { + if err := writeBreak(w); err != nil { + return err + } + if err := writeIndent(w, level, size); err != nil { + return err + } + } + + if err := renderSimpleNode(w, n); err != nil { + return err + } + + switch n.Type { + case html.DocumentNode: + for c := n.FirstChild; c != nil; c = c.NextSibling { + if err := render1(w, c, level+1, size); err != nil { + return err + } + } + return nil + case html.ElementNode: + return renderElementNode(w, n, level, size) + } + return nil +} + +func renderSimpleNode(w writer, n *html.Node) error { + // Render non-element nodes; these are the easy cases.
+ switch n.Type { + case html.ErrorNode: + return errors.New("html: cannot render an html.ErrorNode node") + case html.TextNode: + return escape(w, n.Data) + case html.CommentNode: + if _, err := w.WriteString("<!--"); err != nil { + return err + } + if _, err := w.WriteString(n.Data); err != nil { + return err + } + if _, err := w.WriteString("-->"); err != nil { + return err + } + return nil + case html.DoctypeNode: + if _, err := w.WriteString("<!DOCTYPE "); err != nil { + return err + } + if _, err := w.WriteString(n.Data); err != nil { + return err + } + if n.Attr != nil { + var p, s string + for _, a := range n.Attr { + switch a.Key { + case "public": + p = a.Val + case "system": + s = a.Val + } + } + if p != "" { + if _, err := w.WriteString(" PUBLIC "); err != nil { + return err + } + if err := writeQuoted(w, p); err != nil { + return err + } + if s != "" { + if err := w.WriteByte(' '); err != nil { + return err + } + if err := writeQuoted(w, s); err != nil { + return err + } + } + } else if s != "" { + if _, err := w.WriteString(" SYSTEM "); err != nil { + return err + } + if err := writeQuoted(w, s); err != nil { + return err + } + } + } + if err := w.WriteByte('>'); err != nil { + return err + } + return nil + case html.ElementNode, html.DocumentNode: + // No-op. + default: + return errors.New("html: unknown node type") + } + return nil +} + +func renderElementNode(w writer, n *html.Node, level, size int) error { + if err := renderOpeningTag(w, n); err != nil { + return err + } + + // Add initial newline where there is danger of a newline beging ignored. + if c := n.FirstChild; c != nil && c.Type == html.TextNode && strings.HasPrefix(c.Data, "\n") { + switch n.Data { + case "pre", "listing", "textarea": + if err := w.WriteByte('\n'); err != nil { + return err + } + } + } + + // Render any child nodes. 
+ switch n.Data { + case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "xmp": + for c := n.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.TextNode { + if _, err := w.WriteString(c.Data); err != nil { + return err + } + } else { + if err := render1(w, c, level+1, size); err != nil { + return err + } + } + } + if n.Data == "plaintext" { + // Don't render anything else. <plaintext> must be the + // last element in the file, with no closing tag. + return plaintextAbort + } + default: + for c := n.FirstChild; c != nil; c = c.NextSibling { + if err := render1(w, c, level+1, size); err != nil { + return err + } + } + } + + // Render the </xxx> closing tag. + if err := writeBreak(w); err != nil { + return err + } + if err := writeIndent(w, level, size); err != nil { + return err + } + if _, err := w.WriteString("</"); err != nil { + return err + } + if _, err := w.WriteString(n.Data); err != nil { + return err + } + if err := w.WriteByte('>'); err != nil { + return err + } + return nil +} + +// writeQuoted writes s to w surrounded by quotes. Normally it will use double +// quotes, but if s contains a double quote, it will use single quotes. +// It is used for writing the identifiers in a doctype declaration. +// In valid HTML, they can't contain both types of quotes. +func writeQuoted(w writer, s string) error { + var q byte = '"' + if strings.Contains(s, `"`) { + q = '\'' + } + if err := w.WriteByte(q); err != nil { + return err + } + if _, err := w.WriteString(s); err != nil { + return err + } + if err := w.WriteByte(q); err != nil { + return err + } + return nil +} + +// Section 12.1.2, "Elements", gives this list of void elements. Void elements +// are those that can't have any contents. 
+var voidElements = map[string]bool{ + "area": true, + "base": true, + "br": true, + "col": true, + "command": true, + "embed": true, + "hr": true, + "img": true, + "input": true, + "keygen": true, + "link": true, + "meta": true, + "param": true, + "source": true, + "track": true, + "wbr": true, +} + +const escapedChars = "&'<>\"\r" + +func escape(w writer, s string) error { + s = strings.TrimSpace(s) + + i := strings.IndexAny(s, escapedChars) + for i != -1 { + if _, err := w.WriteString(s[:i]); err != nil { + return err + } + var esc string + switch s[i] { + case '&': + esc = "&amp;" + case '\'': + // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5. + esc = "&#39;" + case '<': + esc = "&lt;" + case '>': + esc = "&gt;" + case '"': + // "&#34;" is shorter than "&quot;". + esc = "&#34;" + case '\r': + esc = "&#13;" + default: + panic("unrecognized escape character") + } + s = s[i+1:] + if _, err := w.WriteString(esc); err != nil { + return err + } + i = strings.IndexAny(s, escapedChars) + } + _, err := w.WriteString(s) + return err +} + +func writeIndent(w writer, level, size int) error { + for i := 0; i < level*size; i++ { + if _, err := w.WriteString(` `); err != nil { + return err + } + } + return nil +} + +func writeBreak(w writer) error { + _, err := w.Write([]byte{'\n'}) + return err +} + +func isSpace(n *html.Node) bool { + return n != nil && n.Type == html.TextNode && strings.TrimSpace(n.Data) == "" +} + +func renderOpeningTag(w writer, n *html.Node) error { + // Render the <xxx> opening tag. 
+ if err := w.WriteByte('<'); err != nil { + return err + } + if _, err := w.WriteString(n.Data); err != nil { + return err + } + for _, a := range n.Attr { + if err := w.WriteByte(' '); err != nil { + return err + } + if a.Namespace != "" { + if _, err := w.WriteString(a.Namespace); err != nil { + return err + } + if err := w.WriteByte(':'); err != nil { + return err + } + } + if _, err := w.WriteString(a.Key); err != nil { + return err + } + if _, err := w.WriteString(`="`); err != nil { + return err + } + if err := escape(w, a.Val); err != nil { + return err + } + if err := w.WriteByte('"'); err != nil { + return err + } + } + if voidElements[n.Data] { + if n.FirstChild != nil { + return fmt.Errorf("html: void element <%s> has child nodes", n.Data) + } + _, err := w.WriteString("/>") + if err != nil { + return err + } + return nil + } + if err := w.WriteByte('>'); err != nil { + return err + } + return nil +} diff --git a/util.go b/util.go new file mode 100644 index 0000000..925bf6f --- /dev/null +++ b/util.go @@ -0,0 +1,13 @@ +// Copyright 2014, Hǎiliàng Wáng. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package query + +import ( + "fmt" +) + +func p(v ...interface{}) { + fmt.Println(v...) +}