Skip to content

Commit

Permalink
示例代码
Browse files Browse the repository at this point in the history
  • Loading branch information
wangcc0909 committed May 21, 2018
0 parents commit 5bca045
Show file tree
Hide file tree
Showing 105 changed files with 2,928 additions and 0 deletions.
10 changes: 10 additions & 0 deletions .idea/Learngo.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/encodings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions abx.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1
2
3
Binary file added bin/gopm.exe
Binary file not shown.
Binary file added pkg/windows_amd64/crawler/engine.a
Binary file not shown.
Binary file added pkg/windows_amd64/crawler/fetcher.a
Binary file not shown.
Binary file added pkg/windows_amd64/crawler/forntend/controller.a
Binary file not shown.
Binary file added pkg/windows_amd64/crawler/forntend/model.a
Binary file not shown.
Binary file added pkg/windows_amd64/crawler/forntend/view.a
Binary file not shown.
Binary file added pkg/windows_amd64/crawler/model.a
Binary file not shown.
Binary file added pkg/windows_amd64/crawler/persist.a
Binary file not shown.
Binary file added pkg/windows_amd64/crawler/simple.a
Binary file not shown.
Binary file added pkg/windows_amd64/crawler/zhenai/parser.a
Binary file not shown.
Binary file added pkg/windows_amd64/crawler_distributed/config.a
Binary file not shown.
Binary file added pkg/windows_amd64/crawler_distributed/persist.a
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added pkg/windows_amd64/crawler_distributed/worker.a
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added pkg/windows_amd64/golang.org/x/net/context.a
Binary file not shown.
Binary file added pkg/windows_amd64/golang.org/x/net/html.a
Binary file not shown.
Binary file added pkg/windows_amd64/golang.org/x/net/html/atom.a
Binary file not shown.
Binary file added pkg/windows_amd64/golang.org/x/net/html/charset.a
Binary file not shown.
Binary file added pkg/windows_amd64/golang.org/x/text/encoding.a
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added pkg/windows_amd64/golang.org/x/text/language.a
Binary file not shown.
Binary file added pkg/windows_amd64/golang.org/x/text/runes.a
Binary file not shown.
Binary file added pkg/windows_amd64/golang.org/x/text/transform.a
Binary file not shown.
Binary file added pkg/windows_amd64/gopkg.in/olivere/elastic.v3.a
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added pkg/windows_amd64/interfaces/mock.a
Binary file not shown.
Binary file added pkg/windows_amd64/interfaces/real.a
Binary file not shown.
Binary file added pkg/windows_amd64/rpc.a
Binary file not shown.
81 changes: 81 additions & 0 deletions src/crawler/engine/current.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package engine

import (
"crawler/model"
)

// Processor turns a Request into a ParserResult (typically: fetch the
// request's URL and run its parser). It is a field on CurrentEngine so
// the fetch/parse step can be swapped out (e.g. local worker vs. RPC).
type Processor func(Request) (ParserResult,error)

// CurrentEngine is the concurrent crawler engine: a scheduler hands
// requests to WorkerCount workers, and scraped items are pushed to
// ItemServer for storage.
type CurrentEngine struct {
Scheduler Scheduler
WorkerCount int
ItemServer chan model.Item
Processor Processor
}

// Scheduler distributes crawl requests to workers.
// Submit queues a new request; WorkMaskChan supplies the channel a
// worker should receive on; WorkerReady registers an idle worker's
// channel; Run starts the scheduler's own loop.
type Scheduler interface {
Submit(Request)
WorkMaskChan() chan Request
WorkerReady(chan Request)
Run()
}

// Run starts the concurrent engine: it boots the scheduler, spawns
// WorkerCount workers, submits the deduplicated seed requests, and
// then loops forever distributing parse results.
//
// Order matters: the scheduler must be running before workers are
// created, because each worker registers itself with the scheduler.
func (c *CurrentEngine) Run(seeds ...Request) {
	c.Scheduler.Run()

	out := make(chan ParserResult)
	for i := 0; i < c.WorkerCount; i++ {
		c.createWorker(c.Scheduler.WorkMaskChan(), out, c.Scheduler)
	}

	for _, r := range seeds {
		if isDuplicate(r.Url) {
			continue
		}
		c.Scheduler.Submit(r)
	}

	for {
		result := <-out

		for _, item := range result.Items {
			// Send on a goroutine so a slow ItemServer consumer cannot
			// block the engine loop. Pass item as an explicit argument:
			// before Go 1.22 the range variable is reused across
			// iterations, so capturing it in the closure would make every
			// pending send observe the same (latest) item.
			go func(it model.Item) { c.ItemServer <- it }(item)
		}

		// Deduplicate newly discovered URLs before resubmitting them.
		for _, p := range result.Requests {
			if isDuplicate(p.Url) {
				continue
			}
			c.Scheduler.Submit(p)
		}
	}
}

// createWorker starts one worker goroutine. On each iteration the
// worker announces itself as ready (handing its input channel to the
// scheduler), blocks for the next request, processes it, and publishes
// the result on out. Requests whose processing fails are dropped.
func (c *CurrentEngine) createWorker(in chan Request, out chan ParserResult, s Scheduler) {
	// Network fetches dominate the work, so each worker gets its own
	// goroutine.
	go func() {
		for {
			s.WorkerReady(in)

			request := <-in
			result, err := c.Processor(request)
			if err != nil {
				continue
			}

			out <- result
		}
	}()
}

// isDupUrls records every URL ever submitted so repeat requests can be
// dropped. As used here it is only touched from the engine's Run loop,
// so no locking is applied — confirm before calling from elsewhere.
var isDupUrls = make(map[string]bool)

// isDuplicate reports whether url has been seen before, marking it as
// seen as a side effect.
func isDuplicate(url string) bool {
	seen := isDupUrls[url]
	if !seen {
		isDupUrls[url] = true
	}
	return seen
}
36 changes: 36 additions & 0 deletions src/crawler/engine/simple.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package engine

import (
"log"
)

// SimpleEngine is the single-goroutine crawler: it processes requests
// one at a time from an in-memory queue and only logs the items it
// finds. It serves as the baseline before the concurrent engine.
type SimpleEngine struct {
}

// Run crawls breadth-first starting from the seed requests, processing
// one request at a time. Scraped items are only logged; discovered
// follow-up requests are appended to the work queue. Requests whose
// fetch or parse fails are skipped.
func (s SimpleEngine) Run(seeds ...Request) {
	// Copy the seeds in one idiomatic append so the caller's backing
	// array is never mutated by the queue appends below.
	requests := append([]Request(nil), seeds...)

	for len(requests) > 0 {
		// Pop from the front: breadth-first order.
		r := requests[0]
		requests = requests[1:]

		result, err := Worker(r)
		if err != nil {
			continue
		}

		requests = append(requests, result.Requests...)

		for _, item := range result.Items {
			log.Printf("get item : %v\n", item)
		}
	}
}


54 changes: 54 additions & 0 deletions src/crawler/engine/types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package engine

import "crawler/model"

// ParserFunc is the plain-function form of a page parser: it receives
// the fetched page contents and the page URL and returns the parsed
// result.
type ParserFunc func([]byte,string) ParserResult

// Parser extracts items and follow-up requests from fetched contents.
// Name returns an identifier plus arguments so a Request can be
// serialized (e.g. shipped to a remote worker) and its parser rebuilt
// on the other side.
type Parser interface {
Parse([]byte,string) ParserResult
Name() (name string,args interface{})
}

// Request is one unit of crawl work: a URL together with the parser
// that understands the page behind it.
type Request struct {
Url string
Parser Parser
}

// ParserResult is what parsing one page yields: follow-up requests to
// crawl and items to store.
type ParserResult struct {
Requests []Request
Items []model.Item
}

// NilParser is a Parser that extracts nothing; use it for pages that
// are fetched only for their side effects or as leaf pages.
type NilParser struct{}

// Parse ignores its input and yields an empty result.
func (NilParser) Parse([]byte, string) ParserResult {
	var empty ParserResult
	return empty
}

// Name identifies the parser for serialization; it carries no args.
func (NilParser) Name() (name string, args interface{}) {
	return "NilParser", nil
}

// FuncParser adapts a plain ParserFunc to the Parser interface, giving
// it a name so requests that use it can be serialized.
type FuncParser struct {
	ParserFunc ParserFunc
	name       string
}

// NewFuncParser wraps parse as a Parser identified by name.
func NewFuncParser(parse ParserFunc, name string) *FuncParser {
	fp := &FuncParser{}
	fp.ParserFunc = parse
	fp.name = name
	return fp
}

// Parse delegates to the wrapped function.
func (f *FuncParser) Parse(content []byte, url string) ParserResult {
	return f.ParserFunc(content, url)
}

// Name identifies the parser; args is always nil because the wrapped
// function carries no serializable state.
func (f *FuncParser) Name() (name string, args interface{}) {
	return f.name, nil
}


19 changes: 19 additions & 0 deletions src/crawler/engine/worker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package engine

import (
"log"
"crawler/fetcher"
)

// Worker performs one unit of crawl work: fetch the request's URL and
// run the request's own parser over the downloaded contents.
func Worker(r Request) (ParserResult, error) {
	log.Printf("fetcher url %s", r.Url)

	contents, err := fetcher.Fetch(r.Url)
	if err != nil {
		log.Printf("fetcher code error %s", err)
		var empty ParserResult
		return empty, err
	}

	// Parse with the parser carried by the request itself, so each page
	// type supplies its own extraction logic.
	return r.Parser.Parse(contents, r.Url), nil
}
52 changes: 52 additions & 0 deletions src/crawler/fetcher/fetcher.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package fetcher

import (
"net/http"
"bufio"
"golang.org/x/text/transform"
"io/ioutil"
"golang.org/x/text/encoding"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding/unicode"
"github.com/gpmgo/gopm/modules/log"
"fmt"
"time"
"crawler_distributed/config"
)

// rateLimit throttles outgoing requests to config.RateQps per second;
// it is shared by every caller of Fetch.
var rateLimit = time.Tick(time.Second / config.RateQps)

// Fetch downloads url, converts the body from its detected source
// encoding to UTF-8, and returns the resulting bytes. Non-200
// responses are reported as errors. Each call first blocks on the
// package-wide rate limiter.
func Fetch(url string) ([]byte, error) {
	<-rateLimit

	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("wrong status code : %d",
			resp.StatusCode)
	}

	// Buffer the body so the encoding detector can peek at a prefix
	// without consuming it.
	bodyReader := bufio.NewReader(resp.Body)
	utf8Reader := transform.NewReader(bodyReader, determineEncoding(bodyReader).NewDecoder())
	return ioutil.ReadAll(utf8Reader)
}

// determineEncoding sniffs the page's character encoding from the
// first bytes of r without consuming them.
//
// Bug fixed: for bodies shorter than 1024 bytes, bufio.Reader.Peek
// returns the available bytes TOGETHER with an error (io.EOF). The
// original code treated any error as total failure and mis-defaulted
// short non-UTF-8 pages to UTF-8. Now we only fall back to UTF-8 when
// no bytes were readable at all.
func determineEncoding(r *bufio.Reader) encoding.Encoding {

	bytes, err := r.Peek(1024)
	if err != nil && len(bytes) == 0 {
		log.Error("fetcher encoding error : %s", err)
		return unicode.UTF8
	}

	e, _, _ := charset.DetermineEncoding(bytes, "")
	return e
}
95 changes: 95 additions & 0 deletions src/crawler/forntend/controller/searchresult.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package controller

import (
"crawler/forntend/view"
"gopkg.in/olivere/elastic.v3"
"crawler/forntend/model"
"golang.org/x/net/context"
"reflect"
model2 "crawler/model"
"net/http"
"regexp"
"strings"
"strconv"
"crawler_distributed/config"
)

// SearchResultHandle serves the search results page: it queries
// Elasticsearch through Client and renders the hits through View.
type SearchResultHandle struct {
View view.SearchResultView
Client *elastic.Client
}

// ServeHTTP handles a search page request. The query text comes from
// the "q" form value and the result offset from "from" (defaulting to
// 0 when absent or malformed).
func (s SearchResultHandle) ServeHTTP(w http.ResponseWriter, req *http.Request) {
	q := strings.TrimSpace(req.FormValue("q"))

	// A missing or non-numeric "from" just means the first page.
	from, err := strconv.Atoi(req.FormValue("from"))
	if err != nil {
		from = 0
	}

	page, err := s.getSearchResult(q, from)
	if err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest)
		return
	}

	// A template/render failure is a server-side fault, not a client
	// error, so report 500 (the original incorrectly answered 400).
	err = s.View.Render(w, page)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
}

// CreateSearchResultHandle builds a handler whose view renders the
// template at fileName and whose client talks to the Elasticsearch
// node at the hard-coded address. It panics when the client cannot be
// created, since the server is useless without Elasticsearch.
func CreateSearchResultHandle(fileName string) SearchResultHandle {
	// Sniffing is disabled — presumably because the node advertises a
	// container-internal address unreachable from the host; confirm
	// against the deployment setup.
	client, err := elastic.NewClient(
		elastic.SetSniff(false),
		elastic.SetURL("http://192.168.99.100:9200"),
	)
	if err != nil {
		panic(err)
	}

	h := SearchResultHandle{}
	h.View = view.CreateSearchResultView(fileName)
	h.Client = client
	return h
}

// PageSize is the number of hits shown per results page; it drives the
// previous-page offset rounding below.
const PageSize = 10

// getSearchResult runs the (rewritten) query against the configured
// index and packages the hits plus paging offsets into the view model.
func (s SearchResultHandle) getSearchResult(q string, from int) (model.SearchResult, error) {

var result model.SearchResult
result.Query = q

resp, err := s.Client.Search(config.RpcIndex).
Query(elastic.NewQueryStringQuery(
rewriteQueryString(q))).
From(from).
DoC(context.Background())

if err != nil {
// NOTE(review): result.Query is already populated here, so the
// caller receives a partially filled value alongside the error.
return result, err
}

result.Hint = resp.TotalHits()
result.Start = from
// Each deserializes every hit into a model2.Item via reflection.
result.Items = resp.Each(reflect.TypeOf(model2.Item{}))

// PreFrom of -1 marks "no previous page" for the template; otherwise
// round the previous offset down to the start of its page.
if result.Start == 0 {
result.PreFrom = -1
} else {
result.PreFrom = (result.Start - 1) / PageSize * PageSize
}

result.NextFrom = result.Start + len(result.Items)

return result, nil
}

// profileFieldRe matches a capitalized field name followed by a colon,
// e.g. "Age:". Compiled once at package init instead of on every
// request — this function sits on the HTTP search path.
var profileFieldRe = regexp.MustCompile(`([A-Z][a-z]*):`)

// rewriteQueryString qualifies bare capitalized field names in the
// user's query with the "Profile." prefix so they match the indexed
// document structure (e.g. "Age:20" -> "Profile.Age:20").
func rewriteQueryString(q string) string {
	return profileFieldRe.ReplaceAllString(q, "Profile.$1:")
}
Loading

0 comments on commit 5bca045

Please sign in to comment.