-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 5bca045
Showing
105 changed files
with
2,928 additions
and
0 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
1 | ||
2 | ||
3 |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
package engine | ||
|
||
import ( | ||
"crawler/model" | ||
) | ||
|
||
type Processor func(Request) (ParserResult,error) | ||
type CurrentEngine struct { | ||
Scheduler Scheduler | ||
WorkerCount int | ||
ItemServer chan model.Item | ||
Processor Processor | ||
} | ||
|
||
type Scheduler interface { | ||
Submit(Request) | ||
WorkMaskChan() chan Request | ||
WorkerReady(chan Request) | ||
Run() | ||
} | ||
|
||
//注意代码顺序问题 worker 必须是在创建完之后才能使用 所以先走Run | ||
func (c *CurrentEngine) Run(seeds ...Request) { | ||
c.Scheduler.Run() | ||
|
||
out := make(chan ParserResult) | ||
for i := 0; i < c.WorkerCount; i++ { | ||
c.createWorker(c.Scheduler.WorkMaskChan(), out, c.Scheduler) | ||
} | ||
|
||
for _, r := range seeds { | ||
if isDuplicate(r.Url){ | ||
continue | ||
} | ||
c.Scheduler.Submit(r) | ||
} | ||
|
||
for { | ||
result := <-out | ||
for _, item := range result.Items { | ||
go func() { c.ItemServer <- item }() | ||
} | ||
|
||
//这里去重 | ||
for _, p := range result.Requests { | ||
if isDuplicate(p.Url){ | ||
continue | ||
} | ||
c.Scheduler.Submit(p) | ||
} | ||
} | ||
|
||
} | ||
|
||
func (c *CurrentEngine) createWorker(in chan Request, out chan ParserResult, s Scheduler) { | ||
//一般网络请求是并发的 所有用go | ||
|
||
go func() { | ||
|
||
for { | ||
s.WorkerReady(in) | ||
result := <-in | ||
parserResult, err := c.Processor(result) | ||
if err != nil { | ||
continue | ||
} | ||
|
||
out <- parserResult | ||
} | ||
}() | ||
} | ||
|
||
var isDupUrls = make(map[string]bool) | ||
func isDuplicate(url string) bool { | ||
if isDupUrls[url] { | ||
return true | ||
} | ||
|
||
isDupUrls[url] = true | ||
return false | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package engine | ||
|
||
import ( | ||
"log" | ||
) | ||
|
||
type SimpleEngine struct { | ||
} | ||
|
||
func (s SimpleEngine) Run(seeds ...Request) { | ||
var requests []Request | ||
|
||
for _,r := range seeds{ | ||
requests = append(requests, r) | ||
} | ||
|
||
for len(requests) > 0 { | ||
r := requests[0] | ||
requests = requests[1:] | ||
|
||
result,err := Worker(r) | ||
if err != nil { | ||
continue | ||
} | ||
|
||
requests = append(requests,result.Requests...) | ||
|
||
for _,item := range result.Items{ | ||
log.Printf("get item : %v\n",item) | ||
} | ||
|
||
} | ||
|
||
} | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
package engine | ||
|
||
import "crawler/model" | ||
|
||
type ParserFunc func([]byte,string) ParserResult | ||
|
||
type Parser interface { | ||
Parse([]byte,string) ParserResult | ||
Name() (name string,args interface{}) | ||
} | ||
|
||
type Request struct { | ||
Url string | ||
Parser Parser | ||
} | ||
|
||
type ParserResult struct { | ||
Requests []Request | ||
Items []model.Item | ||
} | ||
|
||
type NilParser struct { | ||
|
||
} | ||
|
||
func (NilParser) Parse([]byte, string) ParserResult { | ||
return ParserResult{} | ||
} | ||
|
||
func (NilParser) Name() (name string, args interface{}) { | ||
return "NilParser", nil | ||
} | ||
|
||
type FuncParser struct { | ||
ParserFunc ParserFunc | ||
name string | ||
} | ||
|
||
func NewFuncParser(parse ParserFunc,name string) *FuncParser{ | ||
return &FuncParser{ | ||
ParserFunc:parse, | ||
name:name, | ||
} | ||
} | ||
|
||
func (f *FuncParser) Parse(content []byte,url string) ParserResult { | ||
return f.ParserFunc(content,url) | ||
} | ||
|
||
func (f *FuncParser) Name() (name string, args interface{}) { | ||
return f.name,nil | ||
} | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
package engine | ||
|
||
import ( | ||
"log" | ||
"crawler/fetcher" | ||
) | ||
|
||
func Worker(r Request) (ParserResult,error) { | ||
|
||
log.Printf("fetcher url %s",r.Url) | ||
contents,err := fetcher.Fetch(r.Url) | ||
if err != nil { | ||
log.Printf("fetcher code error %s",err) | ||
return ParserResult{},err | ||
} | ||
|
||
//这里解析是用的取出来的Request的解析器 | ||
return r.Parser.Parse(contents,r.Url),nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package fetcher | ||
|
||
import ( | ||
"net/http" | ||
"bufio" | ||
"golang.org/x/text/transform" | ||
"io/ioutil" | ||
"golang.org/x/text/encoding" | ||
"golang.org/x/net/html/charset" | ||
"golang.org/x/text/encoding/unicode" | ||
"github.com/gpmgo/gopm/modules/log" | ||
"fmt" | ||
"time" | ||
"crawler_distributed/config" | ||
) | ||
|
||
var rateLimit = time.Tick(time.Second / config.RateQps) | ||
func Fetch(url string) ([]byte,error) { | ||
<- rateLimit | ||
resp,err := http.Get(url) | ||
if err != nil { | ||
return nil,err | ||
} | ||
|
||
defer resp.Body.Close() | ||
|
||
if resp.StatusCode != http.StatusOK { | ||
|
||
return nil,fmt.Errorf("wrong status code : %d", | ||
resp.StatusCode) | ||
} | ||
|
||
bodyReader := bufio.NewReader(resp.Body) | ||
|
||
e := determineEncoding(bodyReader) | ||
|
||
utf8Reader := transform.NewReader(bodyReader,e.NewDecoder()) | ||
|
||
return ioutil.ReadAll(utf8Reader) | ||
} | ||
|
||
func determineEncoding(r *bufio.Reader) encoding.Encoding { | ||
|
||
bytes,err := r.Peek(1024) | ||
if err != nil { | ||
log.Error("fetcher encoding error : %s",err) | ||
return unicode.UTF8 | ||
} | ||
|
||
e,_,_ := charset.DetermineEncoding(bytes,"") | ||
return e | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
package controller | ||
|
||
import ( | ||
"crawler/forntend/view" | ||
"gopkg.in/olivere/elastic.v3" | ||
"crawler/forntend/model" | ||
"golang.org/x/net/context" | ||
"reflect" | ||
model2 "crawler/model" | ||
"net/http" | ||
"regexp" | ||
"strings" | ||
"strconv" | ||
"crawler_distributed/config" | ||
) | ||
|
||
type SearchResultHandle struct { | ||
View view.SearchResultView | ||
Client *elastic.Client | ||
} | ||
|
||
func (s SearchResultHandle) ServeHTTP(w http.ResponseWriter, req *http.Request) { | ||
q := strings.TrimSpace(req.FormValue("q")) | ||
|
||
from, err := strconv.Atoi(req.FormValue("from")) | ||
if err != nil { | ||
from = 0 | ||
} | ||
|
||
page, err := s.getSearchResult(q, from) | ||
if err != nil { | ||
http.Error(w, err.Error(), http.StatusBadRequest) | ||
return | ||
} | ||
|
||
err = s.View.Render(w, page) | ||
if err != nil { | ||
http.Error(w, err.Error(), http.StatusBadRequest) | ||
return | ||
} | ||
} | ||
|
||
func CreateSearchResultHandle(fileName string) SearchResultHandle { | ||
|
||
client, err := elastic.NewClient( | ||
elastic.SetSniff(false), | ||
elastic.SetURL("http://192.168.99.100:9200"), | ||
) | ||
|
||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
return SearchResultHandle{ | ||
View: view.CreateSearchResultView(fileName), | ||
Client: client, | ||
} | ||
} | ||
|
||
const PageSize = 10 | ||
|
||
func (s SearchResultHandle) getSearchResult(q string, from int) (model.SearchResult, error) { | ||
|
||
var result model.SearchResult | ||
result.Query = q | ||
|
||
resp, err := s.Client.Search(config.RpcIndex). | ||
Query(elastic.NewQueryStringQuery( | ||
rewriteQueryString(q))). | ||
From(from). | ||
DoC(context.Background()) | ||
|
||
if err != nil { | ||
return result, err | ||
} | ||
|
||
result.Hint = resp.TotalHits() | ||
result.Start = from | ||
result.Items = resp.Each(reflect.TypeOf(model2.Item{})) | ||
|
||
if result.Start == 0 { | ||
result.PreFrom = -1 | ||
} else { | ||
result.PreFrom = (result.Start - 1) / PageSize * PageSize | ||
} | ||
|
||
result.NextFrom = result.Start + len(result.Items) | ||
|
||
return result, nil | ||
} | ||
|
||
func rewriteQueryString(q string) string { | ||
re := regexp.MustCompile(`([A-Z][a-z]*):`) | ||
return re.ReplaceAllString(q, "Profile.$1:") | ||
} |
Oops, something went wrong.