From 585b64b355ac4a1e50fb0c53c56a0c326a3276a0 Mon Sep 17 00:00:00 2001 From: Monzurul Islam Date: Fri, 30 Jun 2023 16:17:47 +0900 Subject: [PATCH] updated scrapper --- main.go | 215 +---------------- scrapper/google.go | 16 +- .../merriam-webster.go | 2 +- scrapper/thesaurus.go | 217 ++++++++++++++++++ 4 files changed, 236 insertions(+), 214 deletions(-) rename merriam-webster.go => scrapper/merriam-webster.go (99%) create mode 100644 scrapper/thesaurus.go diff --git a/main.go b/main.go index 530a4df..a83252a 100644 --- a/main.go +++ b/main.go @@ -1,18 +1,13 @@ package main import ( - "errors" "fmt" - "log" "math" "net/http" "os" "runtime" "strings" - "github.com/PuerkitoBio/goquery" - "github.com/geziyor/geziyor" - "github.com/geziyor/geziyor/client" "github.com/gin-contrib/cors" "github.com/gin-gonic/gin" "github.com/joho/godotenv" @@ -22,7 +17,8 @@ import ( func main() { err := godotenv.Load(".env") if err != nil { - log.Fatal("Error loading .env file") + fmt.Println("Error loading .env file") + // log.Fatal("Error loading .env file") } p := os.Getenv("PORT") @@ -106,7 +102,7 @@ func main() { }) r.GET("/w/:word", func(c *gin.Context) { - data, err := GetResult(c.Param("word")) + data, err := scrapper.GetResult(c.Param("word")) fmt.Println(data, err) @@ -144,7 +140,7 @@ func main() { r.GET("/mw/:word", func(c *gin.Context) { - data, err := GetMWData(c.Param("word")) + data, err := scrapper.GetMWData(c.Param("word")) fmt.Println(data, err) @@ -185,206 +181,3 @@ func main() { } -type WordResponse struct { - Synonyms []Synonym `json:"synonyms"` - Antonyms []string `json:"antonyms"` -} - -type Synonym struct { - PartsOfSpeech string `json:"parts_of_speech"` - Definition string `json:"definition"` - Syns []string `json:"synonym"` -} - -func GetResult(word string) (WordResponse, error) { - - var finalResult WordResponse - var err error - - // temp PoS and Def - tempPoS := []string{} - tempDef := []string{} - - geziyor.NewGeziyor(&geziyor.Options{ - // StartRequestsFunc: func(g *geziyor.Geziyor) { - // g.GetRendered("https://www.thesaurus.com/browse/"+word, g.Opt.ParseFunc) - // }, - StartURLs: []string{"https://www.thesaurus.com/browse/" + word}, - ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { - - if r.StatusCode != http.StatusOK { - fmt.Println("There was an error, ", r.Status) - err = fmt.Errorf("%s", r.Status) - } - - // fmt.Println(string(r.Body)) - - root := r.HTMLDoc.Find("[data-type='thesaurus-entry-module']") - - fmt.Println("roost") - fmt.Println(root.Length()) - - // find the parts of speech with definitions - tabList := root.Find("[data-type='thesaurus-entry-tablist']") - - fmt.Println(tabList.Length()) - - tabList.Find("li").Each(func(i int, s *goquery.Selection) { - fmt.Println(s.Text()) - whole := s.Text() - pos := s.Find("em").Text() - def := strings.TrimLeft(strings.ReplaceAll(whole, pos, ""), " ") - - tempPoS = append(tempPoS, pos) - tempDef = append(tempDef, def) - - fmt.Println(def) - fmt.Println(pos) - - }) - - singleGroup := []string{} - - card := root.Find("[data-type='thesaurus-synonyms-card']") - - card.Find("li").Each(func(i int, s *goquery.Selection) { - fmt.Println(s.Text()) - sn := strings.TrimSpace(strings.ReplaceAll(s.Text(), "\n", " ")) - if len(sn) > 0 { - singleGroup = append(singleGroup, sn) - } - }) - - singleSynonymObj := Synonym{} - - if len(tempDef) > 0 { - singleSynonymObj.Definition = tempDef[0] - singleSynonymObj.PartsOfSpeech = tempPoS[0] - singleSynonymObj.Syns = singleGroup - finalResult.Synonyms = append(finalResult.Synonyms, singleSynonymObj) - - } - - // now find the antonyms - antonyms := []string{} - aCard := root.Find("[data-type='thesaurus-antonyms-card']") - fmt.Println(aCard.Length()) - aCard.Find("li").Each(func(i int, s *goquery.Selection) { - an := strings.TrimSpace(strings.ReplaceAll(s.Text(), "\n", " ")) - - if len(an) > 0 { - antonyms = append(antonyms, an) - } - }) - finalResult.Antonyms = antonyms - }, - //BrowserEndpoint: "ws://localhost:3000", - }).Start() - - return finalResult, err - - // Request the HTML page. - res, err := http.Get("https://www.thesaurus.com/browse/" + word) - if err != nil { - log.Fatal(err) - } - - fmt.Println("=========body==========") - fmt.Println(res.Status) - - defer res.Body.Close() - if res.StatusCode != 200 { - - return finalResult, errors.New(res.Status) - // log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) - } - - // Load the HTML document - doc, err := goquery.NewDocumentFromReader(res.Body) - if err != nil { - return finalResult, err - // log.Fatal(err) - } - - container := doc.Filter(".wjLcgFJpqs9M6QJsPf5v") - - fmt.Println(container.Length()) - - // container := doc.Find(".MainContentContainer") - - // inside MainContentContainer - // first ul parts of speech with definition - // second ul synonyms - // and followed by more synonyms for parts of speech - // inside #antonyms the ul is the antonyms - - // check if definition is available or not - defs := container.Find(".ew5makj1") - // defs := container.Find("ul:first-child") - - if defs.Length() == 0 { - fmt.Println("No definition available") - return finalResult, nil - } - - // not get the parts of speech - defs.Each(func(i int, s *goquery.Selection) { - // find parts of speech - // fmt.Println("parts of speech", s.Find("em").Text()) - tempPoS = append(tempPoS, s.Find("em").Text()) - // fmt.Println("meaning", s.Find("strong").Text()) - tempDef = append(tempDef, s.Find("strong").Text()) - }) - - // now find the synonyms and antonyms - - // len := container.Find("ul.e1ccqdb60").Length() - // synonyms := container.Find("ul.e1ccqdb60").First().Find("li").Each(func(i int, s *goquery.Selection) { - // fmt.Println(s.Find("a").Text()) - // }) - - // synonyms := [][]string{} - singleSynonymObj := Synonym{} - - // check if second synonym is available - for i := 0; i < defs.Length(); i++ { - singleGroup := []string{} - container.Find("ul").Eq(i + 1).Find("li").Each(func(i int, s *goquery.Selection) { - // fmt.Println(s.Find("a").Text()) - sn := strings.TrimSpace(strings.ReplaceAll(s.Find("a").Text(), "\n", " ")) - if len(sn) > 0 { - singleGroup = append(singleGroup, sn) - } - - }) - singleSynonymObj.Definition = tempDef[i] - singleSynonymObj.PartsOfSpeech = tempPoS[i] - singleSynonymObj.Syns = singleGroup - - finalResult.Synonyms = append(finalResult.Synonyms, singleSynonymObj) - - // synonyms = append(synonyms, singleGroup) - } - - // fmt.Println(synonyms) - - antonyms := []string{} - - // find antonyms - container.Find("#antonyms ul").Find("li").Each(func(i int, s *goquery.Selection) { - // fmt.Println(s.Find("a").Text()) - // check string - an := strings.TrimSpace(strings.ReplaceAll(s.Find("a").Text(), "\n", " ")) - - if len(an) > 0 { - antonyms = append(antonyms, an) - } - - }) - - finalResult.Antonyms = antonyms - // fmt.Println(antonyms) - - return finalResult, nil - -} diff --git a/scrapper/google.go b/scrapper/google.go index 29fa20c..64c034b 100644 --- a/scrapper/google.go +++ b/scrapper/google.go @@ -4,6 +4,7 @@ import ( "fmt" "math" "net/http" + "net/url" "os" "regexp" "strings" @@ -71,7 +72,18 @@ func GetGoogleResult(word string) (*WordStruct, int) { todaysKey := RoundRobinApiKey(API_KEY) // Request the HTML page. - res, err := http.Get(fmt.Sprintf("http://api.scraperapi.com/?api_key=%s&url=https://www.google.com/search?&hl=en&q=define+%s", todaysKey, word)) + encoded_url := url.QueryEscape(fmt.Sprintf("https://www.google.com/search?&hl=en&q=define+%s",word)) + url := fmt.Sprintf("https://api.scrape.do?token=%s&url=%s", todaysKey, encoded_url) + method := "GET" + client := &http.Client{} + req, err := http.NewRequest(method, url, nil) + if err != nil { + fmt.Println(err.Error()) + errorStatus = 400 + return &wordS, errorStatus + } + res, err := client.Do(req) + // res, err := http.Get(fmt.Sprintf("https://api.scrape.do?token=%s&url=https://www.google.com/search?&hl=en&q=define+%s", todaysKey, word)) if err != nil { fmt.Println(err.Error()) errorStatus = 400 @@ -125,7 +137,7 @@ func GetGoogleResult(word string) (*WordStruct, int) { // 4- origin // find the main word - mainWord := thirdJsSlot.Find(mainWordQueryTag).Text() + mainWord := strings.ReplaceAll(thirdJsSlot.Find(mainWordQueryTag).Text(),"ยท","") fmt.Println("main word", mainWord) // check if it has phonetics and audio in the #1 div diff --git a/merriam-webster.go b/scrapper/merriam-webster.go similarity index 99% rename from merriam-webster.go rename to scrapper/merriam-webster.go index dcbaa74..b70f7e0 100644 --- a/merriam-webster.go +++ b/scrapper/merriam-webster.go @@ -1,4 +1,4 @@ -package main +package scrapper import ( "errors" diff --git a/scrapper/thesaurus.go b/scrapper/thesaurus.go new file mode 100644 index 0000000..666a402 --- /dev/null +++ b/scrapper/thesaurus.go @@ -0,0 +1,217 @@ +package scrapper + +import ( + "errors" + "fmt" + "log" + "net/http" + "strings" + + "github.com/PuerkitoBio/goquery" + "github.com/geziyor/geziyor" + "github.com/geziyor/geziyor/client" +) + +type WordResponse struct { + Synonyms []Synonym `json:"synonyms"` + Antonyms []string `json:"antonyms"` +} + +type Synonym struct { + PartsOfSpeech string `json:"parts_of_speech"` + Definition string `json:"definition"` + Syns []string `json:"synonym"` +} + +func GetResult(word string) (WordResponse, error) { + + var finalResult WordResponse + var err error + + // temp PoS and Def + tempPoS := []string{} + tempDef := []string{} + + geziyor.NewGeziyor(&geziyor.Options{ + // StartRequestsFunc: func(g *geziyor.Geziyor) { + // g.GetRendered("https://www.thesaurus.com/browse/"+word, g.Opt.ParseFunc) + // }, + StartURLs: []string{"https://www.thesaurus.com/browse/" + word}, + ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { + + if r.StatusCode != http.StatusOK { + fmt.Println("There was an error, ", r.Status) + err = fmt.Errorf("%s", r.Status) + } + + // fmt.Println(string(r.Body)) + + root := r.HTMLDoc.Find("[data-type='thesaurus-entry-module']") + + fmt.Println("roost") + fmt.Println(root.Length()) + + // find the parts of speech with definitions + tabList := root.Find("[data-type='thesaurus-entry-tablist']") + + fmt.Println(tabList.Length()) + + tabList.Find("li").Each(func(i int, s *goquery.Selection) { + fmt.Println(s.Text()) + whole := s.Text() + pos := s.Find("em").Text() + def := strings.TrimLeft(strings.ReplaceAll(whole, pos, ""), " ") + + tempPoS = append(tempPoS, pos) + tempDef = append(tempDef, def) + + fmt.Println(def) + fmt.Println(pos) + + }) + + singleGroup := []string{} + + card := root.Find("[data-type='thesaurus-synonyms-card']") + + card.Find("li").Each(func(i int, s *goquery.Selection) { + fmt.Println(s.Text()) + sn := strings.TrimSpace(strings.ReplaceAll(s.Text(), "\n", " ")) + if len(sn) > 0 { + singleGroup = append(singleGroup, sn) + } + }) + + singleSynonymObj := Synonym{} + + if len(tempDef) > 0 { + singleSynonymObj.Definition = tempDef[0] + singleSynonymObj.PartsOfSpeech = tempPoS[0] + singleSynonymObj.Syns = singleGroup + finalResult.Synonyms = append(finalResult.Synonyms, singleSynonymObj) + + } + + // now find the antonyms + antonyms := []string{} + aCard := root.Find("[data-type='thesaurus-antonyms-card']") + fmt.Println(aCard.Length()) + aCard.Find("li").Each(func(i int, s *goquery.Selection) { + an := strings.TrimSpace(strings.ReplaceAll(s.Text(), "\n", " ")) + + if len(an) > 0 { + antonyms = append(antonyms, an) + } + }) + finalResult.Antonyms = antonyms + }, + //BrowserEndpoint: "ws://localhost:3000", + }).Start() + + return finalResult, err + + // Request the HTML page. + res, err := http.Get("https://www.thesaurus.com/browse/" + word) + if err != nil { + log.Fatal(err) + } + + fmt.Println("=========body==========") + fmt.Println(res.Status) + + defer res.Body.Close() + if res.StatusCode != 200 { + + return finalResult, errors.New(res.Status) + // log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) + } + + // Load the HTML document + doc, err := goquery.NewDocumentFromReader(res.Body) + if err != nil { + return finalResult, err + // log.Fatal(err) + } + + container := doc.Filter(".wjLcgFJpqs9M6QJsPf5v") + + fmt.Println(container.Length()) + + // container := doc.Find(".MainContentContainer") + + // inside MainContentContainer + // first ul parts of speech with definition + // second ul synonyms + // and followed by more synonyms for parts of speech + // inside #antonyms the ul is the antonyms + + // check if definition is available or not + defs := container.Find(".ew5makj1") + // defs := container.Find("ul:first-child") + + if defs.Length() == 0 { + fmt.Println("No definition available") + return finalResult, nil + } + + // not get the parts of speech + defs.Each(func(i int, s *goquery.Selection) { + // find parts of speech + // fmt.Println("parts of speech", s.Find("em").Text()) + tempPoS = append(tempPoS, s.Find("em").Text()) + // fmt.Println("meaning", s.Find("strong").Text()) + tempDef = append(tempDef, s.Find("strong").Text()) + }) + + // now find the synonyms and antonyms + + // len := container.Find("ul.e1ccqdb60").Length() + // synonyms := container.Find("ul.e1ccqdb60").First().Find("li").Each(func(i int, s *goquery.Selection) { + // fmt.Println(s.Find("a").Text()) + // }) + + // synonyms := [][]string{} + singleSynonymObj := Synonym{} + + // check if second synonym is available + for i := 0; i < defs.Length(); i++ { + singleGroup := []string{} + container.Find("ul").Eq(i + 1).Find("li").Each(func(i int, s *goquery.Selection) { + // fmt.Println(s.Find("a").Text()) + sn := strings.TrimSpace(strings.ReplaceAll(s.Find("a").Text(), "\n", " ")) + if len(sn) > 0 { + singleGroup = append(singleGroup, sn) + } + + }) + singleSynonymObj.Definition = tempDef[i] + singleSynonymObj.PartsOfSpeech = tempPoS[i] + singleSynonymObj.Syns = singleGroup + + finalResult.Synonyms = append(finalResult.Synonyms, singleSynonymObj) + + // synonyms = append(synonyms, singleGroup) + } + + // fmt.Println(synonyms) + + antonyms := []string{} + + // find antonyms + container.Find("#antonyms ul").Find("li").Each(func(i int, s *goquery.Selection) { + // fmt.Println(s.Find("a").Text()) + // check string + an := strings.TrimSpace(strings.ReplaceAll(s.Find("a").Text(), "\n", " ")) + + if len(an) > 0 { + antonyms = append(antonyms, an) + } + + }) + + finalResult.Antonyms = antonyms + // fmt.Println(antonyms) + + return finalResult, nil + +}