
Commit

scrape
suranisaunak committed Jun 27, 2023
1 parent 242528d commit 24818dc
Showing 3 changed files with 128 additions and 0 deletions.
5 changes: 5 additions & 0 deletions go.mod
@@ -0,0 +1,5 @@
module main

go 1.19

require github.com/gorilla/mux v1.8.0
2 changes: 2 additions & 0 deletions go.sum
@@ -0,0 +1,2 @@
github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
121 changes: 121 additions & 0 deletions main.go
@@ -0,0 +1,121 @@
package main

import (
	"encoding/json"
	"io/ioutil"
	"log"
	"net/http"
	"regexp"

	"github.com/gorilla/mux"
)

func main() {
	router := mux.NewRouter()
	router.HandleFunc("/scrape", scrapeHandler).Methods("POST")
	log.Fatal(http.ListenAndServe(":8000", router))
}

func scrapeHandler(w http.ResponseWriter, r *http.Request) {
	// Read the URL from the request body
	bodyBytes, err := ioutil.ReadAll(r.Body)
	if err != nil {
		http.Error(w, "Failed to read request body", http.StatusBadRequest)
		return
	}
	defer r.Body.Close()

	websiteURL := string(bodyBytes)

	// Fetch the website content
	resp, err := http.Get(websiteURL)
	if err != nil {
		http.Error(w, "Failed to fetch website content", http.StatusInternalServerError)
		return
	}
	defer resp.Body.Close()

	// Read the response body
	bodyBytes, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		http.Error(w, "Failed to read response body", http.StatusInternalServerError)
		return
	}
	body := string(bodyBytes)

	// Extract unique URLs from the website
	urls := extractUniqueURLs(body)

	// Extract unique emails from the website
	emails := extractUniqueEmails(body)

	// Create a response payload
	response := struct {
		URLs   []string `json:"urls"`
		Emails []string `json:"emails"`
	}{
		URLs:   urls,
		Emails: emails,
	}

	// Send the response as JSON
	w.Header().Set("Content-Type", "application/json")
	err = json.NewEncoder(w).Encode(response)
	if err != nil {
		http.Error(w, "Failed to encode response", http.StatusInternalServerError)
		return
	}
}

func extractUniqueURLs(body string) []string {
	// Define a regular expression pattern to match URLs
	urlPattern := regexp.MustCompile(`(https?://\S+)`)

	// Find all matches of URLs in the body
	urlMatches := urlPattern.FindAllStringSubmatch(body, -1)

	// Extract the URLs from the matches
	var urls []string
	for _, match := range urlMatches {
		url := match[1]
		urls = append(urls, url)
	}

	// Remove duplicate URLs
	urls = removeDuplicates(urls)

	return urls
}

func extractUniqueEmails(body string) []string {
	// Define a regular expression pattern to match emails
	emailPattern := regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`)

	// Find all matches of emails in the body
	emailMatches := emailPattern.FindAllString(body, -1)

	// Remove duplicate emails
	emails := removeDuplicates(emailMatches)

	return emails
}

func removeDuplicates(items []string) []string {
	// Create a map to track unique items
	uniqueMap := make(map[string]bool)

	// Iterate over the items and add them to the map
	for _, item := range items {
		uniqueMap[item] = true
	}

	// Create a new slice to store the unique items
	var uniqueItems []string

	// Append the unique items to the new slice
	for item := range uniqueMap {
		uniqueItems = append(uniqueItems, item)
	}

	return uniqueItems
}
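
For reference, a minimal client sketch that exercises the new /scrape endpoint, assuming the server above is running locally on port 8000. It posts a target URL as the raw request body, which is what scrapeHandler expects, and prints the JSON payload of unique URLs and emails. The example target https://example.com is only an illustration and is not part of the commit.

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"
)

func main() {
	// Send the page to scrape as the raw request body.
	resp, err := http.Post("http://localhost:8000/scrape", "text/plain",
		strings.NewReader("https://example.com"))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// Print the JSON response: {"urls": [...], "emails": [...]}.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(body))
}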
