From 95653338bbed868e2d22c8cf5b531c5a22540635 Mon Sep 17 00:00:00 2001 From: SoggyRihno <94922205+SoggyRihno@users.noreply.github.com> Date: Thu, 2 Oct 2025 18:41:29 -0500 Subject: [PATCH] added html for scrape profiles, scrapers output dir file structure --- parser/parser.go | 5 ++-- parser/parser_test.go | 59 ++++++++++++++++++++++++++++++++---------- scrapers/coursebook.go | 19 +++++++------- scrapers/profiles.go | 57 ++++++++++++++++++++++++++++++++-------- 4 files changed, 105 insertions(+), 35 deletions(-) diff --git a/parser/parser.go b/parser/parser.go index 82b1d08..9e8496d 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -4,6 +4,7 @@ import ( "fmt" "log" "os" + "path/filepath" "time" "github.com/UTDNebula/api-tools/utils" @@ -54,10 +55,10 @@ func Parse(inDir string, outDir string, csvPath string, skipValidation bool) { } // Try to load any existing profile data - loadProfiles(inDir) + loadProfiles(filepath.Join(inDir, "profiles")) // Find paths of all scraped data - paths := utils.GetAllFilesWithExtension(inDir, ".html") + paths := utils.GetAllFilesWithExtension(filepath.Join(inDir, "coursebook"), ".html") if !skipValidation { log.Printf("Parsing and validating %d files...", len(paths)) } else { diff --git a/parser/parser_test.go b/parser/parser_test.go index 95ccf86..fdaf36a 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -132,14 +132,13 @@ func updateTestData() error { //doesn't do anything since there is no profile data loadProfiles("") - tempDir, err := os.MkdirTemp("", "testdata-*") + tempResultDir, err := os.MkdirTemp("", "testdata-*") if err != nil { - log.Fatal(err) + log.Fatalf("Failed to create temporary directory: %v", err) } - defer os.RemoveAll(tempDir) + defer os.RemoveAll(tempResultDir) //Fill temp dir with all the test cases and expected values - duplicates := make(map[string]bool) for i, input := range utils.GetAllFilesWithExtension("testdata", ".html") { @@ -188,7 +187,7 @@ func updateTestData() error { } 
classInfo := getClassInfo(doc) - caseDir := filepath.Join(tempDir, fmt.Sprintf("case_%03d", i)) + caseDir := filepath.Join(tempResultDir, fmt.Sprintf("case_%03d", i)) if err = os.Mkdir(caseDir, 0777); err != nil { return fmt.Errorf("failed to create directory: %v", err) } @@ -218,26 +217,53 @@ func updateTestData() error { clearGlobals() } - //rerun parser to get Courses.json, Sections.json, Professors.json - - //Parse(tempDir, tempDir, "../grade-data", false) - //Grade data isn't work with tests currently - Parse(tempDir, tempDir, "", false) + input, err := createSampleInput() + if err != nil { + return fmt.Errorf("failed to create sample input for Parse: %v", err) + } + defer os.RemoveAll(input) + Parse(input, tempResultDir, "", false) //overwrite the current test data with the new data if err := os.RemoveAll("testdata"); err != nil { return fmt.Errorf("failed to remove testdata: %v", err) } - if err := os.CopyFS("testdata", os.DirFS(tempDir)); err != nil { + if err := os.CopyFS("testdata", os.DirFS(tempResultDir)); err != nil { return fmt.Errorf("failed to copy testdata: %v", err) } - //reset maps to avoid side effects. maybe parser should be an object? 
clearGlobals() return nil } +func createSampleInput() (string, error) { + tempInputDir, err := os.MkdirTemp("", "input-*") + if err != nil { + log.Fatalf("Failed to create temporary input directory: %v", err) + } + + if err = os.Mkdir(filepath.Join(tempInputDir, "coursebook"), 0777); err != nil { + log.Fatalf("Failed to create course book directory in temp input dir: %v", err) + } + // for future test data + if err = os.Mkdir(filepath.Join(tempInputDir, "profiles"), 0777); err != nil { + log.Fatalf("Failed to create profiles directory in temp input dir: %v", err) + } + + for i, input := range utils.GetAllFilesWithExtension("testdata", ".html") { + data, err := os.ReadFile(input) + if err != nil { + return "", fmt.Errorf("failed to load test data: %v", err) + } + err = os.WriteFile(filepath.Join(tempInputDir, "coursebook", fmt.Sprintf("input%03d.html", i)), data, 0777) + if err != nil { + return "", fmt.Errorf("failed to write test data: %v", err) + } + } + return tempInputDir, nil +} + func clearGlobals() { Sections = make(map[primitive.ObjectID]*schema.Section) Courses = make(map[string]*schema.Course) @@ -249,8 +275,15 @@ func clearGlobals() { func TestParse(t *testing.T) { tempDir := t.TempDir() + + input, err := createSampleInput() + if err != nil { + t.Errorf("failed to create sample input for Parse: %v", err) + } + defer os.RemoveAll(input) + // todo fix grade data, csvPath = ./grade-data panics - Parse("testdata", tempDir, "", false) + Parse(input, tempDir, "", false) OutputCourses, err := unmarshallFile[[]schema.Course](filepath.Join(tempDir, "courses.json")) if err != nil { diff --git a/scrapers/coursebook.go b/scrapers/coursebook.go index 4f6119c..0e8919c 100644 --- a/scrapers/coursebook.go +++ b/scrapers/coursebook.go @@ -30,6 +30,8 @@ var ( ) const ( + coursebookDir = "coursebook" + reqThrottle = 400 * time.Millisecond prefixThrottle = 5 * time.Second httpTimeout = 10 * time.Second @@ -153,7 +155,7 @@ func (s *coursebookScraper) lastCompletePrefix()
string { log.Fatal(err) } - dir, err := os.ReadDir(filepath.Join(s.outDir, s.term)) + dir, err := os.ReadDir(filepath.Join(s.outDir, coursebookDir, s.term)) if err != nil { log.Fatalf("failed to read output directory: %v", err) } @@ -179,26 +181,25 @@ func (s *coursebookScraper) lastCompletePrefix() string { return "" } -// ensurePrefixFolder creates {outDir}/term if it does not exist - +// ensurePrefixFolder creates {outDir}/coursebookDir/term if it does not exist func (s *coursebookScraper) ensureOutputFolder() error { - if err := os.MkdirAll(filepath.Join(s.outDir, s.term), 0755); err != nil { + if err := os.MkdirAll(filepath.Join(s.outDir, coursebookDir, s.term), 0755); err != nil { return fmt.Errorf("failed to create term forlder: %w", err) } return nil } -// ensurePrefixFolder creates {outDir}/term/prefix if it does not exist +// ensurePrefixFolder creates {outDir}/coursebookDir/term/prefix if it does not exist func (s *coursebookScraper) ensurePrefixFolder(prefix string) error { - if err := os.MkdirAll(filepath.Join(s.outDir, s.term, prefix), 0755); err != nil { + if err := os.MkdirAll(filepath.Join(s.outDir, coursebookDir, s.term, prefix), 0755); err != nil { return fmt.Errorf("failed to create folder for %s: %w", prefix, err) } return nil } -// writeSection writes content to file {outDir}/term/prefix/{id}.html +// writeSection writes content to file {outDir}/coursebookDir/term/prefix/{id}.html func (s *coursebookScraper) writeSection(prefix string, id string, content string) error { - if err := os.WriteFile(filepath.Join(s.outDir, s.term, prefix, id+".html"), []byte(content), 0644); err != nil { + if err := os.WriteFile(filepath.Join(s.outDir, coursebookDir, s.term, prefix, id+".html"), []byte(content), 0644); err != nil { return fmt.Errorf("failed to write section %s: %w", id, err) } return nil @@ -219,7 +220,7 @@ func (s *coursebookScraper) getSectionContent(id string) (string, error) { // getMissingIdsForPrefix calls getSectionIdsForPrefix and filters 
out the ids that already // exist in the prefix directory func (s *coursebookScraper) getMissingIdsForPrefix(prefix string) ([]string, error) { - path := filepath.Join(s.outDir, s.term, prefix) + path := filepath.Join(s.outDir, coursebookDir, s.term, prefix) sectionIds, err := s.getSectionIdsForPrefix(prefix) if err != nil { diff --git a/scrapers/profiles.go b/scrapers/profiles.go index bbdfffc..660fc26 100644 --- a/scrapers/profiles.go +++ b/scrapers/profiles.go @@ -11,10 +11,13 @@ import ( "fmt" "log" "os" + "path/filepath" "regexp" "strconv" "strings" + "github.com/chromedp/cdproto/dom" + "github.com/UTDNebula/api-tools/utils" "github.com/UTDNebula/nebula-api/api/schema" "github.com/chromedp/cdproto/cdp" @@ -23,10 +26,11 @@ import ( "go.mongodb.org/mongo-driver/bson/primitive" ) -const BASE_URL string = "https://profiles.utdallas.edu/browse?page=" +const BaseUrl string = "https://profiles.utdallas.edu/browse?page=" +const ProfilesDir string = "profiles" -var primaryLocationRegex *regexp.Regexp = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-z]?)$`) -var fallbackLocationRegex *regexp.Regexp = regexp.MustCompile(`^([A-z]+)(\d+)\.?(\d{3}[A-z]?)$`) +var primaryLocationRegex = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-z]?)$`) +var fallbackLocationRegex = regexp.MustCompile(`^([A-z]+)(\d+)\.?(\d{3}[A-z]?)$`) func parseLocation(text string) schema.Location { var building string @@ -99,7 +103,7 @@ func getNodeText(node *cdp.Node) string { func scrapeProfessorLinks(chromedpCtx context.Context) []string { var pageLinks []*cdp.Node _, err := chromedp.RunResponse(chromedpCtx, - chromedp.Navigate(BASE_URL+"1"), + chromedp.Navigate(BaseUrl+"1"), chromedp.QueryAfter(".page-link", func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { pageLinks = nodes @@ -119,7 +123,7 @@ func scrapeProfessorLinks(chromedpCtx context.Context) []string { professorLinks := make([]string, 0, numPages) for curPage := 1; curPage <= numPages; curPage++ { _, err := 
chromedp.RunResponse(chromedpCtx, - chromedp.Navigate(BASE_URL+strconv.Itoa(curPage)), + chromedp.Navigate(BaseUrl+strconv.Itoa(curPage)), chromedp.QueryAfter("//h5[@class='card-title profile-name']//a", func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { for _, node := range nodes { @@ -146,8 +150,8 @@ func ScrapeProfiles(outDir string) { chromedpCtx, cancel := utils.InitChromeDp() defer cancel() - err := os.MkdirAll(outDir, 0777) - if err != nil { + resultDir := filepath.Join(outDir, ProfilesDir) + if err := os.MkdirAll(resultDir, 0777); err != nil { panic(err) } @@ -158,13 +162,24 @@ func ScrapeProfiles(outDir string) { log.Print("Scraped professor links!") for _, link := range professorLinks { + utils.VPrint("Scraping name...") - // Navigate to the link and get the names - var firstName, lastName string + html, err := getOuterHtml(chromedpCtx, link) + if err != nil { + log.Fatalf("Failed to scrape link %s: %v", link, err) + } - utils.VPrint("Scraping name...") + name := link[strings.LastIndex(link, "/"):] + if err = os.WriteFile(filepath.Join(resultDir, name+".html"), []byte(html), 0644); err != nil { + log.Fatalf("Failed to save html for %s: %v", name, err) + return + } - _, err := chromedp.RunResponse(chromedpCtx, + /// Everything below should be moved to parser + + // Navigate to the link and get the names + var firstName, lastName string + _, err = chromedp.RunResponse(chromedpCtx, chromedp.Navigate(link), chromedp.ActionFunc(func(ctx context.Context) error { var text string @@ -301,3 +316,23 @@ func ScrapeProfiles(outDir string) { encoder.Encode(professors) fptr.Close() } + +func getOuterHtml(chromedpCtx context.Context, url string) (string, error) { + var html string + err := chromedp.Run(chromedpCtx, + chromedp.Navigate(url), + chromedp.ActionFunc(func(ctx context.Context) error { + node, err := dom.GetDocument().Do(ctx) + if err != nil { + return err + } + html, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx) + 
return err + }), + ) + + if err != nil { + return "", fmt.Errorf("failed to get outerHtml for page %s: %w", url, err) + } + return html, nil +}