From 77e2f1f7d787baaa761cb4bed965fc0391a7989f Mon Sep 17 00:00:00 2001
From: Ray
Date: Thu, 4 Jan 2024 21:40:39 +1100
Subject: [PATCH] Can scrape hrefs

---
 src/lib.rs               |   6 +-
 src/main.rs              |  12 +--
 src/scraper.rs           | 185 +++++++++++++++++++++++++++------------
 src/url_invalid_error.rs |   1 -
 4 files changed, 137 insertions(+), 67 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 33a5a85..ddac5bb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,7 +1,5 @@
-mod url_invalid_error;
 mod scraper;
+mod url_invalid_error;
 
-
-
+pub use scraper::Scraper;
 pub use url_invalid_error::UrlInvalidError;
-pub use scraper::Scraper;
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index 74e18e8..08d50a5 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,14 +1,14 @@
 use spooderman::Scraper;
 
-
 #[tokio::main]
 async fn main() {
-    let mut scraper = Scraper::new()
-        .set_url("https://timetable.unsw.edu.au/2024/subjectSearch.html".to_string());
-
+    let mut scraper =
+        Scraper::new().set_url("https://timetable.unsw.edu.au/2024/subjectSearch.html".to_string());
+
     match scraper.run_scraper().await {
-        Ok(_res) => {println!("Scraping successful!\n");
-        },
+        Ok(_res) => {
+            println!("Scraping successful!\n");
+        }
         Err(e) => eprintln!("Error: {}", e),
     }
 }
diff --git a/src/scraper.rs b/src/scraper.rs
index 01c7ada..3d38159 100644
--- a/src/scraper.rs
+++ b/src/scraper.rs
@@ -1,58 +1,82 @@
-use std::ops::Add;
 use chrono::{DateTime, Utc};
 use reqwest::ClientBuilder;
-use scraper::Html;
+use scraper::{html, ElementRef, Selector};
+use std::ops::Add;
 
 use crate::UrlInvalidError;
 
-
 #[derive(Debug)]
 enum Term {
-    T1, T2, T3, Summer
+    T1,
+    T2,
+    T3,
+    Summer,
 }
 
 #[derive(Debug)]
-enum Status { Open, Closed }
+enum Status {
+    Open,
+    Closed,
+}
 
 #[derive(Debug)]
-struct Enrolment { enrolled: u32, capacity: u32 }
+struct Enrolment {
+    enrolled: u32,
+    capacity: u32,
+}
 
 #[derive(Debug)]
-struct TimeBlock { start: (u32, u32), end: (u32, u32) }
+struct TimeBlock {
+    start: (u32, u32),
+    end: (u32, u32),
+}
 
 impl Add for TimeBlock {
     type Output = TimeBlock;
-    
+
     fn add(self, another: TimeBlock) -> Self {
         let add_hours = |a, b| (a + b) % 24;
         let add_minutes = |a, b| (a + b) % 60;
         Self {
-            start: (add_hours(self.start.0, another.start.0), add_minutes(self.start.1, another.start.1)),
-            end: (add_hours(self.end.0, another.end.0), add_minutes(self.end.1, another.end.1))
+            start: (
+                add_hours(self.start.0, another.start.0),
+                add_minutes(self.start.1, another.start.1),
+            ),
+            end: (
+                add_hours(self.end.0, another.end.0),
+                add_minutes(self.end.1, another.end.1),
+            ),
         }
     }
 }
 
 #[derive(Debug)]
-struct DateBlock { start: DateTime<Utc>, end: DateTime<Utc> }
-
+struct DateBlock {
+    start: DateTime<Utc>,
+    end: DateTime<Utc>,
+}
 
 #[derive(Debug)]
-enum Day { 
-    Sunday, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday
+enum Day {
+    Sunday,
+    Monday,
+    Tuesday,
+    Wednesday,
+    Thursday,
+    Friday,
+    Saturday,
 }
 
 #[derive(Debug)]
-struct ClassTimeBlock { 
-    day: Day, 
-    weeks: String, 
-    time: TimeBlock, 
-    location: String, 
+pub struct ClassTimeBlock {
+    day: Day,
+    weeks: String,
+    time: TimeBlock,
+    location: String,
 }
 
-
 #[derive(Debug)]
-struct Class {
+pub struct Class {
     class_id: u32,
     section: String,
     term: Term,
@@ -61,14 +85,18 @@ struct Class {
     course_enrolment: Enrolment,
     term_date: DateBlock,
     mode: String,
-    times: Vec<ClassTimeBlock>
+    times: Vec<ClassTimeBlock>,
 }
 
 #[derive(Debug)]
-enum Career { UG, PG, RESEARCH }
+enum Career {
+    UG,
+    PG,
+    RESEARCH,
+}
 
 #[derive(Debug)]
-struct Course {
+pub struct Course {
     code: String,
     name: String,
     campus: Career,
@@ -80,27 +108,25 @@ struct Course {
 }
 
 #[derive(Debug)]
-struct Page { 
-    url: String, 
-    subject_area_course_code: String, 
-    subject_area_course_name: String, 
-    school: String, 
-    courses: Vec<Course>,
+pub struct Page {
+    url: String,
+    subject_area_course_code: String,
+    subject_area_course_name: String,
+    school: String,
+    courses: Vec<Course>,
 }
 
-
 #[derive(Debug)]
 pub struct Scraper {
     url: Option<String>,
     pages: Option<Vec<Page>>,
 }
 
-
 impl Scraper {
     pub fn new() -> Self {
         Scraper {
             url: None,
-            pages: None,
+            pages: Some(Vec::new()),
         }
     }
 
@@ -116,35 +142,82 @@ impl Scraper {
         self
     }
 
-
-    async fn fetch_url(&self, url: &str) -> Result<String, Box<dyn std::error::Error>> {
-        let client = ClientBuilder::new().danger_accept_invalid_certs(true).build()?;
-        let response = client.get(url).send().await?;
-        let body = response.text().await?;
-        Ok(body)
+    async fn fetch_url(&self, url: &str) -> Result<String, Box<dyn std::error::Error>> {
+        let client = ClientBuilder::new()
+            .danger_accept_invalid_certs(true)
+            .build()?;
+        let response = client.get(url).send().await?;
+        let body = response.text().await?;
+        Ok(body)
     }
 
-    pub async fn run_scraper(&mut self) -> Result<Html, Box<dyn std::error::Error>> {
-        match &self.url {
-            Some(url) => {
-                let html = self.fetch_url(url).await?;
-                println!("{}", html);
-                let html_course_selector = scraper::Selector::parse("tr.rowLowlight td.data").unwrap();
-                let doc = scraper::Html::parse_document(&html);
-                let res: Vec<_> = doc.select(&html_course_selector).flat_map(|el| el.text()).collect();
-                println!("{:?}", res);
-                Ok(doc)
-            }
-            None => {
-                Err(Box::new(UrlInvalidError))
+    pub async fn run_scraper(&mut self) -> Result<(), Box<dyn std::error::Error>> {
+        match &self.url {
+            Some(url) => {
+                let html = self.fetch_url(url).await?;
+                println!("{}", html);
+                let row_selector = Selector::parse("tr.rowLowlight, tr.rowHighlight").unwrap();
+                let code_selector = Selector::parse("td.data").unwrap();
+                let name_selector = Selector::parse("td.data a").unwrap();
+                let link_selector = Selector::parse("td.data a").unwrap();
+                let school_selector = Selector::parse("td.data:nth-child(3)").unwrap();
+                let document = scraper::Html::parse_document(&html);
+                for row_node in document.select(&row_selector) {
+                    // Extract data from each row
+                    let subject_area_course_code =
+                        extract_text(row_node.select(&code_selector).next().unwrap());
+                    let subject_area_course_name =
+                        extract_text(row_node.select(&name_selector).next().unwrap());
+                    let url = get_html_link_to_page(
+                        row_node
+                            .select(&link_selector)
+                            .next()
+                            .map_or("", |node| node.value().attr("href").unwrap_or("")),
+                    );
+                    let school = extract_text(row_node.select(&school_selector).next().unwrap());
+                    // Create a Course struct and push it to the vector
+                    let page = Page {
+                        subject_area_course_code,
+                        subject_area_course_name,
+                        url,
+                        school,
+                        courses: Vec::new(),
+                    };
+
+                    match &mut self.pages {
+                        Some(curr_pages) => {
+                            curr_pages.push(page);
+                        }
+                        None => {
+                            self.pages = Some(vec![page]);
+                        }
+                    }
+                }
+
+                println!("{:?}", self.pages);
+                Ok(())
+            }
+            None => Err(Box::new(UrlInvalidError)),
         }
-    }
-    
-}
+    }
 }
 
 impl Scraper {
     pub fn view_scraper(&self) {
         println!("{:?}", self);
     }
-}
\ No newline at end of file
+}
+
+impl Default for Scraper {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+fn extract_text(node: ElementRef) -> String {
+    node.text().collect::<String>()
+}
+
+fn get_html_link_to_page(html_fragment: &str) -> String {
+    "https://timetable.unsw.edu.au/2024/".to_string() + html_fragment
+}
diff --git a/src/url_invalid_error.rs b/src/url_invalid_error.rs
index 3165a7e..11c6762 100644
--- a/src/url_invalid_error.rs
+++ b/src/url_invalid_error.rs
@@ -1,4 +1,3 @@
-
 #[derive(Debug)]
 pub struct UrlInvalidError;
 
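
Reviewer note, not part of the patch: below is a minimal standalone sketch of
the selector-driven extraction the new run_scraper() performs, for anyone who
wants to try it without the full crate. The HTML fragment is hypothetical; it
only mirrors the tr.rowLowlight / td.data markup the selectors target, with a
made-up ACCT row. The selectors and the extract_text() helper match what the
patch introduces.

    use scraper::{ElementRef, Html, Selector};

    // Same helper the patch adds at the bottom of scraper.rs.
    fn extract_text(node: ElementRef) -> String {
        node.text().collect::<String>()
    }

    fn main() {
        // Hypothetical fragment mirroring the subject-search table's markup.
        let html = r#"
            <table>
                <tr class="rowLowlight">
                    <td class="data">ACCT</td>
                    <td class="data"><a href="ACCTKENS.html">Accounting</a></td>
                    <td class="data">Business School</td>
                </tr>
            </table>
        "#;

        let document = Html::parse_document(html);
        let row_selector = Selector::parse("tr.rowLowlight, tr.rowHighlight").unwrap();
        let code_selector = Selector::parse("td.data").unwrap();
        let link_selector = Selector::parse("td.data a").unwrap();

        for row in document.select(&row_selector) {
            // First td.data in the row holds the subject area code.
            let code = extract_text(row.select(&code_selector).next().unwrap());
            // hrefs are relative (e.g. "ACCTKENS.html"); a missing anchor falls back to "".
            let href = row
                .select(&link_selector)
                .next()
                .map_or("", |a| a.value().attr("href").unwrap_or(""));
            println!("{} -> https://timetable.unsw.edu.au/2024/{}", code, href);
        }
    }

The map_or("", ...) fallback mirrors the patch and keeps a row without an
anchor from panicking; the .next().unwrap() calls, also as in the patch, still
assume every matched row has the expected td.data cells.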