From 77e2f1f7d787baaa761cb4bed965fc0391a7989f Mon Sep 17 00:00:00 2001
From: Ray
Date: Thu, 4 Jan 2024 21:40:39 +1100
Subject: [PATCH] Can scrape hrefs

---
 src/lib.rs               |   6 +-
 src/main.rs              |  12 +--
 src/scraper.rs           | 185 +++++++++++++++++++++++++++------------
 src/url_invalid_error.rs |   1 -
 4 files changed, 137 insertions(+), 67 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 33a5a85..ddac5bb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,7 +1,5 @@
-mod url_invalid_error;
 mod scraper;
+mod url_invalid_error;
 
-
-
+pub use scraper::Scraper;
 pub use url_invalid_error::UrlInvalidError;
-pub use scraper::Scraper;
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index 74e18e8..08d50a5 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,14 +1,14 @@
 use spooderman::Scraper;
 
-
 #[tokio::main]
 async fn main() {
-    let mut scraper = Scraper::new()
-        .set_url("https://timetable.unsw.edu.au/2024/subjectSearch.html".to_string());
-
+    let mut scraper =
+        Scraper::new().set_url("https://timetable.unsw.edu.au/2024/subjectSearch.html".to_string());
+
     match scraper.run_scraper().await {
-        Ok(_res) => {println!("Scraping successful!\n");
-        },
+        Ok(_res) => {
+            println!("Scraping successful!\n");
+        }
         Err(e) => eprintln!("Error: {}", e),
     }
 }
diff --git a/src/scraper.rs b/src/scraper.rs
index 01c7ada..3d38159 100644
--- a/src/scraper.rs
+++ b/src/scraper.rs
@@ -1,58 +1,82 @@
-use std::ops::Add;
 use chrono::{DateTime, Utc};
 use reqwest::ClientBuilder;
-use scraper::Html;
+use scraper::{html, ElementRef, Selector};
+use std::ops::Add;
 
 use crate::UrlInvalidError;
 
-
 #[derive(Debug)]
 enum Term {
-    T1, T2, T3, Summer
+    T1,
+    T2,
+    T3,
+    Summer,
 }
 
 #[derive(Debug)]
-enum Status { Open, Closed }
+enum Status {
+    Open,
+    Closed,
+}
 
 #[derive(Debug)]
-struct Enrolment { enrolled: u32, capacity: u32 }
+struct Enrolment {
+    enrolled: u32,
+    capacity: u32,
+}
 
 #[derive(Debug)]
-struct TimeBlock { start: (u32, u32), end: (u32, u32) }
+struct TimeBlock {
+    start: (u32, u32),
+    end: (u32, u32),
+}
 
 impl Add for TimeBlock {
     type Output = TimeBlock;
-    
+
     fn add(self, another: TimeBlock) -> Self {
         let add_hours = |a, b| (a + b) % 24;
         let add_minutes = |a, b| (a + b) % 60;
         Self {
-            start: (add_hours(self.start.0, another.start.0), add_minutes(self.start.1, another.start.1)),
-            end: (add_hours(self.end.0, another.end.0), add_minutes(self.end.1, another.end.1))
+            start: (
+                add_hours(self.start.0, another.start.0),
+                add_minutes(self.start.1, another.start.1),
+            ),
+            end: (
+                add_hours(self.end.0, another.end.0),
+                add_minutes(self.end.1, another.end.1),
+            ),
         }
     }
 }
 
 #[derive(Debug)]
-struct DateBlock { start: DateTime<Utc>, end: DateTime<Utc> }
-
+struct DateBlock {
+    start: DateTime<Utc>,
+    end: DateTime<Utc>,
+}
 
 #[derive(Debug)]
-enum Day { 
-    Sunday, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday
+enum Day {
+    Sunday,
+    Monday,
+    Tuesday,
+    Wednesday,
+    Thursday,
+    Friday,
+    Saturday,
 }
 
 #[derive(Debug)]
-struct ClassTimeBlock { 
-    day: Day, 
-    weeks: String, 
-    time: TimeBlock, 
-    location: String, 
+pub struct ClassTimeBlock {
+    day: Day,
+    weeks: String,
+    time: TimeBlock,
+    location: String,
 }
 
-
 #[derive(Debug)]
-struct Class {
+pub struct Class {
     class_id: u32,
     section: String,
     term: Term,
@@ -61,14 +85,18 @@ struct Class {
     course_enrolment: Enrolment,
     term_date: DateBlock,
     mode: String,
-    times: Vec<ClassTimeBlock>
+    times: Vec<ClassTimeBlock>,
 }
 
 #[derive(Debug)]
-enum Career { UG, PG, RESEARCH }
+enum Career {
+    UG,
+    PG,
+    RESEARCH,
+}
 
 #[derive(Debug)]
-struct Course {
+pub struct Course {
     code: String,
     name: String,
     campus: Career,
@@ -80,27 +108,25 @@ struct Course {
 }
 
 #[derive(Debug)]
-struct Page { 
-    url: String, 
-    subject_area_course_code: String, 
-    subject_area_course_name: String, 
-    school: String, 
-    courses: Vec<Course>,
+pub struct Page {
+    url: String,
+    subject_area_course_code: String,
+    subject_area_course_name: String,
+    school: String,
+    courses: Vec<Course>,
 }
 
-
 #[derive(Debug)]
 pub struct Scraper {
     url: Option<String>,
     pages: Option<Vec<Page>>,
 }
 
-
 impl Scraper {
     pub fn new() -> Self {
         Scraper {
             url: None,
-            pages: None,
+            pages: Some(Vec::new()),
         }
     }
 
@@ -116,35 +142,82 @@ impl Scraper {
         self
     }
 
-
-    async fn fetch_url(&self, url: &str) -> Result<String, Box<dyn std::error::Error>> {
-        let client = ClientBuilder::new().danger_accept_invalid_certs(true).build()?;
-        let response = client.get(url).send().await?;
-        let body = response.text().await?;
-        Ok(body)
+    async fn fetch_url(&self, url: &str) -> Result<String, Box<dyn std::error::Error>> {
+        let client = ClientBuilder::new()
+            .danger_accept_invalid_certs(true)
+            .build()?;
+        let response = client.get(url).send().await?;
+        let body = response.text().await?;
+        Ok(body)
     }
 
-    pub async fn run_scraper(&mut self) -> Result<Html, Box<dyn std::error::Error>> {
-        match &self.url {
-            Some(url) => {
-                let html = self.fetch_url(url).await?;
-                println!("{}", html);
-                let html_course_selector = scraper::Selector::parse("tr.rowLowlight td.data").unwrap();
-                let doc = scraper::Html::parse_document(&html);
-                let res: Vec<_> = doc.select(&html_course_selector).flat_map(|el| el.text()).collect();
-                println!("{:?}", res);
-                Ok(doc)
-            }
-            None => {
-                Err(Box::new(UrlInvalidError))
+    pub async fn run_scraper(&mut self) -> Result<(), Box<dyn std::error::Error>> {
+        match &self.url {
+            Some(url) => {
+                let html = self.fetch_url(url).await?;
+                println!("{}", html);
+                let row_selector = Selector::parse("tr.rowLowlight, tr.rowHighlight").unwrap();
+                let code_selector = Selector::parse("td.data").unwrap();
+                let name_selector = Selector::parse("td.data a").unwrap();
+                let link_selector = Selector::parse("td.data a").unwrap();
+                let school_selector = Selector::parse("td.data:nth-child(3)").unwrap();
+                let document = scraper::Html::parse_document(&html);
+                for row_node in document.select(&row_selector) {
+                    // Extract data from each row
+                    let subject_area_course_code =
+                        extract_text(row_node.select(&code_selector).next().unwrap());
+                    let subject_area_course_name =
+                        extract_text(row_node.select(&name_selector).next().unwrap());
+                    let url = get_html_link_to_page(
+                        row_node
+                            .select(&link_selector)
+                            .next()
+                            .map_or("", |node| node.value().attr("href").unwrap_or("")),
+                    );
+                    let school = extract_text(row_node.select(&school_selector).next().unwrap());
+                    // Create a Course struct and push it to the vector
+                    let page = Page {
+                        subject_area_course_code,
+                        subject_area_course_name,
+                        url,
+                        school,
+                        courses: Vec::new(),
+                    };
+
+                    match &mut self.pages {
+                        Some(curr_pages) => {
+                            curr_pages.push(page);
+                        }
+                        None => {
+                            self.pages = Some(vec![page]);
+                        }
+                    }
+                }
+
+                println!("{:?}", self.pages);
+                Ok(())
+            }
+            None => Err(Box::new(UrlInvalidError)),
         }
-    }
-    
-}
+    }
 }
 
 impl Scraper {
     pub fn view_scraper(&self) {
         println!("{:?}", self);
     }
-}
\ No newline at end of file
+}
+
+impl Default for Scraper {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+fn extract_text(node: ElementRef) -> String {
+    node.text().collect::<String>()
+}
+
+fn get_html_link_to_page(html_fragment: &str) -> String {
+    "https://timetable.unsw.edu.au/2024/".to_string() + html_fragment
+}
diff --git a/src/url_invalid_error.rs b/src/url_invalid_error.rs
index 3165a7e..11c6762 100644
--- a/src/url_invalid_error.rs
+++ b/src/url_invalid_error.rs
@@ -1,4 +1,3 @@
-
 #[derive(Debug)]
 pub struct UrlInvalidError;
 
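
Reviewer note, not part of the patch: below is a minimal standalone sketch of
the selector-driven extraction the new run_scraper() performs, for anyone who
wants to try it without the full crate. The HTML fragment is hypothetical; it
only mirrors the tr.rowLowlight / td.data markup the selectors target, with a
made-up ACCT row. The selectors and the extract_text() helper match what the
patch introduces.

    use scraper::{ElementRef, Html, Selector};

    // Same helper the patch adds at the bottom of scraper.rs.
    fn extract_text(node: ElementRef) -> String {
        node.text().collect::<String>()
    }

    fn main() {
        // Hypothetical fragment mirroring the subject-search table's markup.
        let html = r#"
            <table>
                <tr class="rowLowlight">
                    <td class="data">ACCT</td>
                    <td class="data"><a href="ACCTKENS.html">Accounting</a></td>
                    <td class="data">Business School</td>
                </tr>
            </table>
        "#;

        let document = Html::parse_document(html);
        let row_selector = Selector::parse("tr.rowLowlight, tr.rowHighlight").unwrap();
        let code_selector = Selector::parse("td.data").unwrap();
        let link_selector = Selector::parse("td.data a").unwrap();

        for row in document.select(&row_selector) {
            // First td.data in the row holds the subject area code.
            let code = extract_text(row.select(&code_selector).next().unwrap());
            // hrefs are relative (e.g. "ACCTKENS.html"); a missing anchor falls back to "".
            let href = row
                .select(&link_selector)
                .next()
                .map_or("", |a| a.value().attr("href").unwrap_or(""));
            println!("{} -> https://timetable.unsw.edu.au/2024/{}", code, href);
        }
    }

The map_or("", ...) fallback mirrors the patch and keeps a row without an
anchor from panicking; the .next().unwrap() calls, also as in the patch, still
assume every matched row has the expected td.data cells.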