
Commit

debugged
Rayahhhmed committed Aug 24, 2024
1 parent 6e470db commit bbf3070
Showing 7 changed files with 205 additions and 322 deletions.
268 changes: 106 additions & 162 deletions SomeFile.txt

Large diffs are not rendered by default.

87 changes: 18 additions & 69 deletions src/class_scraper.rs
@@ -1,29 +1,14 @@
use std::collections::HashMap;

use log::info;
use scraper::{Html, Selector};
use select::document;
use scraper::Selector;

use crate::{
scraper::{fetch_url},
text_manipulators::{extract_text, get_html_link_to_page},
Scraper, UrlInvalidError,
scraper::fetch_url,
text_manipulators::extract_text,
};

#[derive(Debug)]
pub enum Career {
UG,
PG,
RESEARCH,
}

#[derive(Debug)]
pub enum Term {
T1,
T2,
T3,
Summer,
}

#[derive(Debug)]
pub struct Course {
@@ -34,10 +19,8 @@ pub struct Course {
school: Option<String>,
campus: Option<String>,
career: Option<String>,
terms: Vec<Term>,
census_dates: Vec<String>,
terms: Vec<String>,
classes: Vec<Class>,
notes: Option<String>,
}


@@ -88,9 +71,8 @@ pub struct ClassScraper {
pub url: String,
}


impl Scraper for ClassScraper {
async fn scrape(&mut self) -> Result<(), Box<dyn std::error::Error>> {
impl ClassScraper {
pub async fn scrape(&mut self) -> Result<Course, Box<dyn std::error::Error>> {
let html = fetch_url(&self.url).await?;

let document = scraper::Html::parse_document(&html);
@@ -109,19 +91,16 @@ impl Scraper for ClassScraper {
campus: None,
career: None,
terms: vec![],
census_dates: vec![],
classes: vec![],
notes: None
};

/*
* This is for the banner information.
*/
for row in information_body.select(&table_selector) {
let labels: Vec<_> = row.select(&label_selector).map(|el| el.text().collect::<Vec<_>>().join("")).collect();
let data: Vec<_> = row.select(&data_selector).map(|el| el.text().collect::<Vec<_>>().join("")).collect();

// Print or process the extracted labels and data
for (label, data) in labels.iter().zip(data.iter()) {
// println!("Label: {}, Data: {}", label, data);

match label.trim().to_lowercase().as_str() {
"faculty" => course_info.faculty = Some(data.clone()),
"school" => course_info.school = Some(data.clone()),
@@ -132,32 +111,16 @@ impl Scraper for ClassScraper {
}

}

// let term_course_information_table = Selector::parse("td.formBody td.formBody table:nth-of-type(3) tbody tr").unwrap();
let term_course_information_table = Selector::parse("td.formBody td.formBody table:nth-of-type(3) tbody").unwrap();

let valid_row_data_len = 1;
for row in document.select(&term_course_information_table) {
let cell_selector = Selector::parse("td.data").unwrap();
let cells: Vec<_> = row
.select(&cell_selector)
.map(|cell| cell.text().collect::<Vec<_>>().join("").trim().replace("\u{a0}", ""))
.filter(|text| !text.is_empty())
.collect();
if cells.len() <= valid_row_data_len {
continue;
}

let duplicate_term_removed = cells[1..].to_vec();

// println!("{:?}", duplicate_term_removed);
// Parse top header for term data
let term_course_information_table = Selector::parse("td.formBody td.formBody table:nth-of-type(3) tbody tbody").unwrap();
for row in document.select(&term_course_information_table) {
course_info.terms.push(extract_text(row).trim().replace("\u{a0}", ""));
}
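// course_info.terms now holds one cleaned string per offering term
// (e.g. "T1", "T2", "T3"), with non-breaking spaces stripped.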

// let term_course_information_table = Selector::parse("td.formBody td.formBody table:nth-of-type(3) tbody tr").unwrap();
let term_course_information_table = Selector::parse("td.formBody td.formBody table").unwrap();

let term_count = 3;
let skip_count = 3 + term_count + 3 * term_count; //
let term_count = course_info.terms.len();
// 3 is a magical pattern number that is consistent with the way the handbook is setup.
let skip_count = 3 + term_count + 3 * term_count;
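// e.g. a course offered in 3 terms skips 3 + 3 + 3 * 3 = 15 matched tables
// before the class activity rows begin.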
let mut class_activity_information = vec![];
for row in document.select(&term_course_information_table).skip(skip_count) {
let cell_selector = Selector::parse("td.label, td.data").unwrap();
@@ -172,12 +135,11 @@ impl Scraper for ClassScraper {
class_activity_information.push(cell);
}
}
println!("{:?}", parse_class_info(class_activity_information));

course_info.classes = parse_class_info(class_activity_information);



Ok(())
Ok(course_info)
}
}
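
// A minimal usage sketch of the refactored API (not part of this commit),
// assuming the field values used by test_scrape in src/main.rs below:
//
//     let mut scraper = ClassScraper {
//         subject_area_course_code: "COMP1511".to_string(),
//         subject_area_course_name: "COMP1511".to_string(),
//         uoc: 6,
//         url: "https://timetable.unsw.edu.au/2024/COMP1511.html".to_string(),
//     };
//     let course: Course = scraper.scrape().await?;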

@@ -247,16 +209,3 @@ fn parse_meeting_info(map: &HashMap<String, String>) -> Option<Vec<Time>> {
Some(meetings)
}
}



// impl ClassScraper {
// pub fn new(url: String) -> Self {
// ClassScraper {
// subject_area_course_code: todo!(),
// subject_area_course_name: todo!(),
// uoc: todo!(),
// url,
// }
// }
// }
6 changes: 3 additions & 3 deletions src/lib.rs
@@ -1,15 +1,15 @@
mod scraper;
mod url_invalid_error;

mod school_area_scraper;
mod class_scraper;
mod school_area_scraper;
mod subject_area_scraper;
mod text_manipulators;

pub use scraper::Scraper;
pub use scraper::fetch_url;
pub use scraper::Scraper;
pub use url_invalid_error::UrlInvalidError;
// pub use subject_area_scraper::SubjectAreaScraper;
pub use class_scraper::ClassScraper;
pub use school_area_scraper::SchoolAreaScraper;
pub use subject_area_scraper::SubjectAreaScraper;
pub use class_scraper::ClassScraper;
129 changes: 55 additions & 74 deletions src/main.rs
@@ -1,27 +1,26 @@
use spooderman::{ClassScraper, SchoolAreaScraper, Scraper, SubjectAreaScraper};
use chrono::{Datelike, Utc};
use dotenv::dotenv;
use futures::stream::{FuturesUnordered, StreamExt};
use regex::Regex;
use chrono::{Datelike, Utc};
use std::collections::HashMap;
use futures::stream::{StreamExt, FuturesUnordered};
use reqwest::Client;
use serde::{Deserialize, Serialize};
use serde_json::json;
use spooderman::{ClassScraper, SchoolAreaScraper, Scraper, SubjectAreaScraper};
use std::collections::HashMap;
use std::env;
use std::error::Error;
use std::vec;


extern crate log;
extern crate env_logger;
extern crate log;

use log::LevelFilter;

use log::{info, warn, error};
use log::{error, info, warn};

type CourseCode = String;
type FacultyCode = String;


#[derive(Serialize, Deserialize)]
struct Metadata {
table_name: String,
@@ -40,10 +39,11 @@ struct BatchInsertRequest {
payload: Vec<serde_json::Value>,
}


pub fn mutate_string_to_include_curr_year(curr_base_url: &mut String) -> String {
let pattern = Regex::new("year").unwrap();
pattern.replace(&curr_base_url, Utc::now().year().to_string()).to_string()
pattern
.replace(&curr_base_url, Utc::now().year().to_string())
.to_string()
}
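// Usage sketch (behavior inferred from the base-URL note further below):
// "https://timetable.unsw.edu.au/year/subjectSearch.html" becomes
// "https://timetable.unsw.edu.au/2024/subjectSearch.html" when run in 2024.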

async fn send_batch_data() -> Result<(), Box<dyn Error>> {
@@ -86,15 +86,13 @@ async fn send_batch_data() -> Result<(), Box<dyn Error>> {
},
];


let response = client
.post(format!("{}/batch_insert", hasuragres_url))
.header("X-API-Key", api_key)
.json(&requests)
.send()
.await?;


if response.status().is_success() {
println!("Batch data inserted successfully!");
} else {
@@ -104,7 +102,6 @@ async fn send_batch_data() -> Result<(), Box<dyn Error>> {
Ok(())
}


async fn run_all_school_offered_courses_scraper_job() -> Option<SchoolAreaScraper> {
match std::env::var("TIMETABLE_API_URL") {
Ok(url) => {
@@ -120,76 +117,60 @@ async fn run_all_school_offered_courses_scraper_job() -> Option<SchoolAreaScraper> {
}
}

async fn run_school_courses_page_scraper_job() -> SchoolAreaScraper {
// if let Some(school_area_scrapers) = all_school_offered_courses_scraper.as_mut() {
// for school_area_page in &mut school_area_scrapers.pages {
// let _ = school_area_page.subject_area_scraper.scrape().await;
// println!("TEEHEE");
// for mut class_scrapers in &mut school_area_page.subject_area_scraper.class_scrapers {
async fn run_school_courses_page_scraper_job(
all_school_offered_courses_scraper: &mut SchoolAreaScraper,
) {
for school_area_scrapers in &mut all_school_offered_courses_scraper.pages {
let _ = school_area_scrapers.subject_area_scraper.scrape().await;
}
}

// }
// // Construct Courses from here and transfer ownership

async fn run_course_classes_page_scraper_job(
all_school_offered_courses_scraper: &mut SchoolAreaScraper,
) -> Vec<Course> {
let mut courses_vec = vec![];
for school_area_scrapers in &mut all_school_offered_courses_scraper.pages {
for course_area_scrapers in &mut school_area_scrapers.subject_area_scraper.class_scrapers {
let courses = course_area_scrapers.scrape().await;
if let Ok(course) = courses {
courses_vec.push(courses);
}
// println!("{:?}", courses);
}
}
return courses_vec;
}

async fn test_scrape() {
let mut scraper = ClassScraper {
subject_area_course_code: "COMP1511".to_string(),
subject_area_course_name: "COMP1511".to_string(),
uoc: 6,
// url: "https://timetable.unsw.edu.au/2024/ACCT2101.html".to_string(),
url: "https://timetable.unsw.edu.au/2024/ACCT5997.html".to_string(),
};
let c = scraper.scrape().await;
print!("{:?}", c);
// for school_area_scrapers in &mut all_school_offered_courses_scraper.pages {
// for course_area_scrapers in &mut school_area_scrapers.subject_area_scraper.class_scrapers {
// let courses = course_area_scrapers.scrape().await;
// println!("{:?}", courses);
// }
// }
}

#[tokio::main]
async fn main() {
dotenv().ok();
env_logger::Builder::new()
.filter_level(LevelFilter::Info)
.init();
let class_vec = vec![];
let course_vec: vec![];
// let class_vec = vec![];
// let course_vec: vec![];
let mut all_school_offered_courses_scraper = run_all_school_offered_courses_scraper_job().await;

if let Some(all_school_offered_courses_scraper) = &mut all_school_offered_courses_scraper {
run_school_courses_page_scraper_job(all_school_offered_courses_scraper).await;
run_course_classes_page_scraper_job(all_school_offered_courses_scraper).await;
}
// test_scrape().await;
println!("{:?}", all_school_offered_courses_scraper);


// let scraping_jobs = FuturesUnordered::new();
// let mut all_school_offered_courses_scraper = run_all_school_offered_courses_scraper_job().await;

// for mut scraper in all_school_offered_courses_scraper {
// scraping_jobs.push(async { scraper.scrape().await });
// }

// scraping_jobs.for_each_concurrent(None, |scrape_future| async {
// scrape_future;
// }).await;
// println!("{:?}", all_school_offered_courses_scraper);
// // The base API url needs the year to be replaced with "year".
// // ie. https://timetable.unsw.edu.au/year/subjectSearch.html
// match std::env::var("TIMETABLE_API_URL") {
// Ok(url) => {
// info!("Timetable URL has been parsed from environment file: {url}!");
// let url_to_scrape = mutate_string_to_include_curr_year(&mut url.to_string());

// let mut scraper = ClassScraper {
// subject_area_course_code: "COMP1511".to_string(),
// subject_area_course_name: "COMP1511".to_string(),
// uoc: 6,
// // url: "https://timetable.unsw.edu.au/2024/ACCT5999.html".to_string(),
// url: "https://timetable.unsw.edu.au/2024/COMP1511.html".to_string(),
// };
// let _ = scraper.scrape().await;
// // let mut scraper = SchoolAreaScraper::new(url_to_scrape);

// // // let mut scraper = SubjectAreaScraper::new("https://timetable.unsw.edu.au/2024/COMPKENS.html".to_string());
// // match scraper.scrape().await {
// // Ok(_) => info!("Scraping successful!\n"),
// // Err(e) => error!("Error: {}", e),
// // }
// // for school_area_page in &mut scraper.pages {
// // let _ = school_area_page.subject_area_scraper.scrape().await;
// // }
// // println!("{:?}", scraper);


// }
// Err(e) => {
// warn!("Timetable URL has NOT been parsed properly from env file and error report: {e}");
// }
// };

}
