diff --git a/README.md b/README.md new file mode 100644 index 0000000..05eb17c --- /dev/null +++ b/README.md @@ -0,0 +1,52 @@ +# Freerooms Scrapers + +This repository contains the scrapers for the underlying buildings, rooms and bookings data used by Freerooms. + +For documentation on the internals of each scraper, please see the READMEs in the respective directories for each scraper. + +For instructions on how you can access this data, see the [DevSoc GraphQL API](https://github.com/devsoc-unsw/graphql-api). + +## Schema + +### Buildings + +| **Field** | **Description** | **Example** | +|-----------|----------------------------------------------|-----------------| +| `id` | Building ID in the format `CAMPUS-GRID_REF`. | "K-F8" | +| `name` | Name of the building. | "Law Building" | +| `lat` | Latitude of the building. | -33.91700 | +| `long` | Longitude of the building. | 151.227791 | +| `aliases` | List of alternative names for the building. | ["Law Library"] | + +### Rooms + +| **Field** | **Description** | **Example** | +|--------------|---------------------------------------------------|---------------------| +| `id` | Room ID in the format `CAMPUS-GRID_REF-ROOM_NUM`. | "K-J17-305" | +| `name` | Name of the room. | "Brass Lab J17 305" | +| `abbr` | Shortened name, as seen on timetable. | "BrassME305" | +| `usage` | Room type - see below for list. | "CMLB" | +| `capacity` | Number of people the room is suitable for. | 36 | +| `school` | School that manages the room - `" "` if none. | "CSE" | +| `buildingId` | ID of building that room is in. | "K-J17" | + +Mapping of room usages can be found [here](https://github.com/devsoc-unsw/freerooms/blob/dev/common/roomUsages.ts). Mapping of school codes can be found [here](https://github.com/devsoc-unsw/freerooms/blob/dev/common/schools.ts). 
+ +### Bookings + +| **Field** | **Description** | **Example** | +|---------------|------------------------------------------------------|-----------------------------| +| `bookingType` | Type of booking - see below. | "SOCIETY" | +| `name` | Name of the booking (usually related to the booker). | "SOFTWAREDEV" | +| `roomId` | ID of the room the booking is for. | "K-E19-G05" | +| `start` | Start time of the booking. | "2024-01-27T04:00:00+00:00" | +| `end` | End time of the booking. | "2024-01-27T08:00:00+00:00" | + +Full list of current booking types is: "CLASS", "SOCIETY", "INTERNAL", "LIB", "BLOCK", "MISC". + +### Relationships +The following relationships exist between tables. These relationships are tracked by Hasura and can be followed in GraphQL queries. +- Every **building** contains 1 or more **rooms** +- Every **room** belongs to a **building** +- Every **room** has 0 or more **bookings** +- Every **booking** is for a specific **room** \ No newline at end of file diff --git a/libcal/README.md b/libcal/README.md index 075b6c3..08f5a64 100644 --- a/libcal/README.md +++ b/libcal/README.md @@ -1,2 +1,11 @@ # libcal-scraper -Scraper for https://unswlibrary-bookings.libcal.com/ +Scraper for https://unswlibrary-bookings.libcal.com/. This scrapes data on study rooms in the Main Library and Law Library, as well as the bookings for these rooms. + +## Data Sources + +The booking data is found on this page: +https://unswlibrary-bookings.libcal.com/r/new/availability?lid=6581&zone=0&gid=0&capacity=0 + +However, the data is not scraped from the HTML, but instead fetched by mimicking the API call made by this page to https://unswlibrary-bookings.libcal.com/spaces/availability/grid. This returns a list of "slots" with item (room) IDs and start/end times, some of which are booked. + +The list of item IDs from the above JSON is then used to scrape the data for each room from the individual room pages (e.g. https://unswlibrary-bookings.libcal.com/space/46481). 
This is scraped from the HTML using [cheerio](https://cheerio.js.org/). diff --git a/libcal/src/libraryScraper.ts b/libcal/src/libraryScraper.ts index d46be3e..77af46a 100644 --- a/libcal/src/libraryScraper.ts +++ b/libcal/src/libraryScraper.ts @@ -1,5 +1,5 @@ import { load } from 'cheerio'; -import { RoomBooking, Room } from "./types"; +import { Library, Room, RoomBooking } from "./types"; import toSydneyTime from "./toSydneyTime"; import axios from "axios"; import * as fs from 'fs'; @@ -7,218 +7,215 @@ import { DRYRUN, HASURAGRES_API_KEY, HASURAGRES_URL } from './config'; const ROOM_URL = "https://unswlibrary-bookings.libcal.com/space/"; const BOOKINGS_URL = "https://unswlibrary-bookings.libcal.com/spaces/availability/grid"; -const LIBRARIES = [ - { name: 'Main Library', libcalCode: '6581', buildingId: 'K-F21' }, - { name: 'Law Library', libcalCode: '6584', buildingId: 'K-F8' }, +const LIBRARIES: Library[] = [ + { name: 'Main Library', libcalCode: '6581', buildingId: 'K-F21' }, + { name: 'Law Library', libcalCode: '6584', buildingId: 'K-F8' }, ]; -const scrapeLibrary = async (library: typeof LIBRARIES[number]) => { - const response = await downloadBookingsPage(library.libcalCode); - const bookingData = parseBookingData(response.data['slots']); +const scrapeLibrary = async (library: Library) => { + const response = await downloadBookingsPage(library.libcalCode); + const bookingData = parseBookingData(response.data['slots']); - const allRoomData: Room[] = []; - const allRoomBookings: RoomBooking[] = []; + const allRoomData: Room[] = []; + const allRoomBookings: RoomBooking[] = []; - for (const roomID in bookingData) { - let roomData: Room | null = null; - try { - roomData = await getRoomData(roomID, library.buildingId); - } catch (e) { - console.warn(`Failed to scrape room ${roomID} in ${library.buildingId}`); - } + for (const roomID in bookingData) { + let roomData: Room | null = null; + try { + roomData = await getRoomData(roomID, library.buildingId); + } catch 
(e) { + console.warn(`Failed to scrape room ${roomID} in ${library.buildingId}`); + } - if (!roomData) continue; // skipping non-rooms - allRoomData.push(roomData); - - let i = 0; - while (i < bookingData[roomID].length) { - const currBooking: RoomBooking = { - bookingType: 'LIB', - name: "Library Booking", - roomId: roomData.id, - start: bookingData[roomID][i].start, - end: bookingData[roomID][i].end, - } - i++; - - // Combine all subsequent bookings that start when this ends - while (i < bookingData[roomID].length && - bookingData[roomID][i].start.getTime() == currBooking.end.getTime()) { - currBooking.end = bookingData[roomID][i].end; - i++; - } - - allRoomBookings.push(currBooking); - } + if (!roomData) continue; // skipping non-rooms + allRoomData.push(roomData); + + let i = 0; + while (i < bookingData[roomID].length) { + const currBooking: RoomBooking = { + bookingType: 'LIB', + name: "Library Booking", + roomId: roomData.id, + start: bookingData[roomID][i].start, + end: bookingData[roomID][i].end, + } + i++; + + // Combine all subsequent bookings that start when this ends + while (i < bookingData[roomID].length && + bookingData[roomID][i].start.getTime() == currBooking.end.getTime()) { + currBooking.end = bookingData[roomID][i].end; + i++; + } + + allRoomBookings.push(currBooking); } + } - return { rooms: allRoomData, bookings: allRoomBookings }; + return { rooms: allRoomData, bookings: allRoomBookings }; } // Formats a date into YYYY-MM-DD format const formatDate = (date: Date): string => { - const year = date.getFullYear(); - const month = String(date.getMonth() + 1).padStart(2, '0'); - const day = String(date.getDate()).padStart(2, '0'); + const year = date.getFullYear(); + const month = String(date.getMonth() + 1).padStart(2, '0'); + const day = String(date.getDate()).padStart(2, '0'); - return `${year}-${month}-${day}`; + return `${year}-${month}-${day}`; } -const downloadBookingsPage = async(locationId: string) => { - - const todaysDate = 
formatDate(new Date()); - - // Furthest seems to be 2 weeks in the future - const twoWeeks = new Date(); - twoWeeks.setDate(twoWeeks.getDate() + 14); - const furthestBookableDate = formatDate(twoWeeks); - - const postData = { - lid: locationId, - gid: '0', - eid: '-1', - seat: '0', - seatId: '0', - zone: '0', - start: todaysDate, - end: furthestBookableDate, - pageIndex: '0', - pageSize: '18' - }; - - const headers = { - 'Content-Type': 'application/x-www-form-urlencoded', // because the request data is URL encoded - 'Referer': 'https://unswlibrary-bookings.libcal.com/' - }; - - return await axios.post(BOOKINGS_URL, new URLSearchParams(postData), { headers }); +const downloadBookingsPage = async (locationId: string) => { + const todaysDate = formatDate(new Date()); + + // Furthest seems to be 2 weeks in the future + const twoWeeks = new Date(); + twoWeeks.setDate(twoWeeks.getDate() + 14); + const furthestBookableDate = formatDate(twoWeeks); + + const postData = { + lid: locationId, + gid: '0', + eid: '-1', + seat: '0', + seatId: '0', + zone: '0', + start: todaysDate, + end: furthestBookableDate, + pageIndex: '0', + pageSize: '18' + }; + + const headers = { + 'Content-Type': 'application/x-www-form-urlencoded', // because the request data is URL encoded + 'Referer': 'https://unswlibrary-bookings.libcal.com/' + }; + + return await axios.post(BOOKINGS_URL, new URLSearchParams(postData), { headers }); } interface ResponseData { - start: string, - end: string, - itemId: number, - checksum: string, - className?: string + start: string, + end: string, + itemId: number, + checksum: string, + className?: string } const parseBookingData = (bookingData: ResponseData[]) => { + const bookings: { [roomNumber: number]: { start: Date, end: Date }[] } = {}; - const bookings: { [roomNumber: number]: { start: Date, end: Date }[] } = {}; - - for (const slot of bookingData) { - if (!(slot.itemId in bookings)) { - bookings[slot.itemId] = []; - } + for (const slot of bookingData) { + if 
(!(slot.itemId in bookings)) { + bookings[slot.itemId] = []; + } - if (slot.className == "s-lc-eq-checkout") { - bookings[slot.itemId].push( - { - start: toSydneyTime(new Date(slot.start)), - end: toSydneyTime(new Date(slot.end)), - } - ) + if (slot.className == "s-lc-eq-checkout") { + bookings[slot.itemId].push( + { + start: toSydneyTime(new Date(slot.start)), + end: toSydneyTime(new Date(slot.end)), } + ) } + } - return bookings; + return bookings; } const getRoomData = async (roomId: string, buildingId: string) => { - - const response = await axios.get(ROOM_URL + roomId, {}); - const $ = load(response.data); - - const $heading = $('h1#s-lc-public-header-title'); - - // Remove whitespace and split the name, location and capacity into newlines - const data = $heading.text().trim().split(/\s{2,}/g); - const [name, rawLocation, rawCapacity] = data; - - // We only care about rooms and pods - if (!name.match(/RM|POD/)) { - return null; - } - - const libraryName = rawLocation.replace(/[()]/, '').split(':')[0]; - const capacity = parseInt(rawCapacity.split(": ")[1]); - let roomNumber = name.split(' ')[2]; - if (name.match(/POD/)) { - // Pods are just numbered 1-8 so prepend POD - roomNumber = 'POD' + roomNumber; - } - - const roomData: Room = { - name: libraryName + ' ' + name, - abbr: name, - id: buildingId + "-" + roomNumber, - usage: "LIB", - capacity, - school: " ", - buildingId: buildingId - } - - return roomData; + const response = await axios.get(ROOM_URL + roomId, {}); + const $ = load(response.data); + + const $heading = $('h1#s-lc-public-header-title'); + + // Remove whitespace and split the name, location and capacity into newlines + const data = $heading.text().trim().split(/\s{2,}/g); + const [name, rawLocation, rawCapacity] = data; + + // We only care about rooms and pods + if (!name.match(/RM|POD/)) { + return null; + } + + const libraryName = rawLocation.replace(/[()]/, '').split(':')[0]; + const capacity = parseInt(rawCapacity.split(": ")[1]); + let 
roomNumber = name.split(' ')[2]; + if (name.match(/POD/)) { + // Pods are just numbered 1-8 so prepend POD + roomNumber = 'POD' + roomNumber; + } + + const roomData: Room = { + name: libraryName + ' ' + name, + abbr: name, + id: buildingId + "-" + roomNumber, + usage: "LIB", + capacity, + school: " ", + buildingId: buildingId + } + + return roomData; } const runScrapeJob = async () => { - console.time('Scraping'); - const allRooms: Room[] = []; - const allBookings: RoomBooking[] = []; - for (const library of LIBRARIES) { - const { rooms, bookings } = await scrapeLibrary(library); - allRooms.push(...rooms); - allBookings.push(...bookings); + console.time('Scraping'); + const allRooms: Room[] = []; + const allBookings: RoomBooking[] = []; + for (const library of LIBRARIES) { + const { rooms, bookings } = await scrapeLibrary(library); + allRooms.push(...rooms); + allBookings.push(...bookings); + } + console.timeEnd('Scraping'); + + // Send to Hasuragres + const requestConfig = { + headers: { + "Content-Type": "application/json", + "X-API-Key": HASURAGRES_API_KEY, } - console.timeEnd('Scraping'); - - // Send to Hasuragres - const requestConfig = { - headers: { - "Content-Type": "application/json", - "X-API-Key": HASURAGRES_API_KEY, - } - } - - await axios.post( - `${HASURAGRES_URL}/insert`, - { - metadata: { - table_name: "Rooms", - columns: ["abbr", "name", "id", "usage", "capacity", "school", "buildingId"], - sql_up: fs.readFileSync("./sql/rooms/up.sql", "utf8"), - sql_down: fs.readFileSync("./sql/rooms/down.sql", "utf8"), - // overwrite all outdated lib rooms - sql_before: "DELETE FROM Rooms WHERE \"usage\" = 'LIB' " + - `AND "id" NOT IN (${allRooms.map(room => `'${room.id}'`).join(",")});`, - write_mode: 'append', - dryrun: DRYRUN, - }, - payload: allRooms + } + + await axios.post( + `${HASURAGRES_URL}/insert`, + { + metadata: { + table_name: "Rooms", + columns: ["abbr", "name", "id", "usage", "capacity", "school", "buildingId"], + sql_up: 
fs.readFileSync("./sql/rooms/up.sql", "utf8"), + sql_down: fs.readFileSync("./sql/rooms/down.sql", "utf8"), + // overwrite all outdated lib rooms + sql_before: "DELETE FROM Rooms WHERE \"usage\" = 'LIB' " + + `AND "id" NOT IN (${allRooms.map(room => `'${room.id}'`).join(",")});`, + write_mode: 'append', + dryrun: DRYRUN, }, - requestConfig - ); - - // libcal shows all bookings that start during or after the current 30-min period - const baseTime = new Date(); - baseTime.setMinutes(baseTime.getMinutes() < 30 ? 0 : 30, 0, 0); - await axios.post( - `${HASURAGRES_URL}/insert`, - { - metadata: { - table_name: "Bookings", - columns: ["bookingType", "name", "roomId", "start", "end"], - sql_up: fs.readFileSync("./sql/bookings/up.sql", "utf8"), - sql_down: fs.readFileSync("./sql/bookings/down.sql", "utf8"), - sql_before: fs.readFileSync("./sql/bookings/before.sql", "utf8"), - sql_after: fs.readFileSync("./sql/bookings/after.sql", "utf8"), - write_mode: 'append', - dryrun: DRYRUN, - }, - payload: allBookings + payload: allRooms + }, + requestConfig + ); + + // libcal shows all bookings that start during or after the current 30-min period + const baseTime = new Date(); + baseTime.setMinutes(baseTime.getMinutes() < 30 ? 
0 : 30, 0, 0); + await axios.post( + `${HASURAGRES_URL}/insert`, + { + metadata: { + table_name: "Bookings", + columns: ["bookingType", "name", "roomId", "start", "end"], + sql_up: fs.readFileSync("./sql/bookings/up.sql", "utf8"), + sql_down: fs.readFileSync("./sql/bookings/down.sql", "utf8"), + sql_before: fs.readFileSync("./sql/bookings/before.sql", "utf8"), + sql_after: fs.readFileSync("./sql/bookings/after.sql", "utf8"), + write_mode: 'append', + dryrun: DRYRUN, }, - requestConfig - ); + payload: allBookings + }, + requestConfig + ); } runScrapeJob(); diff --git a/libcal/src/types.ts b/libcal/src/types.ts index ee4a4ca..7973f48 100644 --- a/libcal/src/types.ts +++ b/libcal/src/types.ts @@ -6,12 +6,18 @@ export type RoomBooking = { end: Date; } - export type Room = { - abbr: string; - name: string; - id: string; - usage: string; - capacity: number; - school: string; - buildingId: string; - } +export type Room = { + abbr: string; + name: string; + id: string; + usage: string; + capacity: number; + school: string; + buildingId: string; +} + +export type Library = { + name: string; + libcalCode: string; + buildingId: string; +} diff --git a/nss/README.md b/nss/README.md index 151bcdd..ea292ed 100644 --- a/nss/README.md +++ b/nss/README.md @@ -1,2 +1,12 @@ # nss-scraper -scraper for https://nss.cse.unsw.edu.au/tt/ +Scraper for https://nss.cse.unsw.edu.au/tt/. This scrapes the data for all buildings and rooms managed by the UNSW central timetabling system, as well as all bookings on this system. This includes classes and society bookings as well as others. + +## Data Sources + +Building IDs and names are scraped from the dropdown list of buildings on https://nss.cse.unsw.edu.au/tt/view_multirooms.php?dbafile=2024-KENS-COFA.DBA&campus=KENS. The coordinates and aliases of buildings are manually supplied in `buildingOverrides.json`. + +Room data is all scraped from doing a search on https://nss.cse.unsw.edu.au/tt/find_rooms.php?dbafile=2024-KENS-COFA.DBA&campus=KENS. 
To get all rooms to show up, set the search parameters to cover all days and set the start and end times to be equal. + +Bookings are scraped separately for each room from the individual room pages (e.g. https://nss.cse.unsw.edu.au/tt/view_rooms.php?dbafile=2024-KENS-COFA.DBA&campus=KENS). When the date range is set to the whole year, each booking shows a bit string on hover (HTML `title` attribute) describing which weeks of the year it runs in. + +Some buildings and rooms are ignored; the exclusions can be viewed and configured in `src/exclusions.ts`. \ No newline at end of file