diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 5b928bb7..00000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/.gitignore b/.gitignore
index 41635879..a8fbbdc4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 build/
 logs/
 node_modules/
+__pycache__/
 # Specific Files
 config.json
diff --git a/package-lock.json b/package-lock.json
index 826744f3..f6b08421 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -9,6 +9,7 @@
       "version": "1.0.0",
       "license": "ISC",
       "dependencies": {
+        "better-sqlite3": "^12.4.1",
         "dotenv": "^16.4.7",
         "express": "^4.21.2",
         "firebase-admin": "^13.1.0",
@@ -791,6 +792,20 @@
         "tweetnacl": "^0.14.3"
       }
     },
+    "node_modules/better-sqlite3": {
+      "version": "12.4.1",
+      "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-12.4.1.tgz",
+      "integrity": "sha512-3yVdyZhklTiNrtg+4WqHpJpFDd+WHTg2oM7UcR80GqL05AOV0xEJzc6qNvFYoEtE+hRp1n9MpN6/+4yhlGkDXQ==",
+      "hasInstallScript": true,
+      "license": "MIT",
+      "dependencies": {
+        "bindings": "^1.5.0",
+        "prebuild-install": "^7.1.1"
+      },
+      "engines": {
+        "node": "20.x || 22.x || 23.x || 24.x"
+      }
+    },
     "node_modules/bignumber.js": {
       "version": "9.1.2",
       "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.1.2.tgz",
diff --git a/package.json b/package.json
index 057904ac..b4d1f5c2 100644
--- a/package.json
+++ b/package.json
@@ -6,12 +6,15 @@
   "type": "module",
   "scripts": {
     "start:dev": "nodemon --ignore src/data/notifRequests.json src/index.js",
-    "start": "node src/index.js"
+    "start": "node src/index.js",
+    "migrate": "node src/data/scripts/run-migrations.js",
+    "populate:db": "npm run migrate && python3 src/data/scripts/populate_db.py"
   },
   "keywords": [],
   "author": "",
   "license": "ISC",
   "dependencies": {
+    "better-sqlite3": "^12.4.1",
     "dotenv": "^16.4.7",
     "express": "^4.21.2",
     "firebase-admin": "^13.1.0",
diff --git a/src/.DS_Store b/src/.DS_Store
index a39c96c4..807e9445 100644
Binary files a/src/.DS_Store and b/src/.DS_Store differ
diff --git a/src/data/db/database.py b/src/data/db/database.py
index efdf541e..9b857b2a 100644
--- a/src/data/db/database.py
+++ b/src/data/db/database.py
@@ -32,18 +32,54 @@ def insert_library(location, address, latitude, longitude):
     conn.close()


-def insert_printer(location, description, latitude, longitude):
+def insert_printer(location, description, labels, latitude, longitude):
     """Insert a printer into the database."""
     conn = get_db_connection()
     cursor = conn.cursor()
+    # We drop the "OR IGNORE" because we acknowledge that several printers may share the same location and description (i.e., the same building and room); rows are told apart by the unique printer id instead
     cursor.execute(
         """
-        INSERT OR IGNORE INTO printers (location, description, latitude, longitude)
+        INSERT INTO printers (location, description, latitude, longitude)
         VALUES (?, ?, ?, ?)
         """,
         (location, description, latitude, longitude),
     )
+    # For the same reason, we get the printer_id from lastrowid (the row just inserted) rather than looking it up by location/description/coordinates, which may not be unique
+    printer_id = cursor.lastrowid
+
+    # Insert labels into the labels table and get their IDs
+    label_ids = []
+    for label in labels:
+        cursor.execute(
+            """
+            INSERT OR IGNORE INTO labels (label)
+            VALUES (?)
+            """,
+            (label,),
+        )
+        cursor.execute(
+            """
+            SELECT id FROM labels WHERE label = ?
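+            -- NOTE: we look the id up rather than reuse lastrowid, since lastrowid is stale when the INSERT OR IGNORE above skipped an existing label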
+ """, + (label,), + ) + result = cursor.fetchone() + if result is None: + raise ValueError(f"Failed to find label: {label}") + label_id = result[0] + label_ids.append(label_id) + + # Insert into junction table + for label_id in label_ids: + cursor.execute( + """ + INSERT OR IGNORE INTO printer_labels (printer_id, label_id) + VALUES (?, ?) + """, + (printer_id, label_id), + ) + conn.commit() conn.close() diff --git a/src/data/db/models.py b/src/data/db/models.py index 7634fd0e..8183be91 100644 --- a/src/data/db/models.py +++ b/src/data/db/models.py @@ -31,7 +31,7 @@ def create_tables(): """ CREATE TABLE IF NOT EXISTS printers ( id INTEGER PRIMARY KEY AUTOINCREMENT, - location TEXT UNIQUE, + location TEXT, description TEXT, latitude REAL, longitude REAL diff --git a/src/data/migrations/2025117_1854_create_labels.sql b/src/data/migrations/2025117_1854_create_labels.sql new file mode 100644 index 00000000..3884e988 --- /dev/null +++ b/src/data/migrations/2025117_1854_create_labels.sql @@ -0,0 +1,6 @@ +PRAGMA foreign_keys = ON; + +CREATE TABLE IF NOT EXISTS labels ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + label TEXT UNIQUE NOT NULL +); \ No newline at end of file diff --git a/src/data/migrations/2025117_1859_create_printer_labels.sql b/src/data/migrations/2025117_1859_create_printer_labels.sql new file mode 100644 index 00000000..73fd9c06 --- /dev/null +++ b/src/data/migrations/2025117_1859_create_printer_labels.sql @@ -0,0 +1,9 @@ +PRAGMA foreign_keys = ON; + +CREATE TABLE IF NOT EXISTS printer_labels ( + printer_id INTEGER NOT NULL, + label_id INTEGER NOT NULL, + PRIMARY KEY (printer_id, label_id), + FOREIGN KEY (printer_id) REFERENCES printers(id) ON DELETE CASCADE, + FOREIGN KEY (label_id) REFERENCES labels(id) ON DELETE CASCADE +); \ No newline at end of file diff --git a/src/data/scrapers/printers.py b/src/data/scrapers/printers.py index e972046f..ea40cd69 100644 --- a/src/data/scrapers/printers.py +++ b/src/data/scrapers/printers.py @@ -1,37 +1,292 @@ import requests -from bs4 import BeautifulSoup +from difflib import get_close_matches # For data scraping +from difflib import SequenceMatcher +import re # For using regex +import unicodedata # Handles text encoding at Unicode level # URL of the CU Print directory page -URL = "https://www.cornell.edu/about/maps/directory/?layer=CUPrint&caption=%20CU%20Print%20Printers" # Replace with the actual URL +# URL = "https://www.cornell.edu/about/maps/directory/?layer=CUPrint&caption=%20CU%20Print%20Printers" # Replace with the actual URL -def scrape_printers(): - # Send a GET request to fetch the HTML content - response = requests.get(URL) - soup = BeautifulSoup(response.text, 'html.parser') +URL = 'https://www.cornell.edu/about/maps/directory/text-data.cfm?layer=CUPrint&caption=%20CU%20Print%20Printers' + +# HTTP headers to mimic a real browser request +HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36", + "Referer": 'https://www.cornell.edu/about/maps/directory/', + "X-Requested-With": 'XMLHttpRequest', + "Accept": 'application/json, text/javascript, */*', +} + +# Canonical list of Cornell buildings +# NOTE: This list is not exhaustive. Add more buildings as needed... 
+CANONICAL_BUILDINGS = [
+    "Akwe:kon",
+    "Alice Cook House",
+    "Baker Lab",
+    "Barton Hall",
+    "Becker House",
+    "Breazzano Center",
+    "Catherwood Library",
+    "Clark Hall",
+    "College of Veterinary Medicine",
+    "Court-Kay-Bauer Hall",
+    "Dickson",
+    "Ecology House",
+    "Flora Rose House",
+    "Ganedago",
+    "Hans Bethe House",
+    "Hollister Hall",
+    "Ives Hall",
+    "John Henrik Clarke Africana Library",
+    "Keeton House",
+    "Kroch Library",
+    "Latino Living Center",
+    "Law Library",
+    "Lincoln Hall",
+    "Mann Library",
+    "Martha Van Rensselaer Hall",
+    "Mary Donlon Hall",
+    "Math Library",
+    "Mews Hall",
+    "Milstein Hall",
+    "Morrison Hall",
+    "Myron Taylor",
+    "Olin Library",
+    "Phillips Hall",
+    "Plant Science",
+    "RPCC",
+    "Rand Hall",
+    "Rhodes Hall",
+    "Risley Hall",
+    "Rockefeller Lab",
+    "Ruth Bader Ginsburg Hall",
+    "Sage Hall",
+    "Schwartz Center",
+    "Sibley Hall",
+    "Statler Hall",
+    "Stimson",
+    "Tjaden Hall",
+    "Toni Morrison",
+    "Ujamaa",
+    "Upson Hall",
+    "Uris Library",
+    "Vet Library",
+    "Warren Hall",
+    "White Hall",
+    "Willard Student Center"
+]
+
+# Regex helpers
+HTML_TAG_RE = re.compile(r"<[^>]+>")
+BRACKET_CONTENT_RE = re.compile(r"[\(\[\{].*?[\)\]\}]")
+MULTI_SPACE_RE = re.compile(r"\s+")
+TRAILING_CAPS_RE = re.compile(r"\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\s*$")
+
+# Used for stripping common label phrases from building names
+LABEL_PHRASES_RE = re.compile(
+    r"""
+    \bresidents?\s*only\b |
+    \bstudents?\s*only\b |
+    \baa\s*&\s*p\b |
+    \baap\b
+    """, re.IGNORECASE | re.VERBOSE
+)
+
+# Used to identify common variants of labels
+LABEL_PATTERNS = {
+    # --- Access restrictions ---
+    # Residents Only (singular/plural + optional hyphen + any case)
+    "Residents Only": re.compile(r"\bresident[s]?[-\s]*only\b", re.IGNORECASE),
+
+    # AA&P Students Only (accept AA&P or AAP; allow any junk in-between; optional hyphen)
+    "AA&P Students Only": re.compile(
+        r"\b(?:aa\s*&\s*p|aap)\b.*\bstudent[s]?[-\s]*only\b",
+        re.IGNORECASE
+    ),
+
+    # Landscape Architecture Students Only (allow arbitrary whitespace; optional hyphen)
+    "Landscape Architecture Students Only": re.compile(
+        r"\blandscape\s+architecture\b.*\bstudent[s]?[-\s]*only\b",
+        re.IGNORECASE
+    ),
+
+    # --- Printer capabilities ---
+    "Color, Scan, & Copy": re.compile(
+        r"\bcolor\s*[,/&]?\s*(?:scan\s*[,/&]?\s*)?(?:and\s*)?\s*&?\s*(?:copy|print|copying)\b", re.IGNORECASE
+    ),
+    "Black & White": re.compile(
+        r"\b(?:black\s*(?:and|&)\s*white|b\s*&\s*w)\b", re.IGNORECASE
+    ),
+    "Color": re.compile(r"\bcolor\b", re.IGNORECASE),
+}
+
+# Used for stripping residual trailing labels from descriptions
+RESIDUAL_TRAILING_LABEL_RE = re.compile(
+    r"\b(?:resident|residents|student|students|staff|public)\b\s*$",
+    re.IGNORECASE
+)
+
+def _norm(s):
+    """
+    Unicode/HTML/whitespace normalization.
+    """
+    if s is None:
+        return ""
+    s = unicodedata.normalize('NFKC', s)  # Normalize the text at the Unicode level
+    s = HTML_TAG_RE.sub(" ", s)
+    s = s.replace("*", " ")
+    s = BRACKET_CONTENT_RE.sub(" ", s)
+    s = MULTI_SPACE_RE.sub(" ", s).strip()
+    return s
+
+def _strip_trailing_allcaps(s):
+    """
+    Remove trailing ALL-CAPS qualifiers (e.g., RESIDENTS ONLY).
+    """
+    return TRAILING_CAPS_RE.sub("", s).strip()
+
+def _pre_clean_for_match(s: str) -> str:
+    """
+    Pre-clean a building name for matching against the canonical list.
+    """
+    s = _norm(s)
+    s = LABEL_PHRASES_RE.sub(" ", s)  # removes "Resident(s) only", "AA&P", etc.
+    s = _strip_trailing_allcaps(s)
+    s = RESIDUAL_TRAILING_LABEL_RE.sub(" ", s)  # removes trailing "Resident", "Students", etc.
+
+    s = re.sub(r"[^\w\s\-’']", " ", s)  # punctuation noise
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+
+def _token_sort(s):
+    """
+    Tokenize a string, sort the tokens, and re-join them.
+    """
+    tokens = s.lower().split()
+    tokens.sort()
+    return " ".join(tokens)
+
+def map_building(name, threshold=87):
+    """
+    Map a building name to a canonical building name using fuzzy matching.
+    """
+    if not name:
+        return None, 0
+
+    query = _token_sort(_pre_clean_for_match(name))
+    canon_token_list = [_token_sort(_pre_clean_for_match(c)) for c in CANONICAL_BUILDINGS]
+
+    # get_close_matches returns a list holding (at most) the single closest match to the cleaned name
+    best = get_close_matches(query, canon_token_list, n=1)
+
+    # If there is no match (empty list), return the original name with a score of 0
+    if not best:
+        return name, 0

-    # Locate the table
-    table = soup.find("table", {"id": "directoryTable"})
-    rows = table.find("tbody").find_all("tr")
+    # Take the closest match
+    match = best[0]

-    # Extract data
+    # Calculate the similarity score of the match to the original name (for internal use / debugging purposes)
+    index = canon_token_list.index(match)
+    canon_raw = CANONICAL_BUILDINGS[index]
+    score = int(SequenceMatcher(None, query, match).ratio() * 100)
+
+    # If the score is below the threshold, return the original name instead of the canonical name
+    return (canon_raw, score) if score >= threshold else (name, score)
+
+def map_labels(text):
+    """
+    Extract label tokens from the description.
+    """
+    if not text:
+        return text, []
+
+    cleaned = _norm(text)
+    found_labels = []
+
+    for canon, pattern in LABEL_PATTERNS.items():
+        # Search for the pattern in the cleaned text; on a hit, record the canonical label and strip its first occurrence
+        if pattern.search(cleaned):
+            found_labels.append(canon)
+            cleaned = pattern.sub("", cleaned, count=1).strip()
+
+    # Collapse runs of punctuation delimiters to a single space
+    cleaned = re.sub(r"\s*[,;/|&\-–—:]+\s*", " ", cleaned)
+
+    # Remove any leftover leading delimiters/spaces (e.g., ", ", "- ")
+    cleaned = re.sub(r"^[\s,;/|&\-–—:]+", "", cleaned)
+
+    # Remove a standalone "Copy", "Print", or "Scan" at the start (leftover from partial label removal)
+    cleaned = re.sub(r"^(?:copy|print|scan)\s+", "", cleaned, flags=re.IGNORECASE)
+
+    # Final whitespace cleanup
+    cleaned = re.sub(r"\s+", " ", cleaned).strip()
+
+    return cleaned, sorted(set(found_labels))
+
+def fetch_printers_json():
+    """
+    Fetch printer data in JSON format from the CU Print directory endpoint.
+    """
+    resp = requests.get(URL, headers=HEADERS, timeout=20)
+    resp.raise_for_status()
+    return resp.json()
+
+def scrape_printers():
+    """
+    Scrape CU Print printer locations from the Cornell directory page.
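+
+    Returns a list of dicts, one per printer, with keys "Location" (canonical
+    building name), "Description" (cleaned location text with labels removed),
+    "Coordinates" ([latitude, longitude] as floats), and "Labels" (a sorted
+    list of canonical label strings).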
+ """ + payload = fetch_printers_json() data = [] - for row in rows: - cols = row.find_all("td") - if len(cols) < 3: # Ensure row has enough columns + + # payload['rows'] is a list of lists, where each inner list represents a row of data + for row in payload['rows']: + if len(row) < 3: # Ensure row has enough columns + continue # Skipping row with insufficient columns + + # Each row is of the structure ["Building", "Equipment & Location", "Coordinates (Lat, Lng)"] + [raw_building, raw_location, raw_coordinates] = row + + # Map raw building name to canonical building name + building, _ = map_building(raw_building) + + # If we weren't able to map the building to a canonical building, skip this row + # NOTE: This should prevent us from getting "None" as the location, which was happening earlier + if building not in CANONICAL_BUILDINGS: continue + + # Map labels from description to canonical labels + labels = [] - location_name = cols[0].text.strip() - description = cols[1].text.strip() + _, building_labels = map_labels(raw_building) # Get labels from the building name (e.g., "Residents Only") + remainder, location_labels = map_labels(raw_location) # Get labels from the location description (e.g., "Landscape Architecture Student ONLY") + + # Deduplicate and sort labels + labels += building_labels + labels += location_labels + labels = sorted(set(labels)) - # Extract coordinates from the hyperlink tag inside - coordinates_link = cols[2].find("a") - coordinates_string = coordinates_link.text.strip() if coordinates_link else "" - coordinates = [float(x) for x in coordinates_string.split(', ')] + cleaned = re.sub(r"^[\s\-–—:/|]+", "", remainder).strip() # Remove leftover delimiters at the start (like " - ", " / ", ": ", etc.) + description = cleaned # Final cleaned description text (with labels removed) — essentially, remainder of the location description + # Splits coordinates string into a list of floats + coordinates = [float(x) for x in raw_coordinates.split(', ')] data.append({ - "Location": location_name, + "Location": building, "Description": description, - "Coordinates": coordinates + "Coordinates": coordinates, + "Labels": labels }) - return data \ No newline at end of file + + return data + +if __name__ == "__main__": + results = scrape_printers() + print(f"Scraped {len(results)} printers.\n") + + # Print a sample of the data + for row in results: + if row['Location'] == 'Vet Library': + print(row['Description'], row['Labels']) \ No newline at end of file diff --git a/src/data/scripts/populate_db.py b/src/data/scripts/populate_db.py index fa6a23f4..30ddc62f 100644 --- a/src/data/scripts/populate_db.py +++ b/src/data/scripts/populate_db.py @@ -18,7 +18,7 @@ def populate_db(): # Insert printers printers = scrape_printers() for printer in printers: - insert_printer(printer['Location'], printer['Description'], printer['Coordinates'][0], printer['Coordinates'][1]) + insert_printer(printer['Location'], printer['Description'], printer['Labels'], printer['Coordinates'][0], printer['Coordinates'][1]) if __name__ == "__main__": populate_db() \ No newline at end of file diff --git a/src/data/scripts/run-migrations.js b/src/data/scripts/run-migrations.js new file mode 100644 index 00000000..5e499fc3 --- /dev/null +++ b/src/data/scripts/run-migrations.js @@ -0,0 +1,112 @@ +// Imports necessary for data migrations +import fs from 'fs' // Node's built-in file system module, which lets us read from disk +import path from 'path'; // Safer way to express file paths/path joining +import crypto from 
+import Database from 'better-sqlite3';
+import { fileURLToPath, pathToFileURL } from 'url';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+// Possible fallback if the env var is unset: || path.join(__dirname, "../transit.db")
+const DB_PATH = process.env.DB_PATH; // Path to the SQLite database file, read from the environment
+const MIGRATIONS_DIR = path.join(__dirname, "../migrations");
+
+/**
+ * Hashes a string using SHA-256
+ *
+ * We use this to store the checksum of each migration file in the database.
+ * This allows us to track which migrations have been applied, as well as whether a migration file has been modified since it was last applied.
+ *
+ * @param {string} s - The string to hash
+ * @returns {string} - The SHA-256 hash of the string
+ */
+function sha256(s) {
+    return crypto.createHash('sha256').update(s, 'utf8').digest('hex');
+}
+
+/**
+ * Runs the migrations
+ *
+ * This function reads the .sql files in the migrations directory, applies any that have not yet been applied (in sorted filename order), and records each applied migration in schema_migrations along with its checksum.
+ *
+ * @returns {void}
+ * @throws {Error} - If a migration fails
+ */
+function runMigration() {
+    // Open the database using the better-sqlite3 library
+    const db = new Database(DB_PATH);
+
+    // Set pragma defaults for migrations
+    db.pragma('journal_mode = WAL');
+    db.pragma('synchronous = NORMAL');
+    db.pragma('foreign_keys = ON');
+
+    // Create the schema_migrations table if it doesn't exist, for tracking migrations applied to the database
+    db.exec(`
+        CREATE TABLE IF NOT EXISTS schema_migrations (
+            id INTEGER PRIMARY KEY,
+            filename TEXT NOT NULL UNIQUE,
+            checksum TEXT NOT NULL,
+            applied_at TEXT NOT NULL DEFAULT (datetime('now'))
+        )
+    `);
+
+    // Get the list of migrations that have already been applied to the database
+    const applied = new Set(
+        db.prepare('SELECT filename FROM schema_migrations').all().map(record => record.filename)
+    );
+
+    // Get the migration files in the migrations directory (keeping only .sql files and sorting them by filename, i.e., chronologically)
+    const files = fs.readdirSync(MIGRATIONS_DIR).filter(f => f.endsWith('.sql')).sort();
+
+    // Prepare the statement that records a migration in the schema_migrations table
+    const insertMig = db.prepare(`
+        INSERT INTO schema_migrations (filename, checksum) VALUES (?,?)
+    `);
+
+    // Define a transaction that executes the pending migrations
+    const transaction = db.transaction(() => {
+        for (const file of files) {
+            // Skip if the migration has already been applied
+            if (applied.has(file)) {
+                continue;
+            }
+
+            const full = path.join(MIGRATIONS_DIR, file);
+            const sql = fs.readFileSync(full, 'utf8').trim();
+            if (!sql) {
+                continue;
+            }
+
+            // Defensive: re-enable FKs inside each run (already done in the migration files themselves, but just in case)
+            db.exec('PRAGMA foreign_keys = ON;');
+
+            // Execute the SQL commands in the migration file
+            db.exec(sql);
+
+            // Record the migration as applied to the database, together with its checksum
+            insertMig.run(file, sha256(sql));
+            console.log(`Applied ${file}`);
+        }
+    });
+
+    try {
+        transaction();
+        console.log('All migrations applied');
+    } catch (e) {
+        console.error("Migration failed", e);
+        process.exitCode = 1; // Signal failure so chained commands (e.g., "npm run migrate && ...") stop
+    } finally {
+        db.close();
+    }
+}
+
+export function runMigrations() {
+    runMigration();
+}
+
+// Run directly when invoked as a script (e.g., via "npm run migrate")
+if (import.meta.url === pathToFileURL(process.argv[1]).href) {
+    runMigrations();
+}
\ No newline at end of file
diff --git a/src/swagger.json b/src/swagger.json
index ff9b0afe..fc7f7348 100644
--- a/src/swagger.json
+++ b/src/swagger.json
@@ -66,7 +66,7 @@
         ],
         "responses": {
           "200": {
-            "description": "{\"success\": true, \"data\": [{\"id\": 1, \"location\": \"Akwe:kon\", \"description\": \"Color - Room 115\", \"latitude\": 42.4563, \"longitude\": -76.4806}]}",
+            "description": "{\"success\": true, \"data\": [{\"id\": 1, \"location\": \"Akwe:kon\", \"description\": \"Room 115\", \"latitude\": 42.4563, \"longitude\": -76.4806, \"labels\": [\"Color\"]}]}",
             "schema": {
               "$ref": "#/components/schemas/BusStop"
             }
diff --git a/src/utils/EcosystemUtils.js b/src/utils/EcosystemUtils.js
index 5aadd2b8..a5e979ab 100644
--- a/src/utils/EcosystemUtils.js
+++ b/src/utils/EcosystemUtils.js
@@ -45,7 +45,7 @@ function fetchAllPrinters() {
     });

     // Fetch printers
-    db.all("SELECT * FROM printers", (err, rows) => {
+    // NOTE: GROUP_CONCAT cannot combine DISTINCT with a custom separator in SQLite, and duplicates cannot occur here anyway because printer_labels has PRIMARY KEY (printer_id, label_id)
+    db.all("SELECT p.id, p.location, p.description, p.latitude, p.longitude, COALESCE(GROUP_CONCAT(l.label, ', '), '') AS labels FROM printers p LEFT JOIN printer_labels pl ON p.id = pl.printer_id LEFT JOIN labels l ON pl.label_id = l.id GROUP BY p.id", (err, rows) => {
     if (err) {
       console.error(err.message);
       return reject(err);