diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 5b928bb7..00000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.gitignore b/.gitignore index 41635879..a8fbbdc4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ build/ logs/ node_modules/ +__pycache__/ # Specific Files config.json diff --git a/package-lock.json b/package-lock.json index 826744f3..f6b08421 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,6 +9,7 @@ "version": "1.0.0", "license": "ISC", "dependencies": { + "better-sqlite3": "^12.4.1", "dotenv": "^16.4.7", "express": "^4.21.2", "firebase-admin": "^13.1.0", @@ -791,6 +792,20 @@ "tweetnacl": "^0.14.3" } }, + "node_modules/better-sqlite3": { + "version": "12.4.1", + "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-12.4.1.tgz", + "integrity": "sha512-3yVdyZhklTiNrtg+4WqHpJpFDd+WHTg2oM7UcR80GqL05AOV0xEJzc6qNvFYoEtE+hRp1n9MpN6/+4yhlGkDXQ==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "bindings": "^1.5.0", + "prebuild-install": "^7.1.1" + }, + "engines": { + "node": "20.x || 22.x || 23.x || 24.x" + } + }, "node_modules/bignumber.js": { "version": "9.1.2", "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.1.2.tgz", diff --git a/package.json b/package.json index 057904ac..b4d1f5c2 100644 --- a/package.json +++ b/package.json @@ -6,12 +6,15 @@ "type": "module", "scripts": { "start:dev": "nodemon --ignore src/data/notifRequests.json src/index.js", - "start": "node src/index.js" + "start": "node src/index.js", + "migrate": "node src/data/scripts/run-migrations.js", + "populate:db": "npm run migrate && python3 src/data/scripts/populate_db.py" }, "keywords": [], "author": "", "license": "ISC", "dependencies": { + "better-sqlite3": "^12.4.1", "dotenv": "^16.4.7", "express": "^4.21.2", "firebase-admin": "^13.1.0", diff --git a/src/.DS_Store b/src/.DS_Store index a39c96c4..807e9445 100644 Binary files a/src/.DS_Store and b/src/.DS_Store differ diff 
--git a/src/data/db/database.py b/src/data/db/database.py index efdf541e..9b857b2a 100644 --- a/src/data/db/database.py +++ b/src/data/db/database.py @@ -32,18 +32,54 @@ def insert_library(location, address, latitude, longitude): conn.close() -def insert_printer(location, description, latitude, longitude): +def insert_printer(location, description, labels, latitude, longitude): """Insert a printer into the database.""" conn = get_db_connection() cursor = conn.cursor() + # We remove the "OR IGNORE" because we acknowledge that several printers may have the same location and description (i.e., same building and room), so we rely on the unique printer_id to identify the printer cursor.execute( """ - INSERT OR IGNORE INTO printers (location, description, latitude, longitude) + INSERT INTO printers (location, description, latitude, longitude) VALUES (?, ?, ?, ?) """, (location, description, latitude, longitude), ) + # To get the printer_id, we do NOT rely on the location/description/coordinates, but rather on the printer_id that was just inserted (lastrowid), as several printers may have the same location and description (i.e., same building and room) + printer_id = cursor.lastrowid + + # Insert labels into the labels table and get their IDs + label_ids = [] + for label in labels: + cursor.execute( + """ + INSERT OR IGNORE INTO labels (label) + VALUES (?) + """, + (label,), + ) + cursor.execute( + """ + SELECT id FROM labels WHERE label = ? + """, + (label,), + ) + result = cursor.fetchone() + if result is None: + raise ValueError(f"Failed to find label: {label}") + label_id = result[0] + label_ids.append(label_id) + + # Insert into junction table + for label_id in label_ids: + cursor.execute( + """ + INSERT OR IGNORE INTO printer_labels (printer_id, label_id) + VALUES (?, ?) 
+ """, + (printer_id, label_id), + ) + conn.commit() conn.close() diff --git a/src/data/db/models.py b/src/data/db/models.py index 7634fd0e..8183be91 100644 --- a/src/data/db/models.py +++ b/src/data/db/models.py @@ -31,7 +31,7 @@ def create_tables(): """ CREATE TABLE IF NOT EXISTS printers ( id INTEGER PRIMARY KEY AUTOINCREMENT, - location TEXT UNIQUE, + location TEXT, description TEXT, latitude REAL, longitude REAL diff --git a/src/data/migrations/2025117_1854_create_labels.sql b/src/data/migrations/2025117_1854_create_labels.sql new file mode 100644 index 00000000..3884e988 --- /dev/null +++ b/src/data/migrations/2025117_1854_create_labels.sql @@ -0,0 +1,6 @@ +PRAGMA foreign_keys = ON; + +CREATE TABLE IF NOT EXISTS labels ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + label TEXT UNIQUE NOT NULL +); \ No newline at end of file diff --git a/src/data/migrations/2025117_1859_create_printer_labels.sql b/src/data/migrations/2025117_1859_create_printer_labels.sql new file mode 100644 index 00000000..73fd9c06 --- /dev/null +++ b/src/data/migrations/2025117_1859_create_printer_labels.sql @@ -0,0 +1,9 @@ +PRAGMA foreign_keys = ON; + +CREATE TABLE IF NOT EXISTS printer_labels ( + printer_id INTEGER NOT NULL, + label_id INTEGER NOT NULL, + PRIMARY KEY (printer_id, label_id), + FOREIGN KEY (printer_id) REFERENCES printers(id) ON DELETE CASCADE, + FOREIGN KEY (label_id) REFERENCES labels(id) ON DELETE CASCADE +); \ No newline at end of file diff --git a/src/data/scrapers/printers.py b/src/data/scrapers/printers.py index e972046f..ea40cd69 100644 --- a/src/data/scrapers/printers.py +++ b/src/data/scrapers/printers.py @@ -1,37 +1,292 @@ import requests -from bs4 import BeautifulSoup +from difflib import get_close_matches # For data scraping +from difflib import SequenceMatcher +import re # For using regex +import unicodedata # Handles text encoding at Unicode level # URL of the CU Print directory page -URL = 
"https://www.cornell.edu/about/maps/directory/?layer=CUPrint&caption=%20CU%20Print%20Printers" # Replace with the actual URL +# URL = "https://www.cornell.edu/about/maps/directory/?layer=CUPrint&caption=%20CU%20Print%20Printers" # Replace with the actual URL -def scrape_printers(): - # Send a GET request to fetch the HTML content - response = requests.get(URL) - soup = BeautifulSoup(response.text, 'html.parser') +URL = 'https://www.cornell.edu/about/maps/directory/text-data.cfm?layer=CUPrint&caption=%20CU%20Print%20Printers' + +# HTTP headers to mimic a real browser request +HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36", + "Referer": 'https://www.cornell.edu/about/maps/directory/', + "X-Requested-With": 'XMLHttpRequest', + "Accept": 'application/json, text/javascript, */*', +} + +# Canonical list of Cornell buildings +# NOTE: This list is not exhaustive. Add more buildings as needed... +CANONICAL_BUILDINGS = [ + "Akwe:kon", + "Alice Cook House", + "Baker Lab", + "Barton Hall", + "Becker House", + "Breazzano Center", + "Catherwood Library", + "Clark Hall", + "College of Veterinary Medicine", + "Court-Kay-Bauer Hall", + "Dickson", + "Ecology House", + "Flora Rose House", + "Ganedago", + "Hans Bethe House", + "Hollister Hall", + "Ives Hall", + "John Henrik Clarke Africana Library", + "Keeton House", + "Kroch Library", + "Latino Living Center", + "Law Library", + "Lincoln Hall", + "Mann Library", + "Martha Van Rensselaer Hall", + "Mary Donlon Hall", + "Math Library", + "Mews Hall", + "Milstein Hall", + "Morrison Hall", + "Myron Taylor", + "Olin Library", + "Phillips Hall", + "Plant Science", + "RPCC", + "Rand Hall", + "Rhodes Hall", + "Risley Hall", + "Rockefeller Lab", + "Ruth Bader Ginsburg Hall", + "Sage Hall", + "Schwartz Center", + "Sibley Hall", + "Statler Hall", + "Stimson", + "Tjaden Hall", + "Toni Morrison", + "Ujamaa", + "Upson Hall", + "Uris Library", + "Vet 
Library", + "Warren Hall", + "White Hall", + "Willard Student Center" +] + +# Regex helpers +HTML_TAG_RE = re.compile(r"<[^>]+>") +BRACKET_CONTENT_RE = re.compile(r"[\(\[\{].*?[\)\]\}]") +MULTI_SPACE_RE = re.compile(r"\s+") +TRAILING_CAPS_RE = re.compile(r"\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\s*$") + +# Used for stripping common label phrases from building names +LABEL_PHRASES_RE = re.compile( + r""" + \bresidents?\s*only\b | + \bstudents?\s*only\b | + \baa\s*&\s*p\b | + \baap\b + """, re.IGNORECASE | re.VERBOSE +) + +# Used to identify common variants of labels +LABEL_PATTERNS = { + # --- Access restrictions --- + # Residents Only (singular/plural + optional hyphen + any case) + "Residents Only": re.compile(r"\bresident[s]?[-\s]*only\b", re.IGNORECASE), + + # AA&P Students Only (accept AA&P or AAP; allow any junk in-between; optional hyphen) + "AA&P Students Only": re.compile( + r"\b(?:aa\s*&\s*p|aap)\b.*\bstudent[s]?[-\s]*only\b", + re.IGNORECASE + ), + + # Landscape Architecture Students Only (allow arbitrary whitespace; optional hyphen) + "Landscape Architecture Students Only": re.compile( + r"\blandscape\s+architecture\b.*\bstudent[s]?[-\s]*only\b", + re.IGNORECASE + ), + + # --- Printer capabilities --- + "Color, Scan, & Copy": re.compile( + r"\bcolor\s*[,/&]?\s*(?:scan\s*[,/&]?\s*)?(?:and\s*)?\s*&?\s*(?:copy|print|copying)\b", re.IGNORECASE + ), + "Black & White": re.compile( + r"\b(?:black\s*(?:and|&)\s*white|b\s*&\s*w)\b", re.IGNORECASE + ), + "Color": re.compile(r"\bcolor\b", re.IGNORECASE), + +} + +# Used for stripping residual trailing labels from descriptions +RESIDUAL_TRAILING_LABEL_RE = re.compile( + r"\b(?:resident|residents|student|students|staff|public)\b\s*$", + re.IGNORECASE +) + +def _norm(s): + """ + Unicode/HTML/whitespace normalization. 
+ """ + if s is None: + return "" + s = unicodedata.normalize('NFKC', s) # Normalizes unicode text + s = HTML_TAG_RE.sub(" ", s) + s = s.replace("*", " ") + s = BRACKET_CONTENT_RE.sub(" ", s) + s = MULTI_SPACE_RE.sub(" ", s).strip() + return s + +def _strip_trailing_allcaps(s): + """ + Remove trailing ALL-CAPS qualifiers (e.g., RESIDENTS ONLY). + """ + return TRAILING_CAPS_RE.sub("", s).strip() + +def _pre_clean_for_match(s: str) -> str: + """ + Pre-clean a building name for matching against the canonical list. + """ + s = _norm(s) + s = LABEL_PHRASES_RE.sub(" ", s) # <— removes "Resident(s) only", "AA&P", etc. + s = _strip_trailing_allcaps(s) + s = RESIDUAL_TRAILING_LABEL_RE.sub(" ", s) # <— removes "Resident", "Students", etc. + + s = re.sub(r"[^\w\s\-’']", " ", s) # punctuation noise + s = re.sub(r"\s+", " ", s).strip() + return s + +def _token_sort(s): + """ + Tokenize a string, sort the tokens, and re-join them. + """ + tokens = s.lower().split() + tokens.sort() + return " ".join(tokens) + +def map_building(name, threshold=87): + """ + Map a building name to a canonical building name using fuzzy matching. 
+ """ + if not name: + return None, 0 + + query = _token_sort(_pre_clean_for_match(name)) + canon_token_list = [_token_sort(_pre_clean_for_match(c)) for c in CANONICAL_BUILDINGS] + + # Returns a list of the (top-1) closest match to the cleaned name + best = get_close_matches(query, canon_token_list, n=1) + + # If no matches (empty list), return the original name and 0 + if not best: + return name, 0 - # Locate the table - table = soup.find("table", {"id": "directoryTable"}) - rows = table.find("tbody").find_all("tr") + # Return the closest match and its similarity score + match = best[0] - # Extract data + # Calculate the similarity score of the match to the original name (for internal use, potential debugging purposes) + index = canon_token_list.index(match) + canon_raw = CANONICAL_BUILDINGS[index] + score = int(SequenceMatcher(None, query, match).ratio() * 100) + + # If the score is below the threshold, return the original name instead of the canonical name + return (canon_raw, score) if score >= threshold else (name, score) + +def map_labels(text): + """ + Extract label tokens from the description. 
+ """ + if not text: + return text, [] + + cleaned = _norm(text) + found_labels = [] + + for canon, pattern in LABEL_PATTERNS.items(): + # Search for the pattern in the cleaned text + if pattern.search(cleaned): + found_labels.append(canon) + cleaned = pattern.sub("", cleaned, count=1).strip() + + # Collapse runs of punctuation-delimiters to a single space + cleaned = re.sub(r"\s*[,;/|&\-–—:]+\s*", " ", cleaned) + + # Remove any leftover leading delimiters/spaces (e.g., ", ", "- ") + cleaned = re.sub(r"^[\s,;/|&\-–—:]+", "", cleaned) + + # Remove standalone "Copy", "Print", or "Scan" at the start (leftover from partial label removal) + cleaned = re.sub(r"^(?:copy|print|scan)\s+", "", cleaned, flags=re.IGNORECASE) + + # Final whitespace cleanup + cleaned = re.sub(r"\s+", " ", cleaned).strip() + + return cleaned, sorted(set(found_labels)) + +def fetch_printers_json(): + """ + Fetch printer data in JSON format from the CU Print directory endpoint. + """ + resp = requests.get(URL, headers=HEADERS, timeout=20) + resp.raise_for_status() + return resp.json() + +def scrape_printers(): + """ + Scrape CU Print printer locations from the Cornell directory page. 
+ """ + payload = fetch_printers_json() data = [] - for row in rows: - cols = row.find_all("td") - if len(cols) < 3: # Ensure row has enough columns + + # payload['rows'] is a list of lists, where each inner list represents a row of data + for row in payload['rows']: + if len(row) < 3: # Ensure row has enough columns + continue # Skipping row with insufficient columns + + # Each row is of the structure ["Building", "Equipment & Location", "Coordinates (Lat, Lng)"] + [raw_building, raw_location, raw_coordinates] = row + + # Map raw building name to canonical building name + building, _ = map_building(raw_building) + + # If we weren't able to map the building to a canonical building, skip this row + # NOTE: This should prevent us from getting "None" as the location, which was happening earlier + if building not in CANONICAL_BUILDINGS: continue + + # Map labels from description to canonical labels + labels = [] - location_name = cols[0].text.strip() - description = cols[1].text.strip() + _, building_labels = map_labels(raw_building) # Get labels from the building name (e.g., "Residents Only") + remainder, location_labels = map_labels(raw_location) # Get labels from the location description (e.g., "Landscape Architecture Student ONLY") + + # Deduplicate and sort labels + labels += building_labels + labels += location_labels + labels = sorted(set(labels)) - # Extract coordinates from the hyperlink tag inside