From 0c3aebb1134fd076e8213b6c72e1bc868fb7bd93 Mon Sep 17 00:00:00 2001 From: LuckySugar0716 Date: Tue, 2 Sep 2025 04:35:40 +0800 Subject: [PATCH 01/20] feat: crawl_and_import_data --- apps/spider/crawlers/orc.py | 690 ++++++++++++++++++++++++++++++------ 1 file changed, 582 insertions(+), 108 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index a9bc782..9f7ab12 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -1,139 +1,596 @@ import re from urllib.parse import urljoin -from apps.spider.utils import retrieve_soup # parse_number_and_subnumber, +import json +import requests +import time +from collections import defaultdict + from apps.web.models import Course, CourseOffering, Instructor from lib.constants import CURRENT_TERM -BASE_URL = "https://www.ji.sjtu.edu.cn/" -ORC_BASE_URL = urljoin(BASE_URL, "/academics/courses/courses-by-number/") -# ORC_UNDERGRAD_SUFFIX = "Departments-Programs-Undergraduate" -# ORC_GRADUATE_SUFFIX = "Departments-Programs-Graduate" -COURSE_DETAIL_URL_PREFIX = ( - "https://www.ji.sjtu.edu.cn/academics/courses/courses-by-number/course-info/?id=" -) -UNDERGRAD_URL = ORC_BASE_URL +# API端点配置 +BASE_URL = "https://coursesel.umji.sjtu.edu.cn" +COURSE_DETAIL_URL_PREFIX = urljoin(BASE_URL, "/course/") + +# 兼容性配置(保留旧版本接口) +ORC_BASE_URL = BASE_URL +UNDERGRAD_URL = BASE_URL + +# 正则表达式模式 INSTRUCTOR_TERM_REGEX = re.compile(r"^(?P\w*)\s?(\((?P\w*)\))?") -# SUPPLEMENT_URL = "http://dartmouth.smartcatalogiq.com/en/2016s/Supplement/Courses" - -# COURSE_HEADING_CORRECTIONS = { -# "COLT": {"7 First Year Seminars": "COLT 7 First Year Seminars"}, -# "GRK": {"GRK 1.02-3.02 Intensive Greek": "GRK 1.02 Intensive Greek"}, -# "INTS": { -# "INTS INTS 17.04 Migration Stories": "INTS 17.04 Migration Stories", -# }, -# "MALS": { -# "MALS MALS 368 Seeing and Feeling in Early Modern Europe": ( -# "MALS 368 Seeing and Feeling in Early Modern Europe" -# ), -# }, -# "PSYC": {"$name": None}, -# "QBS": { -# "Quantitative Biomedical Sciences 132-2 Molecular Markers in Human " -# "Health Studies Lab": ( -# "QBS 132.02 Molecular Markers in Human Health Studies Lab" -# ), -# }, -# } +class CourseSelCrawler: + """ + JI SJTU 课程选择系统爬虫模块 -def crawl_program_urls(): - for orc_url in [UNDERGRAD_URL]: - program_urls = _get_department_urls_from_url(orc_url) - return program_urls + 该模块负责从上海交通大学密西根学院的课程选择系统中爬取课程信息, + 包括课程基本信息、先修课程要求、教师信息等。 + 数据源说明: + 1. 选课任务API: 提供当前学期的课程开设信息、教师信息等 + 2. 课程目录API: 提供课程详细描述、额外的教师信息等 + 3. 先修课程API: 提供完整的先修课程逻辑关系 + """ -def _get_department_urls_from_url(url): - soup = retrieve_soup(url) - linked_urls = [urljoin(BASE_URL, a["href"]) for a in soup.find_all("a", href=True)] - return set( - linked_url for linked_url in linked_urls if _is_department_url(linked_url) - ) + def __init__(self): + """ + 初始化爬虫实例 + + 设置HTTP会话、cookies和headers,确保能够正常访问课程选择系统的API。 + """ + self.session = requests.Session() + # 必要的认证cookies(从浏览器中获取的有效会话ID) + cookies = { + "JSESSIONID": "your_own_cookie", + } + self.session.cookies.update(cookies) -def _is_department_url(candidate_url): - return candidate_url.startswith(COURSE_DETAIL_URL_PREFIX) + # 模拟浏览器请求的headers + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Accept": "application/json, text/javascript, */*; q=0.01", + "Referer": "https://coursesel.umji.sjtu.edu.cn/", + "X-Requested-With": "XMLHttpRequest", # 标识AJAX请求 + } + self.session.headers.update(headers) + def get_all_courses(self): + """ + 获取所有课程数据的主入口方法 + + 该方法协调多个数据源,获取完整的课程信息: + 1. 
选课任务数据(包含当前学期开设信息) + 2. 课程目录数据(包含课程描述等详细信息) + 3. 先修课程数据(包含先修要求的逻辑关系) + + Returns: + list: 包含所有课程信息的字典列表,每个字典包含: + - course_code: 课程代码 + - course_title: 课程标题 + - department: 院系代码 + - number: 课程编号 + - course_credits: 学分数 + - pre_requisites: 先修课程要求(字符串形式的逻辑表达式) + - description: 课程描述 + - instructors: 教师列表 + - url: 课程详情页面URL + """ + # 获取三个主要数据源 + courses_data = self._get_lesson_tasks() + course_details = self._get_course_catalog() + prerequisites = self._get_prerequisites() -def _crawl_course_data(course_url): - soup = retrieve_soup(course_url) - course_heading = soup.find("h2").get_text() - if course_heading: - split_course_heading = course_heading.split(" – ") - children = list(soup.find_all(class_="et_pb_text_inner")[3].children) - - course_code = split_course_heading[0] - department = re.findall(r"^([A-Z]{2,4})\d+", course_code)[0] - number = re.findall(r"^[A-Z]{2,4}(\d{3})", course_code)[0] - course_title = split_course_heading[1] - - course_credits = 0 - pre_requisites = "" - description = "" - course_topics = [] - instructors = [] - - for i, child in enumerate(children): - text = child.get_text(strip=True) if hasattr(child, "get_text") else "" - if "Credits:" in text: - course_credits = int(re.findall(r"\d+", text)[0]) - elif "Pre-requisites:" in text: - pre_requisites = extract_prerequisites(text) - elif "Description:" in text: - description = ( - children[i + 2].get_text(strip=True) - if i + 2 < len(children) - else "" - ) - if description == "\n" or "Course Topics" in description: - description = "" - elif "Course Topics:" in text: - course_topics = ( - [li.get_text(strip=True) for li in children[i + 2].find_all("li")] - if i + 2 < len(children) - else [] + # 整合所有数据源的信息 + return self._integrate_course_data(courses_data, course_details, prerequisites) + + def _get_lesson_tasks(self): + """ + 获取选课任务数据(第一个数据源) + + 从选课系统的主API获取当前学期的所有选课任务信息。 + 这个API返回的数据包含: + - 课程基本信息(课程代码、名称、学分等) + - 教师信息 + - 开课时间和地点 + - 选课限制等信息 + + Returns: + list: 选课任务列表,每个任务包含课程的基本信息 + """ + url = f"{BASE_URL}/tpm/findLessonTasksPreview_ElectTurn.action" + + # API请求参数,控制返回的数据类型 + json_params = { + "isToTheTime": True, # 是否到了选课时间 + "electTurnId": "93B7BAF9-7E8B-4D32-BCC8-DE49B320AB0A", # 选课轮次ID + "loadCourseGroup": True, # 加载课程组信息 + "loadElectTurn": True, # 加载选课轮次信息 + "loadCourseType": True, # 加载课程类型信息 + "loadCourseTypeCredit": True, # 加载课程类型学分信息 + "loadElectTurnResult": True, # 加载选课结果 + "loadStudentLessonTask": True, # 加载学生选课任务 + "loadPrerequisiteCourse": True, # 加载先修课程信息 + "loadLessonCalendarWeek": True, # 加载课程日历周信息 + "loadLessonCalendarConflict": True, # 加载时间冲突检查 + "loadTermCredit": True, # 加载学期学分信息 + "loadLessonTask": True, # 加载课程任务详情 + "loadDropApprove": True, # 加载退课审批信息 + "loadElectApprove": True, # 加载选课审批信息 + } + + # 构建GET请求URL(该API要求JSON参数作为查询字符串) + import urllib.parse + json_string = json.dumps(json_params, separators=(",", ":")) + encoded_json = urllib.parse.quote(json_string) + full_url = f"{url}?jsonString={encoded_json}" + + try: + response = self.session.get(full_url, timeout=30) + response.raise_for_status() + data = response.json() + + # 检查API响应格式并提取课程任务数据 + if data.get("success") and "data" in data and "lessonTasks" in data["data"]: + return data["data"]["lessonTasks"] + return [] + except Exception: + # 发生错误时返回空列表,不中断整个爬取流程 + return [] + + def _get_course_catalog(self): + """ + 获取课程目录数据(第二个数据源) + + 从课程目录API获取课程的详细描述信息。 + 这个API主要提供: + - 课程的详细描述 + - 课程的英文名称 + - 额外的教师信息 + - 其他补充信息 + + Returns: + dict: 以courseId为键的课程详情字典 + """ + url = f"{BASE_URL}/jdji/tpm/findOwnCollegeCourse_JiCourse.action" + + try: + 
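            # Assumed response shape (inferred from the parsing below and from the
            # catalog_data fields read later in _build_course_record; the actual
            # coursesel payload is not guaranteed to match this sketch):
            #   {"success": true,
            #    "data": {"courses": [{"courseId": "...", "courseNameEn": "...",
            #                          "description": "...", "teacherName": "..."}]}}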
response = self.session.post(url, json={}) + response.raise_for_status() + data = response.json() + + if data.get("success") and "data" in data and "courses" in data["data"]: + # 将课程列表转换为courseId索引的字典以便快速查找 + return { + course.get("courseId"): course for course in data["data"]["courses"] + } + return {} + except Exception: + # 出错时返回空字典,不影响主流程 + return {} + + def _get_prerequisites(self): + """ + 获取先修课程数据(第三个数据源) + + 从先修课程API获取所有课程的先修要求。 + 这个API返回的数据包含: + - 课程ID和对应的先修课程ID + - 先修课程的逻辑关系(AND、OR等) + - 先修要求的详细描述(prerequisiteRuleDesc) + + Returns: + dict: 以courseId为键的先修课程列表字典 + 每个值是一个列表,包含该课程的所有先修要求项 + """ + url = f"{BASE_URL}/tpm/findAll_PrerequisiteCourse.action" + + try: + # 添加时间戳参数防止缓存 + response = self.session.post(url, params={"_t": int(time.time() * 1000)}) + response.raise_for_status() + data = response.json() + + # 调试信息:记录API响应状态 + print(f"[DEBUG] Prerequisites API response: success={data.get('success')}") + print( + f"[DEBUG] Data keys: {list(data.keys()) if isinstance(data, dict) else 'Not dict'}" + ) + + if data.get("success") and "data" in data: + raw_prereqs = data["data"] + print(f"[DEBUG] Raw prerequisites data: {len(raw_prereqs)} items") + + # 显示第一个先修要求的结构作为调试信息 + if raw_prereqs and len(raw_prereqs) > 0: + print(f"[DEBUG] First prerequisite item: {raw_prereqs[0]}") + + # 按courseId分组先修课程数据,这样可以快速查找每个课程的所有先修要求 + prereqs = defaultdict(list) + for item in raw_prereqs: + course_id = item.get("courseId") + if course_id: + prereqs[course_id].append(item) + + print( + f"[DEBUG] Grouped prerequisites: {len(prereqs)} course IDs have prereqs" ) - elif "Instructors:" in text: - instructors_text = ( - children[i + 2].get_text(strip=True) - if i + 2 < len(children) - else "" + return prereqs + else: + print("[DEBUG] Prerequisites API failed or no data") + return {} + except Exception as e: + print(f"[DEBUG] Prerequisites API error: {str(e)}") + return {} + + def _integrate_course_data(self, courses_data, course_details, prerequisites): + """ + 整合多个数据源的课程信息 + + 将三个不同API获取的数据进行整合,生成完整的课程信息: + 1. courses_data: 选课任务数据(主要数据源) + 2. course_details: 课程目录数据(补充描述等信息) + 3. 
prerequisites: 先修课程数据(先修要求) + + 数据整合策略: + - 以课程代码为主键进行分组 + - 合并同一课程的不同section信息 + - 优先使用更完整的数据字段 + - 为每个课程匹配对应的先修要求 + + Args: + courses_data (list): 选课任务数据列表 + course_details (dict): 课程详情字典,以courseId为键 + prerequisites (dict): 先修课程字典,以courseId为键 + + Returns: + list: 整合后的课程信息列表 + """ + print( + f"[DEBUG] Starting integration with {len(courses_data)} courses, {len(prerequisites)} prereq groups" + ) + + # 按课程代码分组:同一门课程可能有多个section + courses_by_code = defaultdict(list) + for course in courses_data: + course_code = course.get("courseCode") + if course_code: + courses_by_code[course_code].append(course) + + integrated_courses = [] + courses_with_prereqs = 0 + + # 遍历每个课程代码,整合数据 + for course_code, course_list in courses_by_code.items(): + # 合并同课程的不同section(主要是合并教师信息) + merged = self._merge_course_sections(course_list) + if not merged: + continue + + # 获取课程ID并查找对应的详细信息和先修要求 + course_id = merged.get("courseId") + catalog_info = course_details.get(course_id, {}) + prereq_info = prerequisites.get(course_id, []) + + # 统计有先修要求的课程数量(用于调试) + if prereq_info: + courses_with_prereqs += 1 + print( + f"[DEBUG] Course {course_code} (ID: {course_id}) has {len(prereq_info)} prereqs" ) - instructors = [ - name.strip() for name in instructors_text.split(";") if name.strip() - ] - result = { + # 构建最终的课程数据记录 + course_data = self._build_course_record( + course_code, merged, catalog_info, prereq_info + ) + + if course_data: + integrated_courses.append(course_data) + + print( + f"[DEBUG] Integration complete: {courses_with_prereqs} courses have prerequisites" + ) + return integrated_courses + + def _merge_course_sections(self, course_list): + """ + 合并同一课程的不同section信息 + + 同一门课程可能有多个section(如不同时间的讲座、实验课等), + 需要将它们的信息合并,特别是教师信息。 + + Args: + course_list (list): 同一课程的section列表 + + Returns: + dict: 合并后的课程信息,包含所有section的教师信息 + """ + if not course_list: + return {} + + # 以第一个section为基础进行合并 + merged = course_list[0].copy() + all_instructors = set() + + # 收集所有section的教师信息 + for course in course_list: + teachers = course.get("lessonTaskTeam", "") + if teachers: + # 处理多种分隔符(中英文逗号、分号等) + for teacher in re.split(r"[,;,;、]", teachers): + if teacher.strip(): + all_instructors.add(teacher.strip()) + + # 将所有教师信息存储到合并后的课程数据中 + merged["all_instructors"] = list(all_instructors) + return merged + + def _build_course_record(self, course_code, main_data, catalog_data, prereq_data): + """ + 构建标准格式的课程记录 + + 将来自不同数据源的课程信息整合成统一的格式。 + 数据优先级:main_data > catalog_data(优先使用主数据源) + + Args: + course_code (str): 课程代码(如 "ECE2150J") + main_data (dict): 选课任务数据(主数据源) + catalog_data (dict): 课程目录数据(补充数据源) + prereq_data (list): 先修课程数据列表 + + Returns: + dict: 标准化的课程记录,如果数据无效则返回None + """ + # 1. 提取并验证课程标题 + course_title = self._extract_course_title(main_data, catalog_data) + if not course_title: + return None + + # 2. 解析课程代码中的院系和编号信息 + department, number = self._parse_course_code(course_code) + + # 3. 提取学分信息 + course_credits = self._extract_course_credits(main_data, catalog_data) + + # 4. 构建先修课程字符串 + prerequisites = self._build_prerequisites_string(course_code, prereq_data) + + # 5. 提取课程描述 + description = self._extract_description(main_data, catalog_data) + + # 6. 整合教师信息 + instructors = self._extract_instructors(main_data, catalog_data) + + # 7. 
构建课程URL + course_url = self._build_course_url(main_data) + + return { "course_code": course_code, "course_title": course_title, "department": department, "number": number, "course_credits": course_credits, - "pre_requisites": pre_requisites, + "pre_requisites": prerequisites, "description": description, - "course_topics": course_topics, + "course_topics": [], # 暂时为空,保持接口一致 "instructors": instructors, "url": course_url, } - return result - # return { - # "course_code": "QWER1234J", - # "course_title": "Test Course", - # "department": "QWER", - # "number": 1234, - # "course_credits": 4, - # "pre_requisites": None, - # "description": "This is a test course", - # "course_topics": ["Test Topic"], - # "instructors": ["Test Instructor"], - # "url": course_url, - # } + + def _extract_course_title(self, main_data, catalog_data): + """提取课程标题(优先使用英文名称)""" + return ( + main_data.get("courseNameEn", "") + or main_data.get("courseName", "") + or catalog_data.get("courseNameEn", "") + or catalog_data.get("courseName", "") + ).strip() + + def _parse_course_code(self, course_code): + """从课程代码中解析院系和编号""" + department = "" + number = 0 + + if course_code: + # 提取院系代码(字母部分) + dept_match = re.match(r"^([A-Z]+)", course_code) + if dept_match: + department = dept_match.group(1) + + # 提取课程编号(数字部分) + num_match = re.search(r"(\d+)", course_code) + if num_match: + number = int(num_match.group(1)) + + return department, number + + def _extract_course_credits(self, main_data, catalog_data): + """提取课程学分信息""" + course_credits = main_data.get("totalCredit", 0) or catalog_data.get("credit", 0) + + if isinstance(course_credits, str): + try: + course_credits = int(float(course_credits)) + except (ValueError, TypeError): + course_credits = 0 + + return course_credits + + def _build_prerequisites_string(self, course_code, prereq_data): + """构建先修课程字符串""" + if not prereq_data: + return "" + + print( + f"[DEBUG] Building prerequisites for {course_code}, prereq_data has {len(prereq_data)} items" + ) + + prereq_codes = [] + for item in prereq_data: + rule_desc = item.get("prerequisiteRuleDesc", "") + print(f"[DEBUG] Processing prerequisite rule: {rule_desc}") + + if rule_desc: + # 使用完整的规则描述 + prereq_codes.append(rule_desc) + + if prereq_codes: + prerequisites = " || ".join(prereq_codes) + print(f"[DEBUG] Final prerequisites for {course_code}: {prerequisites}") + return prerequisites + + return "" + + def _extract_description(self, main_data, catalog_data): + """提取课程描述信息""" + return ( + main_data.get("description", "") + or catalog_data.get("description", "") + or main_data.get("memo", "") + or catalog_data.get("memo", "") + ).strip() + + def _extract_instructors(self, main_data, catalog_data): + """整合教师信息""" + instructors = main_data.get("all_instructors", []) + teacher_name = catalog_data.get("teacherName", "") + + if teacher_name: + # 处理目录数据中的教师信息 + for teacher in re.split(r"[,;,;、]", teacher_name): + if teacher.strip() and teacher.strip() not in instructors: + instructors.append(teacher.strip()) + + return instructors + + def _build_course_url(self, main_data): + """构建课程详情页面URL""" + course_id = main_data.get("courseId") + return f"{COURSE_DETAIL_URL_PREFIX}{course_id}" if course_id else "" + + +# ============================================================================ +# 向后兼容性函数 +# 这些函数保持与旧版本爬虫的接口兼容,确保现有代码不会中断 +# ============================================================================ + +# 全局爬虫实例(单例模式) +_crawler = None + + +def _get_crawler(): + """ + 获取爬虫实例(单例模式) + + 使用单例模式确保整个应用中只有一个爬虫实例, + 避免重复初始化和多余的网络连接。 + + 
Returns: + CourseSelCrawler: 爬虫实例 + """ + global _crawler + if _crawler is None: + _crawler = CourseSelCrawler() + return _crawler + + +def crawl_program_urls(): + """ + 获取所有课程URL(兼容性接口) + + 这是旧版本爬虫的主要接口,现在内部使用新的API爬虫。 + 为了保持向后兼容性,该函数仍然返回课程URL列表, + 但同时会缓存完整的课程数据供其他函数使用。 + + Returns: + list: 课程URL列表 + """ + crawler = _get_crawler() + courses = crawler.get_all_courses() + + # 提取课程URL列表以保持接口兼容性 + course_urls = [] + for course in courses: + if course.get("url"): + course_urls.append(course["url"]) + + # 将完整的课程数据缓存起来,供 _crawl_course_data 函数使用 + # 这样避免了重复的网络请求 + if not hasattr(crawl_program_urls, "_course_data_cache"): + crawl_program_urls._course_data_cache = {} + + for course in courses: + if course.get("url"): + crawl_program_urls._course_data_cache[course["url"]] = course + + return course_urls + + +def _get_department_urls_from_url(_): + """ + 兼容性函数:从URL获取部门课程URL + + 注意:由于新的API架构,这个函数实际上直接调用主爬取函数。 + 参数被忽略,因为新的API不需要基于URL进行增量爬取。 + + Returns: + list: 课程URL列表 + """ + return crawl_program_urls() + + +def _is_department_url(candidate_url): + """ + 检查URL是否为有效的课程详情URL + + Args: + candidate_url (str): 候选URL + + Returns: + bool: 如果URL匹配课程详情页面格式则返回True + """ + return candidate_url.startswith(COURSE_DETAIL_URL_PREFIX) + + +def _crawl_course_data(course_url): + """ + 爬取单个课程数据(兼容性接口) + + 在新架构中,我们一次性获取所有课程数据并缓存, + 所以这个函数直接从缓存中返回数据,不进行实际的网络请求。 + + Args: + course_url (str): 课程详情页面URL + + Returns: + dict: 课程数据字典,如果未找到则返回空字典 + """ + # 从缓存中获取课程数据 + if hasattr(crawl_program_urls, "_course_data_cache"): + course_data = crawl_program_urls._course_data_cache.get(course_url) + if course_data: + return course_data + + # 如果缓存中没有找到数据,返回空字典 + return {} def import_department(department_data): + """ + 将课程数据导入数据库 + + 这个函数负责将爬取的课程数据保存到Django数据库中。 + 处理课程、教师和课程开设信息的创建和更新。 + + Args: + department_data (list): 课程数据列表,每个元素是课程信息字典 + """ for course_data in department_data: - course, created = Course.objects.update_or_create( + # 使用 update_or_create 确保数据的幂等性 + # 如果课程已存在则更新,否则创建新记录 + course, _ = Course.objects.update_or_create( course_code=course_data["course_code"], defaults={ "course_title": course_data["course_title"], @@ -144,32 +601,49 @@ def import_department(department_data): "description": course_data["description"], "course_topics": course_data["course_topics"], "url": course_data["url"], - # FIXME: invalid field source in course - # "source": Course.SOURCES.ORC, + # 注意:source字段在当前模型中不存在,已移除 }, ) - # Handle instructors + # 处理教师信息 if "instructors" in course_data and course_data["instructors"]: for instructor_name in course_data["instructors"]: + # 获取或创建教师记录 instructor, _ = Instructor.objects.get_or_create(name=instructor_name) - # Create a course offering for the current term if it doesn't exist + + # 为当前学期创建课程开设记录(如果不存在) offering, _ = CourseOffering.objects.get_or_create( course=course, term=CURRENT_TERM, defaults={"section": 1, "period": ""}, ) + # 将教师关联到课程开设记录 offering.instructors.add(instructor) def extract_prerequisites(pre_requisites): + """ + 处理先修课程字符串格式(兼容性函数) + + 这个函数对先修课程字符串进行标准化处理, + 统一格式和术语,使其更适合在系统中使用。 + + Args: + pre_requisites (str): 原始先修课程字符串 + + Returns: + str: 处理后的先修课程字符串 + """ result = pre_requisites + # 移除前缀标识 result = result.replace("Pre-requisites:", "").strip() + # 标准化学分要求术语 result = result.replace("Obtained Credit", "obtained_credit").strip() result = result.replace("Credits Submitted", "credits_submitted").strip() + # 标准化逻辑运算符格式(添加空格以提高可读性) result = result.replace("&&", " && ").strip() result = result.replace("||", " || ").strip() From ec757a3f79d1b2f6b0bb4ce6dc0f56829b6d8ae1 Mon Sep 17 00:00:00 
2001 From: LuckySugar0716 Date: Wed, 3 Sep 2025 19:56:08 +0800 Subject: [PATCH 02/20] fix: interactive cookie input --- apps/spider/crawlers/orc.py | 395 +++++++++--------------------------- uv.lock | 52 +---- 2 files changed, 93 insertions(+), 354 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index 9f7ab12..4abb3b4 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -9,120 +9,88 @@ from apps.web.models import Course, CourseOffering, Instructor from lib.constants import CURRENT_TERM -# API端点配置 +# API endpoints BASE_URL = "https://coursesel.umji.sjtu.edu.cn" COURSE_DETAIL_URL_PREFIX = urljoin(BASE_URL, "/course/") -# 兼容性配置(保留旧版本接口) +# Legacy compatibility ORC_BASE_URL = BASE_URL UNDERGRAD_URL = BASE_URL -# 正则表达式模式 INSTRUCTOR_TERM_REGEX = re.compile(r"^(?P\w*)\s?(\((?P\w*)\))?") class CourseSelCrawler: """ - JI SJTU 课程选择系统爬虫模块 + JI SJTU Course Selection System Crawler - 该模块负责从上海交通大学密西根学院的课程选择系统中爬取课程信息, - 包括课程基本信息、先修课程要求、教师信息等。 - - 数据源说明: - 1. 选课任务API: 提供当前学期的课程开设信息、教师信息等 - 2. 课程目录API: 提供课程详细描述、额外的教师信息等 - 3. 先修课程API: 提供完整的先修课程逻辑关系 + Crawls course data from three APIs: + 1. Lesson tasks API: course offerings and basic info + 2. Course catalog API: detailed descriptions + 3. Prerequisites API: prerequisite rules """ def __init__(self): - """ - 初始化爬虫实例 - - 设置HTTP会话、cookies和headers,确保能够正常访问课程选择系统的API。 - """ + """Initialize crawler with session and authentication""" self.session = requests.Session() - # 必要的认证cookies(从浏览器中获取的有效会话ID) - cookies = { - "JSESSIONID": "your_own_cookie", - } + print("Please enter your JSESSIONID cookie:") + print("(Found in browser dev tools under Network or Application tabs)") + jsessionid = input("JSESSIONID: ").strip() + + if not jsessionid: + raise ValueError("JSESSIONID cannot be empty") + + cookies = {"JSESSIONID": jsessionid} self.session.cookies.update(cookies) - # 模拟浏览器请求的headers headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", "Accept": "application/json, text/javascript, */*; q=0.01", "Referer": "https://coursesel.umji.sjtu.edu.cn/", - "X-Requested-With": "XMLHttpRequest", # 标识AJAX请求 + "X-Requested-With": "XMLHttpRequest", } self.session.headers.update(headers) + print("Crawler initialized successfully!") + def get_all_courses(self): """ - 获取所有课程数据的主入口方法 - - 该方法协调多个数据源,获取完整的课程信息: - 1. 选课任务数据(包含当前学期开设信息) - 2. 课程目录数据(包含课程描述等详细信息) - 3. 
先修课程数据(包含先修要求的逻辑关系) - + Get all course data from multiple APIs + Returns: - list: 包含所有课程信息的字典列表,每个字典包含: - - course_code: 课程代码 - - course_title: 课程标题 - - department: 院系代码 - - number: 课程编号 - - course_credits: 学分数 - - pre_requisites: 先修课程要求(字符串形式的逻辑表达式) - - description: 课程描述 - - instructors: 教师列表 - - url: 课程详情页面URL + list: Course data with prerequisites, descriptions, and instructors """ - # 获取三个主要数据源 courses_data = self._get_lesson_tasks() course_details = self._get_course_catalog() prerequisites = self._get_prerequisites() - # 整合所有数据源的信息 return self._integrate_course_data(courses_data, course_details, prerequisites) def _get_lesson_tasks(self): - """ - 获取选课任务数据(第一个数据源) - - 从选课系统的主API获取当前学期的所有选课任务信息。 - 这个API返回的数据包含: - - 课程基本信息(课程代码、名称、学分等) - - 教师信息 - - 开课时间和地点 - - 选课限制等信息 - - Returns: - list: 选课任务列表,每个任务包含课程的基本信息 - """ + """Get lesson task data from course selection API""" url = f"{BASE_URL}/tpm/findLessonTasksPreview_ElectTurn.action" - # API请求参数,控制返回的数据类型 json_params = { - "isToTheTime": True, # 是否到了选课时间 - "electTurnId": "93B7BAF9-7E8B-4D32-BCC8-DE49B320AB0A", # 选课轮次ID - "loadCourseGroup": True, # 加载课程组信息 - "loadElectTurn": True, # 加载选课轮次信息 - "loadCourseType": True, # 加载课程类型信息 - "loadCourseTypeCredit": True, # 加载课程类型学分信息 - "loadElectTurnResult": True, # 加载选课结果 - "loadStudentLessonTask": True, # 加载学生选课任务 - "loadPrerequisiteCourse": True, # 加载先修课程信息 - "loadLessonCalendarWeek": True, # 加载课程日历周信息 - "loadLessonCalendarConflict": True, # 加载时间冲突检查 - "loadTermCredit": True, # 加载学期学分信息 - "loadLessonTask": True, # 加载课程任务详情 - "loadDropApprove": True, # 加载退课审批信息 - "loadElectApprove": True, # 加载选课审批信息 + "isToTheTime": True, + "electTurnId": "1A5D7E45-4C23-4ED4-A3C2-90C45BE2E1E4", # Remember to update for new terms + "loadCourseGroup": True, + "loadElectTurn": True, + "loadCourseType": True, + "loadCourseTypeCredit": True, + "loadElectTurnResult": True, + "loadStudentLessonTask": True, + "loadPrerequisiteCourse": True, + "loadLessonCalendarWeek": True, + "loadLessonCalendarConflict": True, + "loadTermCredit": True, + "loadLessonTask": True, + "loadDropApprove": True, + "loadElectApprove": True, } - # 构建GET请求URL(该API要求JSON参数作为查询字符串) import urllib.parse + json_string = json.dumps(json_params, separators=(",", ":")) encoded_json = urllib.parse.quote(json_string) full_url = f"{url}?jsonString={encoded_json}" @@ -132,28 +100,14 @@ def _get_lesson_tasks(self): response.raise_for_status() data = response.json() - # 检查API响应格式并提取课程任务数据 if data.get("success") and "data" in data and "lessonTasks" in data["data"]: return data["data"]["lessonTasks"] return [] except Exception: - # 发生错误时返回空列表,不中断整个爬取流程 return [] def _get_course_catalog(self): - """ - 获取课程目录数据(第二个数据源) - - 从课程目录API获取课程的详细描述信息。 - 这个API主要提供: - - 课程的详细描述 - - 课程的英文名称 - - 额外的教师信息 - - 其他补充信息 - - Returns: - dict: 以courseId为键的课程详情字典 - """ + """Get course catalog data with detailed descriptions""" url = f"{BASE_URL}/jdji/tpm/findOwnCollegeCourse_JiCourse.action" try: @@ -161,39 +115,31 @@ def _get_course_catalog(self): response.raise_for_status() data = response.json() - if data.get("success") and "data" in data and "courses" in data["data"]: - # 将课程列表转换为courseId索引的字典以便快速查找 - return { - course.get("courseId"): course for course in data["data"]["courses"] - } + if data.get("success") and "data" in data: + # Handle both possible data structures + if isinstance(data["data"], list): + # Direct array of courses + courses = data["data"] + elif isinstance(data["data"], dict) and "courses" in data["data"]: + # Nested structure with courses key + courses = 
data["data"]["courses"] + else: + return {} + + return {course.get("courseId"): course for course in courses} return {} except Exception: - # 出错时返回空字典,不影响主流程 return {} def _get_prerequisites(self): - """ - 获取先修课程数据(第三个数据源) - - 从先修课程API获取所有课程的先修要求。 - 这个API返回的数据包含: - - 课程ID和对应的先修课程ID - - 先修课程的逻辑关系(AND、OR等) - - 先修要求的详细描述(prerequisiteRuleDesc) - - Returns: - dict: 以courseId为键的先修课程列表字典 - 每个值是一个列表,包含该课程的所有先修要求项 - """ + """Get prerequisite data with course requirements and logic""" url = f"{BASE_URL}/tpm/findAll_PrerequisiteCourse.action" try: - # 添加时间戳参数防止缓存 response = self.session.post(url, params={"_t": int(time.time() * 1000)}) response.raise_for_status() data = response.json() - # 调试信息:记录API响应状态 print(f"[DEBUG] Prerequisites API response: success={data.get('success')}") print( f"[DEBUG] Data keys: {list(data.keys()) if isinstance(data, dict) else 'Not dict'}" @@ -203,11 +149,9 @@ def _get_prerequisites(self): raw_prereqs = data["data"] print(f"[DEBUG] Raw prerequisites data: {len(raw_prereqs)} items") - # 显示第一个先修要求的结构作为调试信息 if raw_prereqs and len(raw_prereqs) > 0: print(f"[DEBUG] First prerequisite item: {raw_prereqs[0]}") - # 按courseId分组先修课程数据,这样可以快速查找每个课程的所有先修要求 prereqs = defaultdict(list) for item in raw_prereqs: course_id = item.get("courseId") @@ -226,33 +170,11 @@ def _get_prerequisites(self): return {} def _integrate_course_data(self, courses_data, course_details, prerequisites): - """ - 整合多个数据源的课程信息 - - 将三个不同API获取的数据进行整合,生成完整的课程信息: - 1. courses_data: 选课任务数据(主要数据源) - 2. course_details: 课程目录数据(补充描述等信息) - 3. prerequisites: 先修课程数据(先修要求) - - 数据整合策略: - - 以课程代码为主键进行分组 - - 合并同一课程的不同section信息 - - 优先使用更完整的数据字段 - - 为每个课程匹配对应的先修要求 - - Args: - courses_data (list): 选课任务数据列表 - course_details (dict): 课程详情字典,以courseId为键 - prerequisites (dict): 先修课程字典,以courseId为键 - - Returns: - list: 整合后的课程信息列表 - """ + """Integrate course data from multiple sources""" print( f"[DEBUG] Starting integration with {len(courses_data)} courses, {len(prerequisites)} prereq groups" ) - # 按课程代码分组:同一门课程可能有多个section courses_by_code = defaultdict(list) for course in courses_data: course_code = course.get("courseCode") @@ -262,26 +184,21 @@ def _integrate_course_data(self, courses_data, course_details, prerequisites): integrated_courses = [] courses_with_prereqs = 0 - # 遍历每个课程代码,整合数据 for course_code, course_list in courses_by_code.items(): - # 合并同课程的不同section(主要是合并教师信息) merged = self._merge_course_sections(course_list) if not merged: continue - # 获取课程ID并查找对应的详细信息和先修要求 course_id = merged.get("courseId") catalog_info = course_details.get(course_id, {}) prereq_info = prerequisites.get(course_id, []) - # 统计有先修要求的课程数量(用于调试) if prereq_info: courses_with_prereqs += 1 print( f"[DEBUG] Course {course_code} (ID: {course_id}) has {len(prereq_info)} prereqs" ) - # 构建最终的课程数据记录 course_data = self._build_course_record( course_code, merged, catalog_info, prereq_info ) @@ -295,75 +212,34 @@ def _integrate_course_data(self, courses_data, course_details, prerequisites): return integrated_courses def _merge_course_sections(self, course_list): - """ - 合并同一课程的不同section信息 - - 同一门课程可能有多个section(如不同时间的讲座、实验课等), - 需要将它们的信息合并,特别是教师信息。 - - Args: - course_list (list): 同一课程的section列表 - - Returns: - dict: 合并后的课程信息,包含所有section的教师信息 - """ + """Merge sections of the same course""" if not course_list: return {} - # 以第一个section为基础进行合并 merged = course_list[0].copy() all_instructors = set() - # 收集所有section的教师信息 for course in course_list: teachers = course.get("lessonTaskTeam", "") if teachers: - # 处理多种分隔符(中英文逗号、分号等) for teacher in re.split(r"[,;,;、]", teachers): if 
teacher.strip(): all_instructors.add(teacher.strip()) - # 将所有教师信息存储到合并后的课程数据中 merged["all_instructors"] = list(all_instructors) return merged def _build_course_record(self, course_code, main_data, catalog_data, prereq_data): - """ - 构建标准格式的课程记录 - - 将来自不同数据源的课程信息整合成统一的格式。 - 数据优先级:main_data > catalog_data(优先使用主数据源) - - Args: - course_code (str): 课程代码(如 "ECE2150J") - main_data (dict): 选课任务数据(主数据源) - catalog_data (dict): 课程目录数据(补充数据源) - prereq_data (list): 先修课程数据列表 - - Returns: - dict: 标准化的课程记录,如果数据无效则返回None - """ - # 1. 提取并验证课程标题 + """Build standardized course record""" course_title = self._extract_course_title(main_data, catalog_data) if not course_title: return None - # 2. 解析课程代码中的院系和编号信息 department, number = self._parse_course_code(course_code) - - # 3. 提取学分信息 course_credits = self._extract_course_credits(main_data, catalog_data) - - # 4. 构建先修课程字符串 prerequisites = self._build_prerequisites_string(course_code, prereq_data) - - # 5. 提取课程描述 description = self._extract_description(main_data, catalog_data) - - # 6. 整合教师信息 instructors = self._extract_instructors(main_data, catalog_data) - - # 7. 构建课程URL course_url = self._build_course_url(main_data) return { @@ -374,122 +250,107 @@ def _build_course_record(self, course_code, main_data, catalog_data, prereq_data "course_credits": course_credits, "pre_requisites": prerequisites, "description": description, - "course_topics": [], # 暂时为空,保持接口一致 + "course_topics": [], "instructors": instructors, "url": course_url, } - + def _extract_course_title(self, main_data, catalog_data): - """提取课程标题(优先使用英文名称)""" + """Extract course title (prefer English name)""" return ( main_data.get("courseNameEn", "") or main_data.get("courseName", "") or catalog_data.get("courseNameEn", "") or catalog_data.get("courseName", "") ).strip() - + def _parse_course_code(self, course_code): - """从课程代码中解析院系和编号""" + """Parse department and number from course code""" department = "" number = 0 - + if course_code: - # 提取院系代码(字母部分) dept_match = re.match(r"^([A-Z]+)", course_code) if dept_match: department = dept_match.group(1) - # 提取课程编号(数字部分) num_match = re.search(r"(\d+)", course_code) if num_match: number = int(num_match.group(1)) - + return department, number - + def _extract_course_credits(self, main_data, catalog_data): - """提取课程学分信息""" - course_credits = main_data.get("totalCredit", 0) or catalog_data.get("credit", 0) - + """Extract course credits""" + course_credits = main_data.get("totalCredit", 0) or catalog_data.get( + "credit", 0 + ) + if isinstance(course_credits, str): try: course_credits = int(float(course_credits)) except (ValueError, TypeError): course_credits = 0 - + return course_credits - + def _build_prerequisites_string(self, course_code, prereq_data): - """构建先修课程字符串""" + """Build prerequisites string from API data""" if not prereq_data: return "" - + print( f"[DEBUG] Building prerequisites for {course_code}, prereq_data has {len(prereq_data)} items" ) - + prereq_codes = [] for item in prereq_data: rule_desc = item.get("prerequisiteRuleDesc", "") print(f"[DEBUG] Processing prerequisite rule: {rule_desc}") if rule_desc: - # 使用完整的规则描述 prereq_codes.append(rule_desc) if prereq_codes: prerequisites = " || ".join(prereq_codes) print(f"[DEBUG] Final prerequisites for {course_code}: {prerequisites}") return prerequisites - + return "" - + def _extract_description(self, main_data, catalog_data): - """提取课程描述信息""" + """Extract course description""" return ( main_data.get("description", "") or catalog_data.get("description", "") or main_data.get("memo", "") or 
catalog_data.get("memo", "") ).strip() - + def _extract_instructors(self, main_data, catalog_data): - """整合教师信息""" + """Extract and merge instructor information""" instructors = main_data.get("all_instructors", []) teacher_name = catalog_data.get("teacherName", "") - + if teacher_name: - # 处理目录数据中的教师信息 for teacher in re.split(r"[,;,;、]", teacher_name): if teacher.strip() and teacher.strip() not in instructors: instructors.append(teacher.strip()) - + return instructors - + def _build_course_url(self, main_data): - """构建课程详情页面URL""" + """Build course detail page URL""" course_id = main_data.get("courseId") return f"{COURSE_DETAIL_URL_PREFIX}{course_id}" if course_id else "" -# ============================================================================ -# 向后兼容性函数 -# 这些函数保持与旧版本爬虫的接口兼容,确保现有代码不会中断 -# ============================================================================ - -# 全局爬虫实例(单例模式) +# Legacy compatibility functions _crawler = None def _get_crawler(): - """ - 获取爬虫实例(单例模式) - - 使用单例模式确保整个应用中只有一个爬虫实例, - 避免重复初始化和多余的网络连接。 - - Returns: - CourseSelCrawler: 爬虫实例 - """ + """Get crawler instance (singleton pattern)""" global _crawler if _crawler is None: _crawler = CourseSelCrawler() @@ -497,27 +358,15 @@ def _get_crawler(): def crawl_program_urls(): - """ - 获取所有课程URL(兼容性接口) - - 这是旧版本爬虫的主要接口,现在内部使用新的API爬虫。 - 为了保持向后兼容性,该函数仍然返回课程URL列表, - 但同时会缓存完整的课程数据供其他函数使用。 - - Returns: - list: 课程URL列表 - """ + """Get all course URLs (legacy interface)""" crawler = _get_crawler() courses = crawler.get_all_courses() - # 提取课程URL列表以保持接口兼容性 course_urls = [] for course in courses: if course.get("url"): course_urls.append(course["url"]) - # 将完整的课程数据缓存起来,供 _crawl_course_data 函数使用 - # 这样避免了重复的网络请求 if not hasattr(crawl_program_urls, "_course_data_cache"): crawl_program_urls._course_data_cache = {} @@ -529,67 +378,28 @@ def crawl_program_urls(): def _get_department_urls_from_url(_): - """ - 兼容性函数:从URL获取部门课程URL - - 注意:由于新的API架构,这个函数实际上直接调用主爬取函数。 - 参数被忽略,因为新的API不需要基于URL进行增量爬取。 - - Returns: - list: 课程URL列表 - """ + """Legacy function: get department course URLs""" return crawl_program_urls() def _is_department_url(candidate_url): - """ - 检查URL是否为有效的课程详情URL - - Args: - candidate_url (str): 候选URL - - Returns: - bool: 如果URL匹配课程详情页面格式则返回True - """ + """Check if URL is a valid course detail URL""" return candidate_url.startswith(COURSE_DETAIL_URL_PREFIX) def _crawl_course_data(course_url): - """ - 爬取单个课程数据(兼容性接口) - - 在新架构中,我们一次性获取所有课程数据并缓存, - 所以这个函数直接从缓存中返回数据,不进行实际的网络请求。 - - Args: - course_url (str): 课程详情页面URL - - Returns: - dict: 课程数据字典,如果未找到则返回空字典 - """ - # 从缓存中获取课程数据 + """Crawl single course data (legacy interface)""" if hasattr(crawl_program_urls, "_course_data_cache"): course_data = crawl_program_urls._course_data_cache.get(course_url) if course_data: return course_data - # 如果缓存中没有找到数据,返回空字典 return {} def import_department(department_data): - """ - 将课程数据导入数据库 - - 这个函数负责将爬取的课程数据保存到Django数据库中。 - 处理课程、教师和课程开设信息的创建和更新。 - - Args: - department_data (list): 课程数据列表,每个元素是课程信息字典 - """ + """Import course data to database""" for course_data in department_data: - # 使用 update_or_create 确保数据的幂等性 - # 如果课程已存在则更新,否则创建新记录 course, _ = Course.objects.update_or_create( course_code=course_data["course_code"], defaults={ @@ -601,49 +411,28 @@ def import_department(department_data): "description": course_data["description"], "course_topics": course_data["course_topics"], "url": course_data["url"], - # 注意:source字段在当前模型中不存在,已移除 }, ) - # 处理教师信息 if "instructors" in course_data and course_data["instructors"]: for instructor_name in 
course_data["instructors"]: - # 获取或创建教师记录 instructor, _ = Instructor.objects.get_or_create(name=instructor_name) - - # 为当前学期创建课程开设记录(如果不存在) + offering, _ = CourseOffering.objects.get_or_create( course=course, term=CURRENT_TERM, defaults={"section": 1, "period": ""}, ) - # 将教师关联到课程开设记录 offering.instructors.add(instructor) def extract_prerequisites(pre_requisites): - """ - 处理先修课程字符串格式(兼容性函数) - - 这个函数对先修课程字符串进行标准化处理, - 统一格式和术语,使其更适合在系统中使用。 - - Args: - pre_requisites (str): 原始先修课程字符串 - - Returns: - str: 处理后的先修课程字符串 - """ + """Process prerequisite string format (legacy function)""" result = pre_requisites - # 移除前缀标识 result = result.replace("Pre-requisites:", "").strip() - - # 标准化学分要求术语 result = result.replace("Obtained Credit", "obtained_credit").strip() result = result.replace("Credits Submitted", "credits_submitted").strip() - - # 标准化逻辑运算符格式(添加空格以提高可读性) result = result.replace("&&", " && ").strip() result = result.replace("||", " || ").strip() diff --git a/uv.lock b/uv.lock index cf9c4de..e845ede 100644 --- a/uv.lock +++ b/uv.lock @@ -1,6 +1,6 @@ version = 1 revision = 3 -requires-python = ">=3.12" +requires-python = ">=3.13" [[package]] name = "" @@ -195,19 +195,6 @@ version = "3.4.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/16/b0/572805e227f01586461c80e0fd25d65a2115599cc9dad142fee4b747c357/charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3", size = 123188, upload-time = "2024-12-24T18:12:35.43Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0a/9a/dd1e1cdceb841925b7798369a09279bd1cf183cef0f9ddf15a3a6502ee45/charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545", size = 196105, upload-time = "2024-12-24T18:10:38.83Z" }, - { url = "https://files.pythonhosted.org/packages/d3/8c/90bfabf8c4809ecb648f39794cf2a84ff2e7d2a6cf159fe68d9a26160467/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7", size = 140404, upload-time = "2024-12-24T18:10:44.272Z" }, - { url = "https://files.pythonhosted.org/packages/ad/8f/e410d57c721945ea3b4f1a04b74f70ce8fa800d393d72899f0a40526401f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757", size = 150423, upload-time = "2024-12-24T18:10:45.492Z" }, - { url = "https://files.pythonhosted.org/packages/f0/b8/e6825e25deb691ff98cf5c9072ee0605dc2acfca98af70c2d1b1bc75190d/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa", size = 143184, upload-time = "2024-12-24T18:10:47.898Z" }, - { url = "https://files.pythonhosted.org/packages/3e/a2/513f6cbe752421f16d969e32f3583762bfd583848b763913ddab8d9bfd4f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d", size = 145268, upload-time = "2024-12-24T18:10:50.589Z" }, - { url = "https://files.pythonhosted.org/packages/74/94/8a5277664f27c3c438546f3eb53b33f5b19568eb7424736bdc440a88a31f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616", size = 147601, upload-time = "2024-12-24T18:10:52.541Z" }, - { url = "https://files.pythonhosted.org/packages/7c/5f/6d352c51ee763623a98e31194823518e09bfa48be2a7e8383cf691bbb3d0/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b", size = 141098, upload-time = "2024-12-24T18:10:53.789Z" }, - { url = "https://files.pythonhosted.org/packages/78/d4/f5704cb629ba5ab16d1d3d741396aec6dc3ca2b67757c45b0599bb010478/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d", size = 149520, upload-time = "2024-12-24T18:10:55.048Z" }, - { url = "https://files.pythonhosted.org/packages/c5/96/64120b1d02b81785f222b976c0fb79a35875457fa9bb40827678e54d1bc8/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a", size = 152852, upload-time = "2024-12-24T18:10:57.647Z" }, - { url = "https://files.pythonhosted.org/packages/84/c9/98e3732278a99f47d487fd3468bc60b882920cef29d1fa6ca460a1fdf4e6/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9", size = 150488, upload-time = "2024-12-24T18:10:59.43Z" }, - { url = "https://files.pythonhosted.org/packages/13/0e/9c8d4cb99c98c1007cc11eda969ebfe837bbbd0acdb4736d228ccaabcd22/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1", size = 146192, upload-time = "2024-12-24T18:11:00.676Z" }, - { url = "https://files.pythonhosted.org/packages/b2/21/2b6b5b860781a0b49427309cb8670785aa543fb2178de875b87b9cc97746/charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35", size = 95550, upload-time = "2024-12-24T18:11:01.952Z" }, - { url = "https://files.pythonhosted.org/packages/21/5b/1b390b03b1d16c7e382b561c5329f83cc06623916aab983e8ab9239c7d5c/charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f", size = 102785, upload-time = "2024-12-24T18:11:03.142Z" }, { url = "https://files.pythonhosted.org/packages/38/94/ce8e6f63d18049672c76d07d119304e1e2d7c6098f0841b51c666e9f44a0/charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda", size = 195698, upload-time = "2024-12-24T18:11:05.834Z" }, { url = "https://files.pythonhosted.org/packages/24/2e/dfdd9770664aae179a96561cc6952ff08f9a8cd09a908f259a9dfa063568/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313", size = 140162, upload-time = "2024-12-24T18:11:07.064Z" }, { url = "https://files.pythonhosted.org/packages/24/4e/f646b9093cff8fc86f2d60af2de4dc17c759de9d554f130b140ea4738ca6/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9", size = 150263, upload-time = "2024-12-24T18:11:08.374Z" }, @@ -323,13 +310,6 @@ version = "0.1.10" source = { registry = "https://pypi.org/simple" } sdist = { url = 
"https://files.pythonhosted.org/packages/23/76/03fc9fb3441a13e9208bb6103ebb7200eba7647d040008b8303a1c03e152/cwcwidth-0.1.10.tar.gz", hash = "sha256:7468760f72c1f4107be1b2b2854bc000401ea36a69daed36fb966a1e19a7a124", size = 60265, upload-time = "2025-02-09T21:15:28.452Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/87/28/8e2ab81f0116bfcec22069e4c92fda9d05b0512605ccef00b62d93719ded/cwcwidth-0.1.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1d2b21ff2eb60c6793349b7fb161c40a8583a57ec32e61f47aab7938177bfdec", size = 23031, upload-time = "2025-02-09T21:14:59.01Z" }, - { url = "https://files.pythonhosted.org/packages/3a/a4/5adc535e2a714ecc926ea701e821a9abbe14f65cae4d615d20059b9b52a5/cwcwidth-0.1.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e0316488349c3e5ca4b20de7daa1cb8e96a05d1d14d040d46e87a495da655f4a", size = 101219, upload-time = "2025-02-09T21:15:00.079Z" }, - { url = "https://files.pythonhosted.org/packages/78/4c/18a5a06aa8db3cc28712ab957671e7718aedfc73403d84b0c2cb5cfcbc27/cwcwidth-0.1.10-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:848b6ffca1e32e28d2ccbb2cd395ccd3c38a7c4ec110728cd9d828eaf609b09e", size = 106565, upload-time = "2025-02-09T21:15:02.081Z" }, - { url = "https://files.pythonhosted.org/packages/06/40/801cba5ccb9551c862ad210eba22031e4655cd74711e32756b7ce24fc751/cwcwidth-0.1.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c3a7bfe1da478c0c27c549f68c6e28a583413da3ee451854ec2d983497bd18b8", size = 102244, upload-time = "2025-02-09T21:15:04.003Z" }, - { url = "https://files.pythonhosted.org/packages/e4/ed/60f61274fcfd0621a45e9403502e8f46968d562810a4424e5ff8d6bd50b0/cwcwidth-0.1.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cff03100f49170bc50fc399d05a31b8fcb7b0cef26df1a8068fa943387107f6c", size = 105634, upload-time = "2025-02-09T21:15:06.005Z" }, - { url = "https://files.pythonhosted.org/packages/b1/27/8179cecd688fef894dda601455d35066adfa3d58af4e97c5ab112893b5f6/cwcwidth-0.1.10-cp312-cp312-win32.whl", hash = "sha256:2dd9a92fdfbc53fc79f0953f39708dcf743fd27450c374985f419e3d47eb89d4", size = 23507, upload-time = "2025-02-09T21:15:07.968Z" }, - { url = "https://files.pythonhosted.org/packages/b2/b4/b7fe652a4d96f03ef051fff8313dfe827bc31578f7e67f1c98d5a5813f66/cwcwidth-0.1.10-cp312-cp312-win_amd64.whl", hash = "sha256:734d764281e3d87c40d0265543f00a653409145fa9f48a93bc0fbf9a8e7932ca", size = 26100, upload-time = "2025-02-09T21:15:09.186Z" }, { url = "https://files.pythonhosted.org/packages/af/f7/8c4cfe0b08053eea4da585ad5e12fef7cd11a0c9e4603ac8644c2a0b04b5/cwcwidth-0.1.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2391073280d774ab5d9af1d3aaa26ec456956d04daa1134fb71c31cd72ba5bba", size = 22344, upload-time = "2025-02-09T21:15:10.136Z" }, { url = "https://files.pythonhosted.org/packages/2a/48/176bbaf56520c5d6b72cbbe0d46821989eaa30df628daa5baecdd7f35458/cwcwidth-0.1.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6bfbdc2943631ec770ee781b35b8876fa7e283ff2273f944e2a9ae1f3df4ecdf", size = 94907, upload-time = "2025-02-09T21:15:11.178Z" }, { url = "https://files.pythonhosted.org/packages/bc/fc/4dfed13b316a67bf2419a63db53566e3e5e4d4fc5a94ef493d3334be3c1f/cwcwidth-0.1.10-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb0103c7db8d86e260e016ff89f8f00ef5eb75c481abc346bfaa756da9f976b4", size = 
100046, upload-time = "2025-02-09T21:15:12.279Z" }, @@ -495,15 +475,6 @@ version = "3.2.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/3f/74/907bb43af91782e0366b0960af62a8ce1f9398e4291cac7beaeffbee0c04/greenlet-3.2.1.tar.gz", hash = "sha256:9f4dd4b4946b14bb3bf038f81e1d2e535b7d94f1b2a59fdba1293cd9c1a0a4d7", size = 184475, upload-time = "2025-04-22T14:40:18.206Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f0/d1/e4777b188a04726f6cf69047830d37365b9191017f54caf2f7af336a6f18/greenlet-3.2.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:0ba2811509a30e5f943be048895a983a8daf0b9aa0ac0ead526dfb5d987d80ea", size = 270381, upload-time = "2025-04-22T14:25:43.69Z" }, - { url = "https://files.pythonhosted.org/packages/59/e7/b5b738f5679247ddfcf2179c38945519668dced60c3164c20d55c1a7bb4a/greenlet-3.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4245246e72352b150a1588d43ddc8ab5e306bef924c26571aafafa5d1aaae4e8", size = 637195, upload-time = "2025-04-22T14:53:44.563Z" }, - { url = "https://files.pythonhosted.org/packages/6c/9f/57968c88a5f6bc371364baf983a2e5549cca8f503bfef591b6dd81332cbc/greenlet-3.2.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7abc0545d8e880779f0c7ce665a1afc3f72f0ca0d5815e2b006cafc4c1cc5840", size = 651381, upload-time = "2025-04-22T14:54:59.439Z" }, - { url = "https://files.pythonhosted.org/packages/40/81/1533c9a458e9f2ebccb3ae22f1463b2093b0eb448a88aac36182f1c2cd3d/greenlet-3.2.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6dcc6d604a6575c6225ac0da39df9335cc0c6ac50725063fa90f104f3dbdb2c9", size = 646110, upload-time = "2025-04-22T15:04:35.739Z" }, - { url = "https://files.pythonhosted.org/packages/06/66/25f7e4b1468ebe4a520757f2e41c2a36a2f49a12e963431b82e9f98df2a0/greenlet-3.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2273586879affca2d1f414709bb1f61f0770adcabf9eda8ef48fd90b36f15d12", size = 648070, upload-time = "2025-04-22T14:27:05.976Z" }, - { url = "https://files.pythonhosted.org/packages/d7/4c/49d366565c4c4d29e6f666287b9e2f471a66c3a3d8d5066692e347f09e27/greenlet-3.2.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ff38c869ed30fff07f1452d9a204ece1ec6d3c0870e0ba6e478ce7c1515acf22", size = 603816, upload-time = "2025-04-22T14:25:57.224Z" }, - { url = "https://files.pythonhosted.org/packages/04/15/1612bb61506f44b6b8b6bebb6488702b1fe1432547e95dda57874303a1f5/greenlet-3.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e934591a7a4084fa10ee5ef50eb9d2ac8c4075d5c9cf91128116b5dca49d43b1", size = 1119572, upload-time = "2025-04-22T14:58:58.277Z" }, - { url = "https://files.pythonhosted.org/packages/cc/2f/002b99dacd1610e825876f5cbbe7f86740aa2a6b76816e5eca41c8457e85/greenlet-3.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:063bcf7f8ee28eb91e7f7a8148c65a43b73fbdc0064ab693e024b5a940070145", size = 1147442, upload-time = "2025-04-22T14:28:11.243Z" }, - { url = "https://files.pythonhosted.org/packages/c0/ba/82a2c3b9868644ee6011da742156247070f30e952f4d33f33857458450f2/greenlet-3.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7132e024ebeeeabbe661cf8878aac5d2e643975c4feae833142592ec2f03263d", size = 296207, upload-time = "2025-04-22T14:54:40.531Z" }, { url = "https://files.pythonhosted.org/packages/77/2a/581b3808afec55b2db838742527c40b4ce68b9b64feedff0fd0123f4b19a/greenlet-3.2.1-cp313-cp313-macosx_11_0_universal2.whl", hash = 
"sha256:e1967882f0c42eaf42282a87579685c8673c51153b845fde1ee81be720ae27ac", size = 269119, upload-time = "2025-04-22T14:25:01.798Z" }, { url = "https://files.pythonhosted.org/packages/b0/f3/1c4e27fbdc84e13f05afc2baf605e704668ffa26e73a43eca93e1120813e/greenlet-3.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e77ae69032a95640a5fe8c857ec7bee569a0997e809570f4c92048691ce4b437", size = 637314, upload-time = "2025-04-22T14:53:46.214Z" }, { url = "https://files.pythonhosted.org/packages/fc/1a/9fc43cb0044f425f7252da9847893b6de4e3b20c0a748bce7ab3f063d5bc/greenlet-3.2.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3227c6ec1149d4520bc99edac3b9bc8358d0034825f3ca7572165cb502d8f29a", size = 651421, upload-time = "2025-04-22T14:55:00.852Z" }, @@ -640,18 +611,6 @@ version = "2.9.10" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/cb/0e/bdc8274dc0585090b4e3432267d7be4dfbfd8971c0fa59167c711105a6bf/psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2", size = 385764, upload-time = "2024-10-16T11:24:58.126Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/49/7d/465cc9795cf76f6d329efdafca74693714556ea3891813701ac1fee87545/psycopg2_binary-2.9.10-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:880845dfe1f85d9d5f7c412efea7a08946a46894537e4e5d091732eb1d34d9a0", size = 3044771, upload-time = "2024-10-16T11:20:35.234Z" }, - { url = "https://files.pythonhosted.org/packages/8b/31/6d225b7b641a1a2148e3ed65e1aa74fc86ba3fee850545e27be9e1de893d/psycopg2_binary-2.9.10-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9440fa522a79356aaa482aa4ba500b65f28e5d0e63b801abf6aa152a29bd842a", size = 3275336, upload-time = "2024-10-16T11:20:38.742Z" }, - { url = "https://files.pythonhosted.org/packages/30/b7/a68c2b4bff1cbb1728e3ec864b2d92327c77ad52edcd27922535a8366f68/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3923c1d9870c49a2d44f795df0c889a22380d36ef92440ff618ec315757e539", size = 2851637, upload-time = "2024-10-16T11:20:42.145Z" }, - { url = "https://files.pythonhosted.org/packages/0b/b1/cfedc0e0e6f9ad61f8657fd173b2f831ce261c02a08c0b09c652b127d813/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b2c956c028ea5de47ff3a8d6b3cc3330ab45cf0b7c3da35a2d6ff8420896526", size = 3082097, upload-time = "2024-10-16T11:20:46.185Z" }, - { url = "https://files.pythonhosted.org/packages/18/ed/0a8e4153c9b769f59c02fb5e7914f20f0b2483a19dae7bf2db54b743d0d0/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f758ed67cab30b9a8d2833609513ce4d3bd027641673d4ebc9c067e4d208eec1", size = 3264776, upload-time = "2024-10-16T11:20:50.879Z" }, - { url = "https://files.pythonhosted.org/packages/10/db/d09da68c6a0cdab41566b74e0a6068a425f077169bed0946559b7348ebe9/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cd9b4f2cfab88ed4a9106192de509464b75a906462fb846b936eabe45c2063e", size = 3020968, upload-time = "2024-10-16T11:20:56.819Z" }, - { url = "https://files.pythonhosted.org/packages/94/28/4d6f8c255f0dfffb410db2b3f9ac5218d959a66c715c34cac31081e19b95/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dc08420625b5a20b53551c50deae6e231e6371194fa0651dbe0fb206452ae1f", size = 2872334, upload-time = "2024-10-16T11:21:02.411Z" }, - { url = 
"https://files.pythonhosted.org/packages/05/f7/20d7bf796593c4fea95e12119d6cc384ff1f6141a24fbb7df5a668d29d29/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d7cd730dfa7c36dbe8724426bf5612798734bff2d3c3857f36f2733f5bfc7c00", size = 2822722, upload-time = "2024-10-16T11:21:09.01Z" }, - { url = "https://files.pythonhosted.org/packages/4d/e4/0c407ae919ef626dbdb32835a03b6737013c3cc7240169843965cada2bdf/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:155e69561d54d02b3c3209545fb08938e27889ff5a10c19de8d23eb5a41be8a5", size = 2920132, upload-time = "2024-10-16T11:21:16.339Z" }, - { url = "https://files.pythonhosted.org/packages/2d/70/aa69c9f69cf09a01da224909ff6ce8b68faeef476f00f7ec377e8f03be70/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3cc28a6fd5a4a26224007712e79b81dbaee2ffb90ff406256158ec4d7b52b47", size = 2959312, upload-time = "2024-10-16T11:21:25.584Z" }, - { url = "https://files.pythonhosted.org/packages/d3/bd/213e59854fafe87ba47814bf413ace0dcee33a89c8c8c814faca6bc7cf3c/psycopg2_binary-2.9.10-cp312-cp312-win32.whl", hash = "sha256:ec8a77f521a17506a24a5f626cb2aee7850f9b69a0afe704586f63a464f3cd64", size = 1025191, upload-time = "2024-10-16T11:21:29.912Z" }, - { url = "https://files.pythonhosted.org/packages/92/29/06261ea000e2dc1e22907dbbc483a1093665509ea586b29b8986a0e56733/psycopg2_binary-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:18c5ee682b9c6dd3696dad6e54cc7ff3a1a9020df6a5c0f861ef8bfd338c3ca0", size = 1164031, upload-time = "2024-10-16T11:21:34.211Z" }, { url = "https://files.pythonhosted.org/packages/3e/30/d41d3ba765609c0763505d565c4d12d8f3c79793f0d0f044ff5a28bf395b/psycopg2_binary-2.9.10-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:26540d4a9a4e2b096f1ff9cce51253d0504dca5a85872c7f7be23be5a53eb18d", size = 3044699, upload-time = "2024-10-16T11:21:42.841Z" }, { url = "https://files.pythonhosted.org/packages/35/44/257ddadec7ef04536ba71af6bc6a75ec05c5343004a7ec93006bee66c0bc/psycopg2_binary-2.9.10-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e217ce4d37667df0bc1c397fdcd8de5e81018ef305aed9415c3b093faaeb10fb", size = 3275245, upload-time = "2024-10-16T11:21:51.989Z" }, { url = "https://files.pythonhosted.org/packages/1b/11/48ea1cd11de67f9efd7262085588790a95d9dfcd9b8a687d46caf7305c1a/psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:245159e7ab20a71d989da00f280ca57da7641fa2cdcf71749c193cea540a74f7", size = 2851631, upload-time = "2024-10-16T11:21:57.584Z" }, @@ -746,15 +705,6 @@ version = "6.0.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload-time = "2024-08-06T20:33:50.674Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873, upload-time = "2024-08-06T20:32:25.131Z" }, - { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302, upload-time = "2024-08-06T20:32:26.511Z" }, 
- { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154, upload-time = "2024-08-06T20:32:28.363Z" }, - { url = "https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223, upload-time = "2024-08-06T20:32:30.058Z" }, - { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542, upload-time = "2024-08-06T20:32:31.881Z" }, - { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164, upload-time = "2024-08-06T20:32:37.083Z" }, - { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611, upload-time = "2024-08-06T20:32:38.898Z" }, - { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591, upload-time = "2024-08-06T20:32:40.241Z" }, - { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338, upload-time = "2024-08-06T20:32:41.93Z" }, { url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309, upload-time = "2024-08-06T20:32:43.4Z" }, { url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679, upload-time = "2024-08-06T20:32:44.801Z" }, { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428, upload-time = "2024-08-06T20:32:46.432Z" }, From 50d6dc09c2ee73d6aa46303ca3f19fd853fb1d9e Mon Sep 17 00:00:00 2001 From: LuckySugar0716 Date: Wed, 3 Sep 2025 20:23:05 +0800 Subject: [PATCH 03/20] fix: automatic params acquisition --- apps/spider/crawlers/orc.py | 78 ++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index 
4abb3b4..0acafd3 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -1,5 +1,6 @@ import re from urllib.parse import urljoin +import logging import json import requests @@ -19,6 +20,9 @@ INSTRUCTOR_TERM_REGEX = re.compile(r"^(?P\w*)\s?(\((?P\w*)\))?") +# Set up logger +logger = logging.getLogger(__name__) + class CourseSelCrawler: """ @@ -52,7 +56,7 @@ def __init__(self): } self.session.headers.update(headers) - print("Crawler initialized successfully!") + logger.info("Crawler initialized successfully!") def get_all_courses(self): """ @@ -67,13 +71,39 @@ def get_all_courses(self): return self._integrate_course_data(courses_data, course_details, prerequisites) + def _get_current_elect_turn_id(self): + """Get current election turn ID dynamically""" + url = f"{BASE_URL}/tpm/findStudentElectTurns_ElectTurn.action" + + try: + response = self.session.get(url, params={"_t": int(time.time() * 1000)}) + response.raise_for_status() + data = response.json() + + if data and isinstance(data, list) and len(data) > 0: + # Get the first (current) election turn + current_turn = data[0] + elect_turn_id = current_turn.get("electTurnId") + if elect_turn_id: + logger.debug(f"Found current electTurnId: {elect_turn_id}") + return elect_turn_id + + logger.warning("Could not find current electTurnId, using fallback") + return "1A5D7E45-4C23-4ED4-A3C2-90C45BE2E1E4" # Fallback + except Exception as e: + logger.error(f"Error getting electTurnId: {e}, using fallback") + return "1A5D7E45-4C23-4ED4-A3C2-90C45BE2E1E4" # Fallback + def _get_lesson_tasks(self): """Get lesson task data from course selection API""" url = f"{BASE_URL}/tpm/findLessonTasksPreview_ElectTurn.action" + # Get current election turn ID dynamically + elect_turn_id = self._get_current_elect_turn_id() + json_params = { "isToTheTime": True, - "electTurnId": "1A5D7E45-4C23-4ED4-A3C2-90C45BE2E1E4", # Remember to update for new terms + "electTurnId": elect_turn_id, "loadCourseGroup": True, "loadElectTurn": True, "loadCourseType": True, @@ -116,16 +146,10 @@ def _get_course_catalog(self): data = response.json() if data.get("success") and "data" in data: - # Handle both possible data structures if isinstance(data["data"], list): - # Direct array of courses courses = data["data"] - elif isinstance(data["data"], dict) and "courses" in data["data"]: - # Nested structure with courses key - courses = data["data"]["courses"] else: return {} - return {course.get("courseId"): course for course in courses} return {} except Exception: @@ -140,17 +164,17 @@ def _get_prerequisites(self): response.raise_for_status() data = response.json() - print(f"[DEBUG] Prerequisites API response: success={data.get('success')}") - print( - f"[DEBUG] Data keys: {list(data.keys()) if isinstance(data, dict) else 'Not dict'}" + logger.debug(f"Prerequisites API response: success={data.get('success')}") + logger.debug( + f"Data keys: {list(data.keys()) if isinstance(data, dict) else 'Not dict'}" ) if data.get("success") and "data" in data: raw_prereqs = data["data"] - print(f"[DEBUG] Raw prerequisites data: {len(raw_prereqs)} items") + logger.debug(f"Raw prerequisites data: {len(raw_prereqs)} items") if raw_prereqs and len(raw_prereqs) > 0: - print(f"[DEBUG] First prerequisite item: {raw_prereqs[0]}") + logger.debug(f"First prerequisite item: {raw_prereqs[0]}") prereqs = defaultdict(list) for item in raw_prereqs: @@ -158,21 +182,21 @@ def _get_prerequisites(self): if course_id: prereqs[course_id].append(item) - print( - f"[DEBUG] Grouped prerequisites: 
{len(prereqs)} course IDs have prereqs" + logger.debug( + f"Grouped prerequisites: {len(prereqs)} course IDs have prereqs" ) return prereqs else: - print("[DEBUG] Prerequisites API failed or no data") + logger.warning("Prerequisites API failed or no data") return {} except Exception as e: - print(f"[DEBUG] Prerequisites API error: {str(e)}") + logger.error(f"Prerequisites API error: {str(e)}") return {} def _integrate_course_data(self, courses_data, course_details, prerequisites): """Integrate course data from multiple sources""" - print( - f"[DEBUG] Starting integration with {len(courses_data)} courses, {len(prerequisites)} prereq groups" + logger.info( + f"Starting integration with {len(courses_data)} courses, {len(prerequisites)} prereq groups" ) courses_by_code = defaultdict(list) @@ -195,8 +219,8 @@ def _integrate_course_data(self, courses_data, course_details, prerequisites): if prereq_info: courses_with_prereqs += 1 - print( - f"[DEBUG] Course {course_code} (ID: {course_id}) has {len(prereq_info)} prereqs" + logger.debug( + f"Course {course_code} (ID: {course_id}) has {len(prereq_info)} prereqs" ) course_data = self._build_course_record( @@ -206,8 +230,8 @@ def _integrate_course_data(self, courses_data, course_details, prerequisites): if course_data: integrated_courses.append(course_data) - print( - f"[DEBUG] Integration complete: {courses_with_prereqs} courses have prerequisites" + logger.info( + f"Integration complete: {courses_with_prereqs} courses have prerequisites" ) return integrated_courses @@ -299,21 +323,21 @@ def _build_prerequisites_string(self, course_code, prereq_data): if not prereq_data: return "" - print( - f"[DEBUG] Building prerequisites for {course_code}, prereq_data has {len(prereq_data)} items" + logger.debug( + f"Building prerequisites for {course_code}, prereq_data has {len(prereq_data)} items" ) prereq_codes = [] for item in prereq_data: rule_desc = item.get("prerequisiteRuleDesc", "") - print(f"[DEBUG] Processing prerequisite rule: {rule_desc}") + logger.debug(f"Processing prerequisite rule: {rule_desc}") if rule_desc: prereq_codes.append(rule_desc) if prereq_codes: prerequisites = " || ".join(prereq_codes) - print(f"[DEBUG] Final prerequisites for {course_code}: {prerequisites}") + logger.debug(f"Final prerequisites for {course_code}: {prerequisites}") return prerequisites return "" From c6c5902fe0dafffef15737758f71e2c23ac91ecb Mon Sep 17 00:00:00 2001 From: LuckySugar0716 Date: Wed, 3 Sep 2025 20:39:40 +0800 Subject: [PATCH 04/20] fix: explicit regex --- apps/spider/crawlers/orc.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index 0acafd3..b65106b 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -289,18 +289,15 @@ def _extract_course_title(self, main_data, catalog_data): ).strip() def _parse_course_code(self, course_code): - """Parse department and number from course code""" department = "" number = 0 if course_code: - dept_match = re.match(r"^([A-Z]+)", course_code) - if dept_match: - department = dept_match.group(1) - - num_match = re.search(r"(\d+)", course_code) - if num_match: - number = int(num_match.group(1)) + # Match DEPT####J? 
(J is optional) + match = re.match(r"^([A-Z]{2,4})(\d{4})J?$", course_code) + if match: + department = match.group(1) + number = int(match.group(2)) return department, number From adfa159c97760c46198a80153129ac7aeea5944b Mon Sep 17 00:00:00 2001 From: LuckySugar0716 Date: Thu, 4 Sep 2025 03:33:19 +0800 Subject: [PATCH 05/20] fix: code quality --- apps/spider/crawlers/orc.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index b65106b..e8e6749 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -1,5 +1,5 @@ import re -from urllib.parse import urljoin +import urllib.parse import logging import json @@ -12,14 +12,12 @@ # API endpoints BASE_URL = "https://coursesel.umji.sjtu.edu.cn" -COURSE_DETAIL_URL_PREFIX = urljoin(BASE_URL, "/course/") +COURSE_DETAIL_URL_PREFIX = urllib.parse.urljoin(BASE_URL, "/course/") # Legacy compatibility ORC_BASE_URL = BASE_URL UNDERGRAD_URL = BASE_URL -INSTRUCTOR_TERM_REGEX = re.compile(r"^(?P\w*)\s?(\((?P\w*)\))?") - # Set up logger logger = logging.getLogger(__name__) @@ -34,28 +32,39 @@ class CourseSelCrawler: 3. Prerequisites API: prerequisite rules """ - def __init__(self): + def __init__(self, jsessionid=None): """Initialize crawler with session and authentication""" self.session = requests.Session() + self.jsessionid = jsessionid + self._initialized = False + + logger.info("Crawler created (not yet initialized)") - print("Please enter your JSESSIONID cookie:") - print("(Found in browser dev tools under Network or Application tabs)") - jsessionid = input("JSESSIONID: ").strip() + def _ensure_initialized(self): + """Ensure crawler is properly initialized with authentication""" + if self._initialized: + return - if not jsessionid: + if not self.jsessionid: + print("Please enter your JSESSIONID cookie:") + print("(Found in browser dev tools under Network or Application tabs)") + self.jsessionid = input("JSESSIONID: ").strip() + + if not self.jsessionid: raise ValueError("JSESSIONID cannot be empty") - cookies = {"JSESSIONID": jsessionid} + cookies = {"JSESSIONID": self.jsessionid} self.session.cookies.update(cookies) headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", "Accept": "application/json, text/javascript, */*; q=0.01", - "Referer": "https://coursesel.umji.sjtu.edu.cn/", + "Referer": BASE_URL, "X-Requested-With": "XMLHttpRequest", } self.session.headers.update(headers) + self._initialized = True logger.info("Crawler initialized successfully!") def get_all_courses(self): @@ -119,8 +128,6 @@ def _get_lesson_tasks(self): "loadElectApprove": True, } - import urllib.parse - json_string = json.dumps(json_params, separators=(",", ":")) encoded_json = urllib.parse.quote(json_string) full_url = f"{url}?jsonString={encoded_json}" From 70372ac8b52e425ea212d39cca13325c892473df Mon Sep 17 00:00:00 2001 From: LuckySugar0716 Date: Thu, 4 Sep 2025 03:51:11 +0800 Subject: [PATCH 06/20] fix: CN-EN trans on prerequisites --- apps/spider/crawlers/orc.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index e8e6749..651b0a7 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -74,6 +74,8 @@ def get_all_courses(self): Returns: list: Course data with prerequisites, descriptions, and instructors """ + self._ensure_initialized() # Make sure crawler is initialized + 
courses_data = self._get_lesson_tasks() course_details = self._get_course_catalog() prerequisites = self._get_prerequisites() @@ -97,11 +99,13 @@ def _get_current_elect_turn_id(self): logger.debug(f"Found current electTurnId: {elect_turn_id}") return elect_turn_id - logger.warning("Could not find current electTurnId, using fallback") - return "1A5D7E45-4C23-4ED4-A3C2-90C45BE2E1E4" # Fallback + logger.error("Could not find current electTurnId in API response") + raise ValueError( + "Unable to get current electTurnId - API returned no valid election turns" + ) except Exception as e: - logger.error(f"Error getting electTurnId: {e}, using fallback") - return "1A5D7E45-4C23-4ED4-A3C2-90C45BE2E1E4" # Fallback + logger.error(f"Error getting electTurnId: {e}") + raise RuntimeError(f"Failed to retrieve electTurnId from API: {e}") from e def _get_lesson_tasks(self): """Get lesson task data from course selection API""" @@ -341,11 +345,34 @@ def _build_prerequisites_string(self, course_code, prereq_data): if prereq_codes: prerequisites = " || ".join(prereq_codes) + # Convert Chinese terms to English + prerequisites = self._normalize_prerequisites_to_english(prerequisites) logger.debug(f"Final prerequisites for {course_code}: {prerequisites}") return prerequisites return "" + def _normalize_prerequisites_to_english(self, prerequisites_text): + """Convert Chinese prerequisite terms to English""" + if not prerequisites_text: + return "" + + # Define translation mapping + translations = { + "已获学分": "Obtained Credit", + "已提交学分": "Credits Submitted", + "获得学分": "Obtained Credit", + "提交学分": "Credits Submitted", + "学分": "Credit", + } + + # Apply translations + normalized = prerequisites_text + for chinese, english in translations.items(): + normalized = normalized.replace(chinese, english) + + return normalized + def _extract_description(self, main_data, catalog_data): """Extract course description""" return ( From 2b93de4f334a333938463da19cdfe043638bb648 Mon Sep 17 00:00:00 2001 From: LuckySugar0716 Date: Thu, 4 Sep 2025 04:01:41 +0800 Subject: [PATCH 07/20] fix: remove unnecessary code --- apps/spider/crawlers/orc.py | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index 651b0a7..c118c07 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -10,17 +10,13 @@ from apps.web.models import Course, CourseOffering, Instructor from lib.constants import CURRENT_TERM +# Set up logger +logger = logging.getLogger(__name__) + # API endpoints BASE_URL = "https://coursesel.umji.sjtu.edu.cn" COURSE_DETAIL_URL_PREFIX = urllib.parse.urljoin(BASE_URL, "/course/") -# Legacy compatibility -ORC_BASE_URL = BASE_URL -UNDERGRAD_URL = BASE_URL - -# Set up logger -logger = logging.getLogger(__name__) - class CourseSelCrawler: """ @@ -400,7 +396,6 @@ def _build_course_url(self, main_data): return f"{COURSE_DETAIL_URL_PREFIX}{course_id}" if course_id else "" -# Legacy compatibility functions _crawler = None @@ -432,16 +427,6 @@ def crawl_program_urls(): return course_urls -def _get_department_urls_from_url(_): - """Legacy function: get department course URLs""" - return crawl_program_urls() - - -def _is_department_url(candidate_url): - """Check if URL is a valid course detail URL""" - return candidate_url.startswith(COURSE_DETAIL_URL_PREFIX) - - def _crawl_course_data(course_url): """Crawl single course data (legacy interface)""" if hasattr(crawl_program_urls, "_course_data_cache"): From 
71dc27a73d1b3c787745ecc746590b07d6d9c5e0 Mon Sep 17 00:00:00 2001 From: LuckySugar0716 Date: Wed, 17 Sep 2025 19:12:16 +0800 Subject: [PATCH 08/20] feat: combine official website and coursel --- apps/spider/crawlers/orc.py | 211 ++++++++++++++++++++++++++++++------ 1 file changed, 179 insertions(+), 32 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index c118c07..86d085b 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -6,17 +6,29 @@ import requests import time from collections import defaultdict +from urllib.parse import urljoin +from apps.spider.utils import retrieve_soup from apps.web.models import Course, CourseOffering, Instructor from lib.constants import CURRENT_TERM # Set up logger logger = logging.getLogger(__name__) -# API endpoints +# API endpoints for course selection system BASE_URL = "https://coursesel.umji.sjtu.edu.cn" COURSE_DETAIL_URL_PREFIX = urllib.parse.urljoin(BASE_URL, "/course/") +# Official website endpoints for detailed course info +OFFICIAL_BASE_URL = "https://www.ji.sjtu.edu.cn/" +OFFICIAL_ORC_BASE_URL = urljoin( + OFFICIAL_BASE_URL, "/academics/courses/courses-by-number/" +) +OFFICIAL_COURSE_DETAIL_URL_PREFIX = ( + "https://www.ji.sjtu.edu.cn/academics/courses/courses-by-number/course-info/?id=" +) +OFFICIAL_UNDERGRAD_URL = OFFICIAL_ORC_BASE_URL + class CourseSelCrawler: """ @@ -65,18 +77,24 @@ def _ensure_initialized(self): def get_all_courses(self): """ - Get all course data from multiple APIs + Get all course data from multiple APIs and official website Returns: list: Course data with prerequisites, descriptions, and instructors """ self._ensure_initialized() # Make sure crawler is initialized + # Get data from course selection APIs courses_data = self._get_lesson_tasks() course_details = self._get_course_catalog() prerequisites = self._get_prerequisites() - return self._integrate_course_data(courses_data, course_details, prerequisites) + # Get official website data for enhanced descriptions + official_data = self._get_official_website_data() + + return self._integrate_course_data( + courses_data, course_details, prerequisites, official_data + ) def _get_current_elect_turn_id(self): """Get current election turn ID dynamically""" @@ -200,10 +218,116 @@ def _get_prerequisites(self): logger.error(f"Prerequisites API error: {str(e)}") return {} - def _integrate_course_data(self, courses_data, course_details, prerequisites): + def _get_official_website_data(self): + """Get course data from official JI website for enhanced descriptions""" + logger.info("Fetching course data from official website") + + try: + # Get all course URLs from official website + official_urls = self._get_official_course_urls() + official_data = {} + + for url in official_urls: + course_data = self._crawl_official_course_data(url) + if course_data and course_data.get("course_code"): + official_data[course_data["course_code"]] = course_data + + logger.info(f"Fetched official data for {len(official_data)} courses") + return official_data + + except Exception as e: + logger.error(f"Error fetching official website data: {str(e)}") + return {} + + def _get_official_course_urls(self): + """Get all course URLs from official website""" + try: + soup = retrieve_soup(OFFICIAL_UNDERGRAD_URL) + linked_urls = [ + urljoin(OFFICIAL_BASE_URL, a["href"]) + for a in soup.find_all("a", href=True) + ] + return set( + linked_url + for linked_url in linked_urls + if self._is_official_course_url(linked_url) + ) + except Exception as e: + 
logger.error(f"Error getting official course URLs: {str(e)}") + return set() + + def _is_official_course_url(self, candidate_url): + """Check if URL is a valid official course detail URL""" + return candidate_url.startswith(OFFICIAL_COURSE_DETAIL_URL_PREFIX) + + def _crawl_official_course_data(self, course_url): + """Crawl single course data from official website""" + try: + soup = retrieve_soup(course_url) + course_heading = soup.find("h2") + if not course_heading: + return None + + course_heading_text = course_heading.get_text() + if not course_heading_text: + return None + + split_course_heading = course_heading_text.split(" – ") + if len(split_course_heading) < 2: + return None + + children = list(soup.find_all(class_="et_pb_text_inner")[3].children) + + course_code = split_course_heading[0] + course_title = split_course_heading[1] + + description = "" + course_topics = [] + official_url = course_url + + for i, child in enumerate(children): + text = child.get_text(strip=True) if hasattr(child, "get_text") else "" + if "Description:" in text: + description = ( + children[i + 2].get_text(strip=True) + if i + 2 < len(children) + else "" + ) + if description == "\n" or "Course Topics" in description: + description = "" + elif "Course Topics:" in text: + course_topics = ( + [ + li.get_text(strip=True) + for li in children[i + 2].find_all("li") + ] + if i + 2 < len(children) + else [] + ) + + return { + "course_code": course_code, + "course_title": course_title, + "description": description, + "course_topics": course_topics, + "official_url": official_url, + } + + except Exception as e: + logger.error( + f"Error crawling official course data from {course_url}: {str(e)}" + ) + return None + + def _integrate_course_data( + self, courses_data, course_details, prerequisites, official_data=None + ): """Integrate course data from multiple sources""" + if official_data is None: + official_data = {} + logger.info( - f"Starting integration with {len(courses_data)} courses, {len(prerequisites)} prereq groups" + f"Starting integration with {len(courses_data)} courses, {len(prerequisites)} prereq groups, {len(official_data)} official records" ) courses_by_code = defaultdict(list) @@ -223,6 +347,7 @@ def _integrate_course_data(self, courses_data, course_details, prerequisites): course_id = merged.get("courseId") catalog_info = course_details.get(course_id, {}) prereq_info = prerequisites.get(course_id, []) + official_info = official_data.get(course_code, {}) if prereq_info: courses_with_prereqs += 1 @@ -231,7 +356,7 @@ def _integrate_course_data(self, courses_data, course_details, prerequisites): ) course_data = self._build_course_record( - course_code, merged, catalog_info, prereq_info + course_code, merged, catalog_info, prereq_info, official_info ) if course_data: @@ -260,18 +385,30 @@ def _merge_course_sections(self, course_list): merged["all_instructors"] = list(all_instructors) return merged - def _build_course_record(self, course_code, main_data, catalog_data, prereq_data): - """Build standardized course record""" - course_title = self._extract_course_title(main_data, catalog_data) + def _build_course_record( + self, course_code, main_data, catalog_data, prereq_data, official_data=None + ): + """Build standardized course record with official website data integration""" + if official_data is None: + official_data = {} + + course_title = self._extract_course_title( + main_data, catalog_data, official_data + ) if not course_title: return None department, number = self._parse_course_code(course_code) 
course_credits = self._extract_course_credits(main_data, catalog_data) prerequisites = self._build_prerequisites_string(course_code, prereq_data) - description = self._extract_description(main_data, catalog_data) + description = self._extract_description(official_data) instructors = self._extract_instructors(main_data, catalog_data) - course_url = self._build_course_url(main_data) + # Get course topics and official URL from official website data + course_topics = official_data.get("course_topics", []) + official_url = official_data.get("official_url", "") + + # Use official URL as primary URL, fallback to API URL if not available + course_url = official_url or self._build_course_url(main_data) return { "course_code": course_code, @@ -281,15 +418,20 @@ def _build_course_record(self, course_code, main_data, catalog_data, prereq_data "course_credits": course_credits, "pre_requisites": prerequisites, "description": description, - "course_topics": [], + "course_topics": course_topics, "instructors": instructors, "url": course_url, + "official_url": official_url, } - def _extract_course_title(self, main_data, catalog_data): + def _extract_course_title(self, main_data, catalog_data, official_data=None): """Extract course title (prefer English name)""" + if official_data is None: + official_data = {} + return ( - main_data.get("courseNameEn", "") + official_data.get("course_title", "") + or main_data.get("courseNameEn", "") or main_data.get("courseName", "") or catalog_data.get("courseNameEn", "") or catalog_data.get("courseName", "") @@ -369,14 +511,12 @@ def _normalize_prerequisites_to_english(self, prerequisites_text): return normalized - def _extract_description(self, main_data, catalog_data): - """Extract course description""" - return ( - main_data.get("description", "") - or catalog_data.get("description", "") - or main_data.get("memo", "") - or catalog_data.get("memo", "") - ).strip() + def _extract_description(self, official_data=None): + """Extract course description (only from official website)""" + if official_data is None: + official_data = {} + + return official_data.get("description", "").strip() def _extract_instructors(self, main_data, catalog_data): """Extract and merge instructor information""" @@ -440,18 +580,25 @@ def _crawl_course_data(course_url): def import_department(department_data): """Import course data to database""" for course_data in department_data: + # Prepare defaults dict with all available fields + defaults = { + "course_title": course_data["course_title"], + "department": course_data["department"], + "number": course_data["number"], + "course_credits": course_data["course_credits"], + "pre_requisites": course_data["pre_requisites"], + "description": course_data["description"], + "course_topics": course_data["course_topics"], + "url": course_data["url"], + } + + # Add official_url if available + if "official_url" in course_data: + defaults["official_url"] = course_data["official_url"] + course, _ = Course.objects.update_or_create( course_code=course_data["course_code"], - defaults={ - "course_title": course_data["course_title"], - "department": course_data["department"], - "number": course_data["number"], - "course_credits": course_data["course_credits"], - "pre_requisites": course_data["pre_requisites"], - "description": course_data["description"], - "course_topics": course_data["course_topics"], - "url": course_data["url"], - }, + defaults=defaults, ) if "instructors" in course_data and course_data["instructors"]: From deee9f3abbf56a2bf8fc2b559c9a137734bb75c6 
Mon Sep 17 00:00:00 2001 From: LuckySugar0716 Date: Wed, 17 Sep 2025 19:39:11 +0800 Subject: [PATCH 09/20] fix: provide options for different website --- apps/spider/crawlers/orc.py | 141 +++++++++++++++++++++++++++--------- 1 file changed, 106 insertions(+), 35 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index 86d085b..0056dd3 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -82,20 +82,53 @@ def get_all_courses(self): Returns: list: Course data with prerequisites, descriptions, and instructors """ - self._ensure_initialized() # Make sure crawler is initialized - - # Get data from course selection APIs - courses_data = self._get_lesson_tasks() - course_details = self._get_course_catalog() - prerequisites = self._get_prerequisites() + # Ask user which data sources to use + use_coursesel = self._ask_user_choice( + "Crawl course selection system data? (y/n): ", default="y" + ) + use_official = self._ask_user_choice( + "Crawl official website data? (y/n): ", default="y" + ) - # Get official website data for enhanced descriptions - official_data = self._get_official_website_data() + courses_data = [] + course_details = {} + prerequisites = {} + official_data = {} + + if use_coursesel: + self._ensure_initialized() # Make sure crawler is initialized + print("Crawling course selection system data...") + # Get data from course selection APIs + courses_data = self._get_lesson_tasks() + course_details = self._get_course_catalog() + prerequisites = self._get_prerequisites() + else: + print("Skipping course selection system data") + + if use_official: + print("Crawling official website data...") + # Get official website data for enhanced descriptions + official_data = self._get_official_website_data() + else: + print("Skipping official website data") return self._integrate_course_data( courses_data, course_details, prerequisites, official_data ) + def _ask_user_choice(self, prompt, default="y"): + """Ask user for yes/no choice with default value""" + while True: + response = input(prompt).strip().lower() + if not response: + response = default.lower() + if response in ["y", "yes", "true"]: + return True + elif response in ["n", "no", "false"]: + return False + else: + print("Please enter y/yes or n/no") + def _get_current_elect_turn_id(self): """Get current election turn ID dynamically""" url = f"{BASE_URL}/tpm/findStudentElectTurns_ElectTurn.action" @@ -330,40 +363,62 @@ def _integrate_course_data( f"Starting integration with {len(courses_data)} courses, {len(prerequisites)} prereq groups, {len(official_data)} official records" ) - courses_by_code = defaultdict(list) - for course in courses_data: - course_code = course.get("courseCode") - if course_code: - courses_by_code[course_code].append(course) - integrated_courses = [] courses_with_prereqs = 0 - for course_code, course_list in courses_by_code.items(): - merged = self._merge_course_sections(course_list) - if not merged: - continue - - course_id = merged.get("courseId") - catalog_info = course_details.get(course_id, {}) - prereq_info = prerequisites.get(course_id, []) - official_info = official_data.get(course_code, {}) + # If we have course selection data, process it + if courses_data: + courses_by_code = defaultdict(list) + for course in courses_data: + course_code = course.get("courseCode") + if course_code: + courses_by_code[course_code].append(course) + + for course_code, course_list in courses_by_code.items(): + merged = self._merge_course_sections(course_list) + if not 
merged: + continue + + course_id = merged.get("courseId") + catalog_info = course_details.get(course_id, {}) + prereq_info = prerequisites.get(course_id, []) + official_info = official_data.get(course_code, {}) + + if prereq_info: + courses_with_prereqs += 1 + logger.debug( + f"Course {course_code} (ID: {course_id}) has {len(prereq_info)} prereqs" + ) - if prereq_info: - courses_with_prereqs += 1 - logger.debug( - f"Course {course_code} (ID: {course_id}) has {len(prereq_info)} prereqs" + course_data = self._build_course_record( + course_code, merged, catalog_info, prereq_info, official_info ) - course_data = self._build_course_record( - course_code, merged, catalog_info, prereq_info, official_info - ) + if course_data: + integrated_courses.append(course_data) + + # If we only have official data (no course selection data), create courses from official data + elif official_data: + logger.info("Creating courses from official website data only") + for course_code, official_info in official_data.items(): + # Create empty main_data for courses that only exist in official website + empty_main_data = {} + empty_catalog_data = {} + empty_prereq_data = [] + + course_data = self._build_course_record( + course_code, + empty_main_data, + empty_catalog_data, + empty_prereq_data, + official_info, + ) - if course_data: - integrated_courses.append(course_data) + if course_data: + integrated_courses.append(course_data) logger.info( - f"Integration complete: {courses_with_prereqs} courses have prerequisites" + f"Integration complete: {courses_with_prereqs} courses have prerequisites, {len(integrated_courses)} total courses" ) return integrated_courses @@ -428,6 +483,10 @@ def _extract_course_title(self, main_data, catalog_data, official_data=None): """Extract course title (prefer English name)""" if official_data is None: official_data = {} + if main_data is None: + main_data = {} + if catalog_data is None: + catalog_data = {} return ( official_data.get("course_title", "") @@ -442,8 +501,8 @@ def _parse_course_code(self, course_code): number = 0 if course_code: - # Match DEPT####J? (J is optional) - match = re.match(r"^([A-Z]{2,4})(\d{4})J?$", course_code) + # Match DEPT###(#)?J? 
(3 or 4 digits, J is optional) + match = re.match(r"^([A-Z]{2,4})(\d{3,4})J?$", course_code) if match: department = match.group(1) number = int(match.group(2)) @@ -452,6 +511,11 @@ def _parse_course_code(self, course_code): def _extract_course_credits(self, main_data, catalog_data): """Extract course credits""" + if main_data is None: + main_data = {} + if catalog_data is None: + catalog_data = {} + course_credits = main_data.get("totalCredit", 0) or catalog_data.get( "credit", 0 ) @@ -520,6 +584,11 @@ def _extract_description(self, official_data=None): def _extract_instructors(self, main_data, catalog_data): """Extract and merge instructor information""" + if main_data is None: + main_data = {} + if catalog_data is None: + catalog_data = {} + instructors = main_data.get("all_instructors", []) teacher_name = catalog_data.get("teacherName", "") @@ -532,6 +601,8 @@ def _extract_instructors(self, main_data, catalog_data): def _build_course_url(self, main_data): """Build course detail page URL""" + if main_data is None: + main_data = {} course_id = main_data.get("courseId") return f"{COURSE_DETAIL_URL_PREFIX}{course_id}" if course_id else "" From 62ddbe4eaa2e21674b209073badd6ec4db6baf22 Mon Sep 17 00:00:00 2001 From: Leqi Tang Date: Sat, 4 Oct 2025 11:45:50 +0800 Subject: [PATCH 10/20] feat(spider): update async function --- Makefile | 2 +- apps/spider/crawlers/orc.py | 355 ++++++++++++++++++++++++--------- apps/spider/manager.py | 343 ++++++++++++++++++++++++++++++++ pyproject.toml | 1 + uv.lock | 381 +++++++++++++++++++++++++++++++++++- 5 files changed, 992 insertions(+), 90 deletions(-) create mode 100644 apps/spider/manager.py diff --git a/Makefile b/Makefile index 48d26fb..63421ce 100644 --- a/Makefile +++ b/Makefile @@ -33,7 +33,7 @@ collect: @echo "Collecting Django static files (confirming 'yes')..." echo 'yes' | uv run manage.py collectstatic -format: format-backend format-frontend +format: format-backend @echo "All code formatted successfully!" format-backend: diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index 0056dd3..331f994 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -1,14 +1,18 @@ import re import urllib.parse import logging - +import asyncio +import aiohttp import json +from datetime import datetime + import requests import time from collections import defaultdict from urllib.parse import urljoin from apps.spider.utils import retrieve_soup +from apps.spider.manager import CourseDataCache from apps.web.models import Course, CourseOffering, Instructor from lib.constants import CURRENT_TERM @@ -75,16 +79,43 @@ def _ensure_initialized(self): self._initialized = True logger.info("Crawler initialized successfully!") - def get_all_courses(self): + def get_all_courses(self, use_cache=True, save_cache=True): """ Get all course data from multiple APIs and official website + Args: + use_cache: Whether to use cached data + save_cache: Whether to save data to cache + Returns: list: Course data with prerequisites, descriptions, and instructors """ - # Ask user which data sources to use + cache_manager = CourseDataCache() + + # If using cache, check for available cache files first + if use_cache: + cache_files = cache_manager.list_cache_files() + if cache_files: + print(f"Found {len(cache_files)} cache files") + choice = input("Use existing cache? 
(y/n/list): ").strip().lower() + + if choice == "list": + # Show cache file list for selection + from apps.spider.manager import interactive_cache_manager + + selected_file = interactive_cache_manager() + if selected_file: + print(f"Loading cache file: {selected_file.name}") + return cache_manager.load_from_jsonl(selected_file) + elif choice in ["y", "yes"]: + # Use the latest cache file + latest_file = cache_files[0] + print(f"Loading latest cache: {latest_file.name}") + return cache_manager.load_from_jsonl(latest_file) + + # Ask user to choose data sources use_coursesel = self._ask_user_choice( - "Crawl course selection system data? (y/n): ", default="y" + "Crawl course selection system data? (y/n): ", default="n" ) use_official = self._ask_user_choice( "Crawl official website data? (y/n): ", default="y" @@ -97,25 +128,58 @@ def get_all_courses(self): if use_coursesel: self._ensure_initialized() # Make sure crawler is initialized - print("Crawling course selection system data...") + print("🌐 爬取课程选择系统数据...") # Get data from course selection APIs courses_data = self._get_lesson_tasks() course_details = self._get_course_catalog() prerequisites = self._get_prerequisites() else: - print("Skipping course selection system data") + print("⏭️ 跳过课程选择系统数据") if use_official: - print("Crawling official website data...") + print("🌐 爬取官网数据...") # Get official website data for enhanced descriptions official_data = self._get_official_website_data() else: print("Skipping official website data") - return self._integrate_course_data( + # Integrate data + integrated_data = self._integrate_course_data( courses_data, course_details, prerequisites, official_data ) + # Save to cache + if save_cache and integrated_data: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + data_sources = [] + if use_coursesel: + data_sources.append("coursesel") + if use_official: + data_sources.append("official") + + cache_filename = f"courses_{'_'.join(data_sources)}_{timestamp}" + cache_filepath = cache_manager.save_to_jsonl( + integrated_data, cache_filename + ) + + print(f"Data cached to: {cache_filepath}") + + # Ask whether to import to database immediately + from apps.spider.manager import preview_data_before_import + + if preview_data_before_import(cache_filepath, limit=5): + print("Starting database import...") + try: + import_department(integrated_data) + print("Data import successful!") + except Exception as e: + print(f"Data import failed: {str(e)}") + print("Data saved to cache, can be imported manually later") + else: + print("Skipping database import, data saved to cache") + + return integrated_data + def _ask_user_choice(self, prompt, default="y"): """Ask user for yes/no choice with default value""" while True: @@ -256,47 +320,90 @@ def _get_official_website_data(self): logger.info("Fetching course data from official website") try: - # Get all course URLs from official website - official_urls = self._get_official_course_urls() - official_data = {} - - for url in official_urls: - course_data = self._crawl_official_course_data(url) - if course_data and course_data.get("course_code"): - official_data[course_data["course_code"]] = course_data - - logger.info(f"Fetched official data for {len(official_data)} courses") - return official_data - + # Run the async crawler + return asyncio.run(self._get_official_website_data_async()) except Exception as e: logger.error(f"Error fetching official website data: {str(e)}") return {} - def _get_official_course_urls(self): - """Get all course URLs from official website""" - try: 
- soup = retrieve_soup(OFFICIAL_UNDERGRAD_URL) - linked_urls = [ - urljoin(OFFICIAL_BASE_URL, a["href"]) - for a in soup.find_all("a", href=True) + async def _get_official_website_data_async(self): + """Async version of official website data fetching with concurrency""" + # Get all course URLs from official website + official_urls = self._get_official_course_urls() + + if not official_urls: + logger.warning("No official course URLs found") + return {} + + logger.info(f"Found {len(official_urls)} course URLs to crawl") + + # Create aiohttp session with timeout and headers + timeout = aiohttp.ClientTimeout(total=30, connect=10) + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + + async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session: + # Create semaphore to limit concurrent requests + semaphore = asyncio.Semaphore(10) # Max 10 concurrent requests + + # Create tasks for all URLs + tasks = [ + self._crawl_official_course_data_async(session, semaphore, url) + for url in official_urls ] - return set( - linked_url - for linked_url in linked_urls - if self._is_official_course_url(linked_url) + + # Execute all tasks concurrently with progress tracking + official_data = {} + completed = 0 + total = len(tasks) + + for coro in asyncio.as_completed(tasks): + try: + course_data = await coro + completed += 1 + + if completed % 5 == 0 or completed == total: + logger.info(f"Progress: {completed}/{total} courses crawled") + + if course_data and course_data.get("course_code"): + official_data[course_data["course_code"]] = course_data + + except Exception as e: + logger.warning(f"Failed to crawl one course: {str(e)}") + completed += 1 + + logger.info( + f"Successfully fetched official data for {len(official_data)} courses" ) - except Exception as e: - logger.error(f"Error getting official course URLs: {str(e)}") - return set() + return official_data - def _is_official_course_url(self, candidate_url): - """Check if URL is a valid official course detail URL""" - return candidate_url.startswith(OFFICIAL_COURSE_DETAIL_URL_PREFIX) + async def _crawl_official_course_data_async(self, session, semaphore, course_url): + """Async crawl single course data from official website""" + async with semaphore: # Limit concurrent requests + try: + async with session.get(course_url) as response: + if response.status != 200: + logger.warning(f"HTTP {response.status} for {course_url}") + return None - def _crawl_official_course_data(self, course_url): - """Crawl single course data from official website""" + html_content = await response.text() + return self._parse_official_course_html(html_content, course_url) + + except asyncio.TimeoutError: + logger.warning(f"Timeout for {course_url}") + return None + except Exception as e: + logger.warning(f"Error crawling {course_url}: {str(e)}") + return None + + def _parse_official_course_html(self, html_content, course_url): + """Parse HTML content to extract course data""" try: - soup = retrieve_soup(course_url) + from bs4 import BeautifulSoup + + soup = BeautifulSoup(html_content, "html.parser") + course_heading = soup.find("h2") if not course_heading: return None @@ -309,7 +416,12 @@ def _crawl_official_course_data(self, course_url): if len(split_course_heading) < 2: return None - children = list(soup.find_all(class_="et_pb_text_inner")[3].children) + # Find course content sections + text_inner_sections = soup.find_all(class_="et_pb_text_inner") + if 
len(text_inner_sections) < 4: + return None + + children = list(text_inner_sections[3].children) course_code = split_course_heading[0] course_title = split_course_heading[1] @@ -324,19 +436,17 @@ def _crawl_official_course_data(self, course_url): description = ( children[i + 2].get_text(strip=True) if i + 2 < len(children) + and hasattr(children[i + 2], "get_text") else "" ) if description == "\n" or "Course Topics" in description: description = "" elif "Course Topics:" in text: - course_topics = ( - [ + if i + 2 < len(children) and hasattr(children[i + 2], "find_all"): + course_topics = [ li.get_text(strip=True) for li in children[i + 2].find_all("li") ] - if i + 2 < len(children) - else [] - ) return { "course_code": course_code, @@ -347,11 +457,38 @@ def _crawl_official_course_data(self, course_url): } except Exception as e: - logger.error( - f"Error crawling official course data from {course_url}: {str(e)}" - ) + logger.warning(f"Error parsing course HTML from {course_url}: {str(e)}") return None + def _get_official_course_urls(self): + """Get all course URLs from official website""" + try: + from bs4 import Tag + + soup = retrieve_soup(OFFICIAL_UNDERGRAD_URL) + linked_urls = [] + + for a in soup.find_all("a", href=True): + # Check if it's a Tag element and has href attribute + if isinstance(a, Tag) and a.has_attr("href"): + href = a["href"] + if href and isinstance(href, str): + full_url = urljoin(OFFICIAL_BASE_URL, href) + linked_urls.append(full_url) + + return { + linked_url + for linked_url in linked_urls + if self._is_official_course_url(linked_url) + } + except Exception as e: + logger.error(f"Error getting official course URLs: {str(e)}") + return set() + + def _is_official_course_url(self, candidate_url): + """Check if URL is a valid official course detail URL""" + return candidate_url.startswith(OFFICIAL_COURSE_DETAIL_URL_PREFIX) + def _integrate_course_data( self, courses_data, course_details, prerequisites, official_data=None ): @@ -608,6 +745,7 @@ def _build_course_url(self, main_data): _crawler = None +_course_data_cache = {} def _get_crawler(): @@ -620,68 +758,109 @@ def _get_crawler(): def crawl_program_urls(): """Get all course URLs (legacy interface)""" + global _course_data_cache + crawler = _get_crawler() courses = crawler.get_all_courses() course_urls = [] - for course in courses: - if course.get("url"): - course_urls.append(course["url"]) - - if not hasattr(crawl_program_urls, "_course_data_cache"): - crawl_program_urls._course_data_cache = {} + _course_data_cache = {} # Reset cache for course in courses: if course.get("url"): - crawl_program_urls._course_data_cache[course["url"]] = course + course_urls.append(course["url"]) + _course_data_cache[course["url"]] = course return course_urls def _crawl_course_data(course_url): """Crawl single course data (legacy interface)""" - if hasattr(crawl_program_urls, "_course_data_cache"): - course_data = crawl_program_urls._course_data_cache.get(course_url) - if course_data: - return course_data + global _course_data_cache + + course_data = _course_data_cache.get(course_url) + if course_data: + return course_data return {} def import_department(department_data): - """Import course data to database""" + """Import course data to database with improved error handling""" + success_count = 0 + error_count = 0 + for course_data in department_data: - # Prepare defaults dict with all available fields - defaults = { - "course_title": course_data["course_title"], - "department": course_data["department"], - "number": 
course_data["number"], - "course_credits": course_data["course_credits"], - "pre_requisites": course_data["pre_requisites"], - "description": course_data["description"], - "course_topics": course_data["course_topics"], - "url": course_data["url"], - } + try: + # 验证必要字段 + required_fields = ["course_code", "course_title"] + missing_fields = [ + field for field in required_fields if not course_data.get(field) + ] - # Add official_url if available - if "official_url" in course_data: - defaults["official_url"] = course_data["official_url"] + if missing_fields: + logger.warning( + f"Skipping course due to missing required fields: {missing_fields}" + ) + error_count += 1 + continue + + # 准备默认值,处理可能缺失的字段 + defaults = { + "course_title": course_data.get("course_title", ""), + "department": course_data.get("department", ""), + "number": course_data.get("number", 0), + "course_credits": course_data.get("course_credits", 0), + "pre_requisites": course_data.get("pre_requisites", ""), + "description": course_data.get("description", ""), + "course_topics": course_data.get("course_topics", []), + "url": course_data.get("url", ""), + } - course, _ = Course.objects.update_or_create( - course_code=course_data["course_code"], - defaults=defaults, - ) + # 注意:official_url 字段不存在于Course模型中,所以不包含它 - if "instructors" in course_data and course_data["instructors"]: - for instructor_name in course_data["instructors"]: - instructor, _ = Instructor.objects.get_or_create(name=instructor_name) + # 创建或更新课程 + course, created = Course.objects.update_or_create( + course_code=course_data["course_code"], + defaults=defaults, + ) - offering, _ = CourseOffering.objects.get_or_create( - course=course, - term=CURRENT_TERM, - defaults={"section": 1, "period": ""}, - ) - offering.instructors.add(instructor) + # 处理教师信息 + instructors = course_data.get("instructors", []) + if instructors: + for instructor_name in instructors: + if instructor_name.strip(): # 确保教师名字不为空 + try: + instructor, _ = Instructor.objects.get_or_create( + name=instructor_name.strip() + ) + + offering, _ = CourseOffering.objects.get_or_create( + course=course, + term=CURRENT_TERM, + defaults={"section": 1, "period": ""}, + ) + offering.instructors.add(instructor) + except Exception as e: + logger.warning( + f"Error creating instructor {instructor_name}: {str(e)}" + ) + + success_count += 1 + if created: + logger.info(f"Created new course: {course_data['course_code']}") + else: + logger.info(f"Updated course: {course_data['course_code']}") + + except Exception as e: + error_count += 1 + course_code = course_data.get("course_code", "Unknown") + error_msg = str(e) + print(f"Error importing course {course_code}: {error_msg}") + logger.error(f"Error importing course {course_code}: {error_msg}") + + logger.info(f"Import completed: {success_count} successful, {error_count} errors") + return {"success": success_count, "errors": error_count} def extract_prerequisites(pre_requisites): diff --git a/apps/spider/manager.py b/apps/spider/manager.py new file mode 100644 index 0000000..9af7151 --- /dev/null +++ b/apps/spider/manager.py @@ -0,0 +1,343 @@ +""" +Unified spider data management system +Provides interactive cache management, crawler execution, and data import functionality +""" + +import json +import os +import sys +import django +from datetime import datetime +from pathlib import Path + +# Setup Django environment +project_root = Path(__file__).parent.parent.parent +sys.path.append(str(project_root)) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "website.settings") 
+django.setup() + + +class CourseDataCache: + """Course data cache manager""" + + def __init__(self, cache_dir=None): + if cache_dir is None: + # Default to the new cache directory location + cache_dir = Path(__file__).parent / "crawlers" / "data_cache" + self.cache_dir = Path(cache_dir) + self.cache_dir.mkdir(exist_ok=True) + + def save_to_jsonl(self, data, data_type, timestamp=None): + """Save data to jsonl file""" + if timestamp is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + filename = f"{data_type}_{timestamp}.jsonl" + filepath = self.cache_dir / filename + + print(f"Saving data to: {filepath}") + print(f"Data count: {len(data) if isinstance(data, list) else 1}") + + with open(filepath, "w", encoding="utf-8") as f: + if isinstance(data, list): + for item in data: + json.dump(item, f, ensure_ascii=False) + f.write("\n") + else: + json.dump(data, f, ensure_ascii=False) + f.write("\n") + + print(f"Data saved to: {filepath}") + return filepath + + def load_from_jsonl(self, filepath): + """Load data from jsonl file""" + data = [] + with open(filepath, "r", encoding="utf-8") as f: + for line in f: + if line.strip(): + data.append(json.loads(line)) + return data + + def list_cache_files(self): + """List all cache files""" + files = list(self.cache_dir.glob("*.jsonl")) + files.sort(key=lambda x: x.stat().st_mtime, reverse=True) + return files + + def get_cache_info(self, filepath): + """Get cache file information""" + stat = filepath.stat() + data = self.load_from_jsonl(filepath) + + return { + "filename": filepath.name, + "path": str(filepath), + "size": f"{stat.st_size / 1024:.1f} KB", + "modified": datetime.fromtimestamp(stat.st_mtime).strftime( + "%Y-%m-%d %H:%M:%S" + ), + "count": len(data), + "preview": data[:3] if len(data) > 3 else data, # Preview first 3 items + } + + +def interactive_cache_manager(): + """Interactive cache manager""" + cache = CourseDataCache() + + print("=" * 60) + print("Course Data Cache Manager") + print("=" * 60) + + files = cache.list_cache_files() + + if not files: + print("No cache files found") + print("Please run the crawler first to generate data cache") + return None + + print(f"Found {len(files)} cache files:") + print() + + for i, filepath in enumerate(files, 1): + info = cache.get_cache_info(filepath) + print(f"{i}. {info['filename']}") + print(f" Modified: {info['modified']}") + print(f" File size: {info['size']}") + print(f" Data count: {info['count']}") + + if info["preview"]: + print(" Data preview:") + for j, item in enumerate(info["preview"]): + course_code = item.get("course_code", "N/A") + course_title = item.get("course_title", "N/A") + print(f" {j + 1}. 
{course_code}: {course_title}") + print() + + while True: + try: + choice = input( + f"Select file to import (1-{len(files)}) or 'q' to quit: " + ).strip() + + if choice.lower() == "q": + print("Exiting cache manager") + return None + + file_index = int(choice) - 1 + if 0 <= file_index < len(files): + selected_file = files[file_index] + print(f"Selected file: {selected_file.name}") + return selected_file + else: + print(f"Please enter a number between 1-{len(files)}") + + except ValueError: + print("Please enter a valid number") + except KeyboardInterrupt: + print("\nExiting cache manager") + return None + + +def preview_data_before_import(filepath, limit=10): + """Preview data before import""" + cache = CourseDataCache() + data = cache.load_from_jsonl(filepath) + + print("=" * 60) + print(f"Data Preview ({filepath.name})") + print("=" * 60) + print(f"Total data count: {len(data)}") + print(f"Previewing first {min(limit, len(data))} items:") + print() + + for i, item in enumerate(data[:limit], 1): + print(f"{i}. Course code: {item.get('course_code', 'N/A')}") + print(f" Course title: {item.get('course_title', 'N/A')}") + print(f" Credits: {item.get('course_credits', 'N/A')}") + print(f" Department: {item.get('department', 'N/A')}") + print(f" Description length: {len(item.get('description', ''))}") + print(f" Instructors: {', '.join(item.get('instructors', []))}") + print() + + if len(data) > limit: + print(f"... and {len(data) - limit} more items") + + print("=" * 60) + + while True: + try: + confirm = ( + input("Confirm import these data to database? (y/n): ").strip().lower() + ) + if confirm in ["y", "yes"]: + return True + elif confirm in ["n", "no"]: + return False + else: + print("Please enter y or n") + except KeyboardInterrupt: + print("\nImport cancelled") + return False + + +def main_menu(): + """Main interactive menu for spider data management""" + print("\n" + "=" * 60) + print("Spider Data Management System") + print("=" * 60) + print("1. Run Crawler (Fetch new data)") + print("2. Manage Cache Files") + print("3. Import from Cache to Database") + print("4. Clean Cache Files") + print("5. Exit") + print("=" * 60) + + while True: + try: + choice = input("Select operation (1-5): ").strip() + + if choice == "1": + run_crawler() + elif choice == "2": + manage_cache() + elif choice == "3": + import_from_cache() + elif choice == "4": + clean_cache() + elif choice == "5": + print("Goodbye!") + break + else: + print("Please enter a number between 1-5") + + except (ValueError, KeyboardInterrupt): + print("\nProgram interrupted") + break + except Exception as e: + print(f"Error occurred: {str(e)}") + + +def run_crawler(): + """Run crawler to fetch new data""" + print("\n" + "=" * 60) + print("Run Crawler") + print("=" * 60) + + try: + from apps.spider.crawlers.orc import CourseSelCrawler + + crawler = CourseSelCrawler() + data = crawler.get_all_courses(use_cache=False, save_cache=True) + + print(f"Crawler execution completed, collected {len(data)} courses") + + except Exception as e: + print(f"Crawler execution failed: {str(e)}") + + +def manage_cache(): + """Manage and view cache files""" + print("\n" + "=" * 60) + print("Manage Cache Files") + print("=" * 60) + + cache = CourseDataCache() + files = cache.list_cache_files() + + if not files: + print("No cache files found") + return + + print(f"Found {len(files)} cache files:") + + for i, filepath in enumerate(files, 1): + info = cache.get_cache_info(filepath) + print(f"\n{i}. 
{info['filename']}") + print(f" Modified: {info['modified']}") + print(f" Size: {info['size']}") + print(f" Count: {info['count']}") + + # Show preview + if info["preview"]: + print(" Preview:") + for j, item in enumerate(info["preview"]): + course_code = item.get("course_code", "N/A") + course_title = item.get("course_title", "N/A") + print(f" {j + 1}. {course_code}: {course_title}") + + +def import_from_cache(): + """Import data from cache to database""" + print("\n" + "=" * 60) + print("Import from Cache") + print("=" * 60) + + selected_file = interactive_cache_manager() + if not selected_file: + return + + try: + cache = CourseDataCache() + data = cache.load_from_jsonl(selected_file) + + # Preview and confirm import + if preview_data_before_import(selected_file, limit=10): + print("Starting database import...") + + from apps.spider.crawlers.orc import import_department + + # Use batch import and get statistics + result = import_department(data) + + print("\nImport completed!") + print(f"Success: {result['success']} items") + print(f"Failed: {result['errors']} items") + + else: + print("Import cancelled") + + except Exception as e: + print(f"Error during import process: {str(e)}") + + +def clean_cache(): + """Clean cache files""" + print("\n" + "=" * 60) + print("Clean Cache Files") + print("=" * 60) + + cache = CourseDataCache() + files = cache.list_cache_files() + + if not files: + print("No cache files found") + return + + print(f"Found {len(files)} cache files") + + choice = input("Delete all cache files? (y/n): ").strip().lower() + + if choice in ["y", "yes"]: + deleted_count = 0 + for filepath in files: + try: + filepath.unlink() + deleted_count += 1 + print(f"Deleted: {filepath.name}") + except Exception as e: + print(f"Failed to delete {filepath.name}: {str(e)}") + + print(f"Cleanup completed, deleted {deleted_count} files") + else: + print("Cleanup cancelled") + + +if __name__ == "__main__": + try: + main_menu() + except KeyboardInterrupt: + print("\nProgram exited") + except Exception as e: + print(f"Program error: {str(e)}") diff --git a/pyproject.toml b/pyproject.toml index 3f3dfb5..85e9d32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "pytz>=2025.1", "redis>=5.2.1", "requests>=2.32.3", + "aiohttp>=3.10.0", "bpython>=0.25", "ptpython>=3.0.29", "djangorestframework>=3.16.0", diff --git a/uv.lock b/uv.lock index e845ede..2317c62 100644 --- a/uv.lock +++ b/uv.lock @@ -1,12 +1,13 @@ version = 1 revision = 3 -requires-python = ">=3.13" +requires-python = ">=3.12" [[package]] name = "" version = "0.0.1" source = { virtual = "." 
} dependencies = [ + { name = "aiohttp" }, { name = "beautifulsoup4" }, { name = "bpython" }, { name = "celery" }, @@ -36,6 +37,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "aiohttp", specifier = ">=3.10.0" }, { name = "beautifulsoup4", specifier = ">=4.13.3" }, { name = "bpython", specifier = ">=0.25" }, { name = "celery", specifier = ">=5.4.0" }, @@ -61,6 +63,79 @@ requires-dist = [ [package.metadata.requires-dev] dev = [{ name = "pre-commit", specifier = ">=4.3.0" }] +[[package]] +name = "aiohappyeyeballs" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, +] + +[[package]] +name = "aiohttp" +version = "3.12.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9b/e7/d92a237d8802ca88483906c388f7c201bbe96cd80a165ffd0ac2f6a8d59f/aiohttp-3.12.15.tar.gz", hash = "sha256:4fc61385e9c98d72fcdf47e6dd81833f47b2f77c114c29cd64a361be57a763a2", size = 7823716, upload-time = "2025-07-29T05:52:32.215Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/97/77cb2450d9b35f517d6cf506256bf4f5bda3f93a66b4ad64ba7fc917899c/aiohttp-3.12.15-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:802d3868f5776e28f7bf69d349c26fc0efadb81676d0afa88ed00d98a26340b7", size = 702333, upload-time = "2025-07-29T05:50:46.507Z" }, + { url = "https://files.pythonhosted.org/packages/83/6d/0544e6b08b748682c30b9f65640d006e51f90763b41d7c546693bc22900d/aiohttp-3.12.15-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2800614cd560287be05e33a679638e586a2d7401f4ddf99e304d98878c29444", size = 476948, upload-time = "2025-07-29T05:50:48.067Z" }, + { url = "https://files.pythonhosted.org/packages/3a/1d/c8c40e611e5094330284b1aea8a4b02ca0858f8458614fa35754cab42b9c/aiohttp-3.12.15-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8466151554b593909d30a0a125d638b4e5f3836e5aecde85b66b80ded1cb5b0d", size = 469787, upload-time = "2025-07-29T05:50:49.669Z" }, + { url = "https://files.pythonhosted.org/packages/38/7d/b76438e70319796bfff717f325d97ce2e9310f752a267bfdf5192ac6082b/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e5a495cb1be69dae4b08f35a6c4579c539e9b5706f606632102c0f855bcba7c", size = 1716590, upload-time = "2025-07-29T05:50:51.368Z" }, + { url = "https://files.pythonhosted.org/packages/79/b1/60370d70cdf8b269ee1444b390cbd72ce514f0d1cd1a715821c784d272c9/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6404dfc8cdde35c69aaa489bb3542fb86ef215fc70277c892be8af540e5e21c0", size = 1699241, upload-time = "2025-07-29T05:50:53.628Z" }, + { url = 
"https://files.pythonhosted.org/packages/a3/2b/4968a7b8792437ebc12186db31523f541943e99bda8f30335c482bea6879/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ead1c00f8521a5c9070fcb88f02967b1d8a0544e6d85c253f6968b785e1a2ab", size = 1754335, upload-time = "2025-07-29T05:50:55.394Z" }, + { url = "https://files.pythonhosted.org/packages/fb/c1/49524ed553f9a0bec1a11fac09e790f49ff669bcd14164f9fab608831c4d/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6990ef617f14450bc6b34941dba4f12d5613cbf4e33805932f853fbd1cf18bfb", size = 1800491, upload-time = "2025-07-29T05:50:57.202Z" }, + { url = "https://files.pythonhosted.org/packages/de/5e/3bf5acea47a96a28c121b167f5ef659cf71208b19e52a88cdfa5c37f1fcc/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd736ed420f4db2b8148b52b46b88ed038d0354255f9a73196b7bbce3ea97545", size = 1719929, upload-time = "2025-07-29T05:50:59.192Z" }, + { url = "https://files.pythonhosted.org/packages/39/94/8ae30b806835bcd1cba799ba35347dee6961a11bd507db634516210e91d8/aiohttp-3.12.15-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c5092ce14361a73086b90c6efb3948ffa5be2f5b6fbcf52e8d8c8b8848bb97c", size = 1635733, upload-time = "2025-07-29T05:51:01.394Z" }, + { url = "https://files.pythonhosted.org/packages/7a/46/06cdef71dd03acd9da7f51ab3a9107318aee12ad38d273f654e4f981583a/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:aaa2234bb60c4dbf82893e934d8ee8dea30446f0647e024074237a56a08c01bd", size = 1696790, upload-time = "2025-07-29T05:51:03.657Z" }, + { url = "https://files.pythonhosted.org/packages/02/90/6b4cfaaf92ed98d0ec4d173e78b99b4b1a7551250be8937d9d67ecb356b4/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6d86a2fbdd14192e2f234a92d3b494dd4457e683ba07e5905a0b3ee25389ac9f", size = 1718245, upload-time = "2025-07-29T05:51:05.911Z" }, + { url = "https://files.pythonhosted.org/packages/2e/e6/2593751670fa06f080a846f37f112cbe6f873ba510d070136a6ed46117c6/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a041e7e2612041a6ddf1c6a33b883be6a421247c7afd47e885969ee4cc58bd8d", size = 1658899, upload-time = "2025-07-29T05:51:07.753Z" }, + { url = "https://files.pythonhosted.org/packages/8f/28/c15bacbdb8b8eb5bf39b10680d129ea7410b859e379b03190f02fa104ffd/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:5015082477abeafad7203757ae44299a610e89ee82a1503e3d4184e6bafdd519", size = 1738459, upload-time = "2025-07-29T05:51:09.56Z" }, + { url = "https://files.pythonhosted.org/packages/00/de/c269cbc4faa01fb10f143b1670633a8ddd5b2e1ffd0548f7aa49cb5c70e2/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:56822ff5ddfd1b745534e658faba944012346184fbfe732e0d6134b744516eea", size = 1766434, upload-time = "2025-07-29T05:51:11.423Z" }, + { url = "https://files.pythonhosted.org/packages/52/b0/4ff3abd81aa7d929b27d2e1403722a65fc87b763e3a97b3a2a494bfc63bc/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b2acbbfff69019d9014508c4ba0401822e8bae5a5fdc3b6814285b71231b60f3", size = 1726045, upload-time = "2025-07-29T05:51:13.689Z" }, + { url = "https://files.pythonhosted.org/packages/71/16/949225a6a2dd6efcbd855fbd90cf476052e648fb011aa538e3b15b89a57a/aiohttp-3.12.15-cp312-cp312-win32.whl", hash = "sha256:d849b0901b50f2185874b9a232f38e26b9b3d4810095a7572eacea939132d4e1", size = 423591, upload-time = "2025-07-29T05:51:15.452Z" 
}, + { url = "https://files.pythonhosted.org/packages/2b/d8/fa65d2a349fe938b76d309db1a56a75c4fb8cc7b17a398b698488a939903/aiohttp-3.12.15-cp312-cp312-win_amd64.whl", hash = "sha256:b390ef5f62bb508a9d67cb3bba9b8356e23b3996da7062f1a57ce1a79d2b3d34", size = 450266, upload-time = "2025-07-29T05:51:17.239Z" }, + { url = "https://files.pythonhosted.org/packages/f2/33/918091abcf102e39d15aba2476ad9e7bd35ddb190dcdd43a854000d3da0d/aiohttp-3.12.15-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9f922ffd05034d439dde1c77a20461cf4a1b0831e6caa26151fe7aa8aaebc315", size = 696741, upload-time = "2025-07-29T05:51:19.021Z" }, + { url = "https://files.pythonhosted.org/packages/b5/2a/7495a81e39a998e400f3ecdd44a62107254803d1681d9189be5c2e4530cd/aiohttp-3.12.15-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2ee8a8ac39ce45f3e55663891d4b1d15598c157b4d494a4613e704c8b43112cd", size = 474407, upload-time = "2025-07-29T05:51:21.165Z" }, + { url = "https://files.pythonhosted.org/packages/49/fc/a9576ab4be2dcbd0f73ee8675d16c707cfc12d5ee80ccf4015ba543480c9/aiohttp-3.12.15-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3eae49032c29d356b94eee45a3f39fdf4b0814b397638c2f718e96cfadf4c4e4", size = 466703, upload-time = "2025-07-29T05:51:22.948Z" }, + { url = "https://files.pythonhosted.org/packages/09/2f/d4bcc8448cf536b2b54eed48f19682031ad182faa3a3fee54ebe5b156387/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b97752ff12cc12f46a9b20327104448042fce5c33a624f88c18f66f9368091c7", size = 1705532, upload-time = "2025-07-29T05:51:25.211Z" }, + { url = "https://files.pythonhosted.org/packages/f1/f3/59406396083f8b489261e3c011aa8aee9df360a96ac8fa5c2e7e1b8f0466/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:894261472691d6fe76ebb7fcf2e5870a2ac284c7406ddc95823c8598a1390f0d", size = 1686794, upload-time = "2025-07-29T05:51:27.145Z" }, + { url = "https://files.pythonhosted.org/packages/dc/71/164d194993a8d114ee5656c3b7ae9c12ceee7040d076bf7b32fb98a8c5c6/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5fa5d9eb82ce98959fc1031c28198b431b4d9396894f385cb63f1e2f3f20ca6b", size = 1738865, upload-time = "2025-07-29T05:51:29.366Z" }, + { url = "https://files.pythonhosted.org/packages/1c/00/d198461b699188a93ead39cb458554d9f0f69879b95078dce416d3209b54/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0fa751efb11a541f57db59c1dd821bec09031e01452b2b6217319b3a1f34f3d", size = 1788238, upload-time = "2025-07-29T05:51:31.285Z" }, + { url = "https://files.pythonhosted.org/packages/85/b8/9e7175e1fa0ac8e56baa83bf3c214823ce250d0028955dfb23f43d5e61fd/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5346b93e62ab51ee2a9d68e8f73c7cf96ffb73568a23e683f931e52450e4148d", size = 1710566, upload-time = "2025-07-29T05:51:33.219Z" }, + { url = "https://files.pythonhosted.org/packages/59/e4/16a8eac9df39b48ae102ec030fa9f726d3570732e46ba0c592aeeb507b93/aiohttp-3.12.15-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:049ec0360f939cd164ecbfd2873eaa432613d5e77d6b04535e3d1fbae5a9e645", size = 1624270, upload-time = "2025-07-29T05:51:35.195Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f8/cd84dee7b6ace0740908fd0af170f9fab50c2a41ccbc3806aabcb1050141/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:b52dcf013b57464b6d1e51b627adfd69a8053e84b7103a7cd49c030f9ca44461", size = 1677294, upload-time = "2025-07-29T05:51:37.215Z" }, + { url = "https://files.pythonhosted.org/packages/ce/42/d0f1f85e50d401eccd12bf85c46ba84f947a84839c8a1c2c5f6e8ab1eb50/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9b2af240143dd2765e0fb661fd0361a1b469cab235039ea57663cda087250ea9", size = 1708958, upload-time = "2025-07-29T05:51:39.328Z" }, + { url = "https://files.pythonhosted.org/packages/d5/6b/f6fa6c5790fb602538483aa5a1b86fcbad66244997e5230d88f9412ef24c/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ac77f709a2cde2cc71257ab2d8c74dd157c67a0558a0d2799d5d571b4c63d44d", size = 1651553, upload-time = "2025-07-29T05:51:41.356Z" }, + { url = "https://files.pythonhosted.org/packages/04/36/a6d36ad545fa12e61d11d1932eef273928b0495e6a576eb2af04297fdd3c/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:47f6b962246f0a774fbd3b6b7be25d59b06fdb2f164cf2513097998fc6a29693", size = 1727688, upload-time = "2025-07-29T05:51:43.452Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c8/f195e5e06608a97a4e52c5d41c7927301bf757a8e8bb5bbf8cef6c314961/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:760fb7db442f284996e39cf9915a94492e1896baac44f06ae551974907922b64", size = 1761157, upload-time = "2025-07-29T05:51:45.643Z" }, + { url = "https://files.pythonhosted.org/packages/05/6a/ea199e61b67f25ba688d3ce93f63b49b0a4e3b3d380f03971b4646412fc6/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad702e57dc385cae679c39d318def49aef754455f237499d5b99bea4ef582e51", size = 1710050, upload-time = "2025-07-29T05:51:48.203Z" }, + { url = "https://files.pythonhosted.org/packages/b4/2e/ffeb7f6256b33635c29dbed29a22a723ff2dd7401fff42ea60cf2060abfb/aiohttp-3.12.15-cp313-cp313-win32.whl", hash = "sha256:f813c3e9032331024de2eb2e32a88d86afb69291fbc37a3a3ae81cc9917fb3d0", size = 422647, upload-time = "2025-07-29T05:51:50.718Z" }, + { url = "https://files.pythonhosted.org/packages/1b/8e/78ee35774201f38d5e1ba079c9958f7629b1fd079459aea9467441dbfbf5/aiohttp-3.12.15-cp313-cp313-win_amd64.whl", hash = "sha256:1a649001580bdb37c6fdb1bebbd7e3bc688e8ec2b5c6f52edbb664662b17dc84", size = 449067, upload-time = "2025-07-29T05:51:52.549Z" }, +] + +[[package]] +name = "aiosignal" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, +] + [[package]] name = "amqp" version = "5.3.1" @@ -100,6 +175,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/39/e3/893e8757be2612e6c266d9bb58ad2e3651524b5b40cf56761e985a28b13e/asgiref-3.8.1-py3-none-any.whl", hash = "sha256:3e1e3ecc849832fe52ccf2cb6686b7a55f82bb1d6aee72a58826471390335e47", size = 23828, upload-time = "2024-03-22T14:39:34.521Z" }, ] +[[package]] +name = "attrs" +version = "25.3.0" 
+source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/1367933a8532ee6ff8d63537de4f1177af4bff9f3e829baf7331f595bb24/attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b", size = 812032, upload-time = "2025-03-13T11:10:22.779Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815, upload-time = "2025-03-13T11:10:21.14Z" }, +] + [[package]] name = "beautifulsoup4" version = "4.13.4" @@ -195,6 +279,19 @@ version = "3.4.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/16/b0/572805e227f01586461c80e0fd25d65a2115599cc9dad142fee4b747c357/charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3", size = 123188, upload-time = "2024-12-24T18:12:35.43Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/9a/dd1e1cdceb841925b7798369a09279bd1cf183cef0f9ddf15a3a6502ee45/charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545", size = 196105, upload-time = "2024-12-24T18:10:38.83Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8c/90bfabf8c4809ecb648f39794cf2a84ff2e7d2a6cf159fe68d9a26160467/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7", size = 140404, upload-time = "2024-12-24T18:10:44.272Z" }, + { url = "https://files.pythonhosted.org/packages/ad/8f/e410d57c721945ea3b4f1a04b74f70ce8fa800d393d72899f0a40526401f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757", size = 150423, upload-time = "2024-12-24T18:10:45.492Z" }, + { url = "https://files.pythonhosted.org/packages/f0/b8/e6825e25deb691ff98cf5c9072ee0605dc2acfca98af70c2d1b1bc75190d/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa", size = 143184, upload-time = "2024-12-24T18:10:47.898Z" }, + { url = "https://files.pythonhosted.org/packages/3e/a2/513f6cbe752421f16d969e32f3583762bfd583848b763913ddab8d9bfd4f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d", size = 145268, upload-time = "2024-12-24T18:10:50.589Z" }, + { url = "https://files.pythonhosted.org/packages/74/94/8a5277664f27c3c438546f3eb53b33f5b19568eb7424736bdc440a88a31f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616", size = 147601, upload-time = "2024-12-24T18:10:52.541Z" }, + { url = "https://files.pythonhosted.org/packages/7c/5f/6d352c51ee763623a98e31194823518e09bfa48be2a7e8383cf691bbb3d0/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b", size = 141098, upload-time = "2024-12-24T18:10:53.789Z" }, + { url = 
"https://files.pythonhosted.org/packages/78/d4/f5704cb629ba5ab16d1d3d741396aec6dc3ca2b67757c45b0599bb010478/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d", size = 149520, upload-time = "2024-12-24T18:10:55.048Z" }, + { url = "https://files.pythonhosted.org/packages/c5/96/64120b1d02b81785f222b976c0fb79a35875457fa9bb40827678e54d1bc8/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a", size = 152852, upload-time = "2024-12-24T18:10:57.647Z" }, + { url = "https://files.pythonhosted.org/packages/84/c9/98e3732278a99f47d487fd3468bc60b882920cef29d1fa6ca460a1fdf4e6/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9", size = 150488, upload-time = "2024-12-24T18:10:59.43Z" }, + { url = "https://files.pythonhosted.org/packages/13/0e/9c8d4cb99c98c1007cc11eda969ebfe837bbbd0acdb4736d228ccaabcd22/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1", size = 146192, upload-time = "2024-12-24T18:11:00.676Z" }, + { url = "https://files.pythonhosted.org/packages/b2/21/2b6b5b860781a0b49427309cb8670785aa543fb2178de875b87b9cc97746/charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35", size = 95550, upload-time = "2024-12-24T18:11:01.952Z" }, + { url = "https://files.pythonhosted.org/packages/21/5b/1b390b03b1d16c7e382b561c5329f83cc06623916aab983e8ab9239c7d5c/charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f", size = 102785, upload-time = "2024-12-24T18:11:03.142Z" }, { url = "https://files.pythonhosted.org/packages/38/94/ce8e6f63d18049672c76d07d119304e1e2d7c6098f0841b51c666e9f44a0/charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda", size = 195698, upload-time = "2024-12-24T18:11:05.834Z" }, { url = "https://files.pythonhosted.org/packages/24/2e/dfdd9770664aae179a96561cc6952ff08f9a8cd09a908f259a9dfa063568/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313", size = 140162, upload-time = "2024-12-24T18:11:07.064Z" }, { url = "https://files.pythonhosted.org/packages/24/4e/f646b9093cff8fc86f2d60af2de4dc17c759de9d554f130b140ea4738ca6/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9", size = 150263, upload-time = "2024-12-24T18:11:08.374Z" }, @@ -310,6 +407,13 @@ version = "0.1.10" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/23/76/03fc9fb3441a13e9208bb6103ebb7200eba7647d040008b8303a1c03e152/cwcwidth-0.1.10.tar.gz", hash = "sha256:7468760f72c1f4107be1b2b2854bc000401ea36a69daed36fb966a1e19a7a124", size = 60265, upload-time = "2025-02-09T21:15:28.452Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/87/28/8e2ab81f0116bfcec22069e4c92fda9d05b0512605ccef00b62d93719ded/cwcwidth-0.1.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1d2b21ff2eb60c6793349b7fb161c40a8583a57ec32e61f47aab7938177bfdec", 
size = 23031, upload-time = "2025-02-09T21:14:59.01Z" }, + { url = "https://files.pythonhosted.org/packages/3a/a4/5adc535e2a714ecc926ea701e821a9abbe14f65cae4d615d20059b9b52a5/cwcwidth-0.1.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e0316488349c3e5ca4b20de7daa1cb8e96a05d1d14d040d46e87a495da655f4a", size = 101219, upload-time = "2025-02-09T21:15:00.079Z" }, + { url = "https://files.pythonhosted.org/packages/78/4c/18a5a06aa8db3cc28712ab957671e7718aedfc73403d84b0c2cb5cfcbc27/cwcwidth-0.1.10-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:848b6ffca1e32e28d2ccbb2cd395ccd3c38a7c4ec110728cd9d828eaf609b09e", size = 106565, upload-time = "2025-02-09T21:15:02.081Z" }, + { url = "https://files.pythonhosted.org/packages/06/40/801cba5ccb9551c862ad210eba22031e4655cd74711e32756b7ce24fc751/cwcwidth-0.1.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c3a7bfe1da478c0c27c549f68c6e28a583413da3ee451854ec2d983497bd18b8", size = 102244, upload-time = "2025-02-09T21:15:04.003Z" }, + { url = "https://files.pythonhosted.org/packages/e4/ed/60f61274fcfd0621a45e9403502e8f46968d562810a4424e5ff8d6bd50b0/cwcwidth-0.1.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cff03100f49170bc50fc399d05a31b8fcb7b0cef26df1a8068fa943387107f6c", size = 105634, upload-time = "2025-02-09T21:15:06.005Z" }, + { url = "https://files.pythonhosted.org/packages/b1/27/8179cecd688fef894dda601455d35066adfa3d58af4e97c5ab112893b5f6/cwcwidth-0.1.10-cp312-cp312-win32.whl", hash = "sha256:2dd9a92fdfbc53fc79f0953f39708dcf743fd27450c374985f419e3d47eb89d4", size = 23507, upload-time = "2025-02-09T21:15:07.968Z" }, + { url = "https://files.pythonhosted.org/packages/b2/b4/b7fe652a4d96f03ef051fff8313dfe827bc31578f7e67f1c98d5a5813f66/cwcwidth-0.1.10-cp312-cp312-win_amd64.whl", hash = "sha256:734d764281e3d87c40d0265543f00a653409145fa9f48a93bc0fbf9a8e7932ca", size = 26100, upload-time = "2025-02-09T21:15:09.186Z" }, { url = "https://files.pythonhosted.org/packages/af/f7/8c4cfe0b08053eea4da585ad5e12fef7cd11a0c9e4603ac8644c2a0b04b5/cwcwidth-0.1.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2391073280d774ab5d9af1d3aaa26ec456956d04daa1134fb71c31cd72ba5bba", size = 22344, upload-time = "2025-02-09T21:15:10.136Z" }, { url = "https://files.pythonhosted.org/packages/2a/48/176bbaf56520c5d6b72cbbe0d46821989eaa30df628daa5baecdd7f35458/cwcwidth-0.1.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6bfbdc2943631ec770ee781b35b8876fa7e283ff2273f944e2a9ae1f3df4ecdf", size = 94907, upload-time = "2025-02-09T21:15:11.178Z" }, { url = "https://files.pythonhosted.org/packages/bc/fc/4dfed13b316a67bf2419a63db53566e3e5e4d4fc5a94ef493d3334be3c1f/cwcwidth-0.1.10-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb0103c7db8d86e260e016ff89f8f00ef5eb75c481abc346bfaa756da9f976b4", size = 100046, upload-time = "2025-02-09T21:15:12.279Z" }, @@ -469,12 +573,81 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, ] +[[package]] +name = "frozenlist" +version = "1.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/79/b1/b64018016eeb087db503b038296fd782586432b9c077fc5c7839e9cb6ef6/frozenlist-1.7.0.tar.gz", hash = "sha256:2e310d81923c2437ea8670467121cc3e9b0f76d3043cc1d2331d56c7fb7a3a8f", size = 45078, upload-time = "2025-06-09T23:02:35.538Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/a2/c8131383f1e66adad5f6ecfcce383d584ca94055a34d683bbb24ac5f2f1c/frozenlist-1.7.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3dbf9952c4bb0e90e98aec1bd992b3318685005702656bc6f67c1a32b76787f2", size = 81424, upload-time = "2025-06-09T23:00:42.24Z" }, + { url = "https://files.pythonhosted.org/packages/4c/9d/02754159955088cb52567337d1113f945b9e444c4960771ea90eb73de8db/frozenlist-1.7.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1f5906d3359300b8a9bb194239491122e6cf1444c2efb88865426f170c262cdb", size = 47952, upload-time = "2025-06-09T23:00:43.481Z" }, + { url = "https://files.pythonhosted.org/packages/01/7a/0046ef1bd6699b40acd2067ed6d6670b4db2f425c56980fa21c982c2a9db/frozenlist-1.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3dabd5a8f84573c8d10d8859a50ea2dec01eea372031929871368c09fa103478", size = 46688, upload-time = "2025-06-09T23:00:44.793Z" }, + { url = "https://files.pythonhosted.org/packages/d6/a2/a910bafe29c86997363fb4c02069df4ff0b5bc39d33c5198b4e9dd42d8f8/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa57daa5917f1738064f302bf2626281a1cb01920c32f711fbc7bc36111058a8", size = 243084, upload-time = "2025-06-09T23:00:46.125Z" }, + { url = "https://files.pythonhosted.org/packages/64/3e/5036af9d5031374c64c387469bfcc3af537fc0f5b1187d83a1cf6fab1639/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c193dda2b6d49f4c4398962810fa7d7c78f032bf45572b3e04dd5249dff27e08", size = 233524, upload-time = "2025-06-09T23:00:47.73Z" }, + { url = "https://files.pythonhosted.org/packages/06/39/6a17b7c107a2887e781a48ecf20ad20f1c39d94b2a548c83615b5b879f28/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfe2b675cf0aaa6d61bf8fbffd3c274b3c9b7b1623beb3809df8a81399a4a9c4", size = 248493, upload-time = "2025-06-09T23:00:49.742Z" }, + { url = "https://files.pythonhosted.org/packages/be/00/711d1337c7327d88c44d91dd0f556a1c47fb99afc060ae0ef66b4d24793d/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8fc5d5cda37f62b262405cf9652cf0856839c4be8ee41be0afe8858f17f4c94b", size = 244116, upload-time = "2025-06-09T23:00:51.352Z" }, + { url = "https://files.pythonhosted.org/packages/24/fe/74e6ec0639c115df13d5850e75722750adabdc7de24e37e05a40527ca539/frozenlist-1.7.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0d5ce521d1dd7d620198829b87ea002956e4319002ef0bc8d3e6d045cb4646e", size = 224557, upload-time = "2025-06-09T23:00:52.855Z" }, + { url = "https://files.pythonhosted.org/packages/8d/db/48421f62a6f77c553575201e89048e97198046b793f4a089c79a6e3268bd/frozenlist-1.7.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:488d0a7d6a0008ca0db273c542098a0fa9e7dfaa7e57f70acef43f32b3f69dca", size = 241820, upload-time = "2025-06-09T23:00:54.43Z" }, + { url = "https://files.pythonhosted.org/packages/1d/fa/cb4a76bea23047c8462976ea7b7a2bf53997a0ca171302deae9d6dd12096/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:15a7eaba63983d22c54d255b854e8108e7e5f3e89f647fc854bd77a237e767df", size = 236542, upload-time = "2025-06-09T23:00:56.409Z" }, + { url = "https://files.pythonhosted.org/packages/5d/32/476a4b5cfaa0ec94d3f808f193301debff2ea42288a099afe60757ef6282/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:1eaa7e9c6d15df825bf255649e05bd8a74b04a4d2baa1ae46d9c2d00b2ca2cb5", size = 249350, upload-time = "2025-06-09T23:00:58.468Z" }, + { url = "https://files.pythonhosted.org/packages/8d/ba/9a28042f84a6bf8ea5dbc81cfff8eaef18d78b2a1ad9d51c7bc5b029ad16/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e4389e06714cfa9d47ab87f784a7c5be91d3934cd6e9a7b85beef808297cc025", size = 225093, upload-time = "2025-06-09T23:01:00.015Z" }, + { url = "https://files.pythonhosted.org/packages/bc/29/3a32959e68f9cf000b04e79ba574527c17e8842e38c91d68214a37455786/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:73bd45e1488c40b63fe5a7df892baf9e2a4d4bb6409a2b3b78ac1c6236178e01", size = 245482, upload-time = "2025-06-09T23:01:01.474Z" }, + { url = "https://files.pythonhosted.org/packages/80/e8/edf2f9e00da553f07f5fa165325cfc302dead715cab6ac8336a5f3d0adc2/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99886d98e1643269760e5fe0df31e5ae7050788dd288947f7f007209b8c33f08", size = 249590, upload-time = "2025-06-09T23:01:02.961Z" }, + { url = "https://files.pythonhosted.org/packages/1c/80/9a0eb48b944050f94cc51ee1c413eb14a39543cc4f760ed12657a5a3c45a/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:290a172aae5a4c278c6da8a96222e6337744cd9c77313efe33d5670b9f65fc43", size = 237785, upload-time = "2025-06-09T23:01:05.095Z" }, + { url = "https://files.pythonhosted.org/packages/f3/74/87601e0fb0369b7a2baf404ea921769c53b7ae00dee7dcfe5162c8c6dbf0/frozenlist-1.7.0-cp312-cp312-win32.whl", hash = "sha256:426c7bc70e07cfebc178bc4c2bf2d861d720c4fff172181eeb4a4c41d4ca2ad3", size = 39487, upload-time = "2025-06-09T23:01:06.54Z" }, + { url = "https://files.pythonhosted.org/packages/0b/15/c026e9a9fc17585a9d461f65d8593d281fedf55fbf7eb53f16c6df2392f9/frozenlist-1.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:563b72efe5da92e02eb68c59cb37205457c977aa7a449ed1b37e6939e5c47c6a", size = 43874, upload-time = "2025-06-09T23:01:07.752Z" }, + { url = "https://files.pythonhosted.org/packages/24/90/6b2cebdabdbd50367273c20ff6b57a3dfa89bd0762de02c3a1eb42cb6462/frozenlist-1.7.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee80eeda5e2a4e660651370ebffd1286542b67e268aa1ac8d6dbe973120ef7ee", size = 79791, upload-time = "2025-06-09T23:01:09.368Z" }, + { url = "https://files.pythonhosted.org/packages/83/2e/5b70b6a3325363293fe5fc3ae74cdcbc3e996c2a11dde2fd9f1fb0776d19/frozenlist-1.7.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d1a81c85417b914139e3a9b995d4a1c84559afc839a93cf2cb7f15e6e5f6ed2d", size = 47165, upload-time = "2025-06-09T23:01:10.653Z" }, + { url = "https://files.pythonhosted.org/packages/f4/25/a0895c99270ca6966110f4ad98e87e5662eab416a17e7fd53c364bf8b954/frozenlist-1.7.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cbb65198a9132ebc334f237d7b0df163e4de83fb4f2bdfe46c1e654bdb0c5d43", size = 45881, upload-time = "2025-06-09T23:01:12.296Z" }, + { url = "https://files.pythonhosted.org/packages/19/7c/71bb0bbe0832793c601fff68cd0cf6143753d0c667f9aec93d3c323f4b55/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dab46c723eeb2c255a64f9dc05b8dd601fde66d6b19cdb82b2e09cc6ff8d8b5d", size = 232409, 
upload-time = "2025-06-09T23:01:13.641Z" }, + { url = "https://files.pythonhosted.org/packages/c0/45/ed2798718910fe6eb3ba574082aaceff4528e6323f9a8570be0f7028d8e9/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6aeac207a759d0dedd2e40745575ae32ab30926ff4fa49b1635def65806fddee", size = 225132, upload-time = "2025-06-09T23:01:15.264Z" }, + { url = "https://files.pythonhosted.org/packages/ba/e2/8417ae0f8eacb1d071d4950f32f229aa6bf68ab69aab797b72a07ea68d4f/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bd8c4e58ad14b4fa7802b8be49d47993182fdd4023393899632c88fd8cd994eb", size = 237638, upload-time = "2025-06-09T23:01:16.752Z" }, + { url = "https://files.pythonhosted.org/packages/f8/b7/2ace5450ce85f2af05a871b8c8719b341294775a0a6c5585d5e6170f2ce7/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04fb24d104f425da3540ed83cbfc31388a586a7696142004c577fa61c6298c3f", size = 233539, upload-time = "2025-06-09T23:01:18.202Z" }, + { url = "https://files.pythonhosted.org/packages/46/b9/6989292c5539553dba63f3c83dc4598186ab2888f67c0dc1d917e6887db6/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a5c505156368e4ea6b53b5ac23c92d7edc864537ff911d2fb24c140bb175e60", size = 215646, upload-time = "2025-06-09T23:01:19.649Z" }, + { url = "https://files.pythonhosted.org/packages/72/31/bc8c5c99c7818293458fe745dab4fd5730ff49697ccc82b554eb69f16a24/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bd7eb96a675f18aa5c553eb7ddc24a43c8c18f22e1f9925528128c052cdbe00", size = 232233, upload-time = "2025-06-09T23:01:21.175Z" }, + { url = "https://files.pythonhosted.org/packages/59/52/460db4d7ba0811b9ccb85af996019f5d70831f2f5f255f7cc61f86199795/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:05579bf020096fe05a764f1f84cd104a12f78eaab68842d036772dc6d4870b4b", size = 227996, upload-time = "2025-06-09T23:01:23.098Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c9/f4b39e904c03927b7ecf891804fd3b4df3db29b9e487c6418e37988d6e9d/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:376b6222d114e97eeec13d46c486facd41d4f43bab626b7c3f6a8b4e81a5192c", size = 242280, upload-time = "2025-06-09T23:01:24.808Z" }, + { url = "https://files.pythonhosted.org/packages/b8/33/3f8d6ced42f162d743e3517781566b8481322be321b486d9d262adf70bfb/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0aa7e176ebe115379b5b1c95b4096fb1c17cce0847402e227e712c27bdb5a949", size = 217717, upload-time = "2025-06-09T23:01:26.28Z" }, + { url = "https://files.pythonhosted.org/packages/3e/e8/ad683e75da6ccef50d0ab0c2b2324b32f84fc88ceee778ed79b8e2d2fe2e/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3fbba20e662b9c2130dc771e332a99eff5da078b2b2648153a40669a6d0e36ca", size = 236644, upload-time = "2025-06-09T23:01:27.887Z" }, + { url = "https://files.pythonhosted.org/packages/b2/14/8d19ccdd3799310722195a72ac94ddc677541fb4bef4091d8e7775752360/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:f3f4410a0a601d349dd406b5713fec59b4cee7e71678d5b17edda7f4655a940b", size = 238879, upload-time = "2025-06-09T23:01:29.524Z" }, + { url = 
"https://files.pythonhosted.org/packages/ce/13/c12bf657494c2fd1079a48b2db49fa4196325909249a52d8f09bc9123fd7/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e2cdfaaec6a2f9327bf43c933c0319a7c429058e8537c508964a133dffee412e", size = 232502, upload-time = "2025-06-09T23:01:31.287Z" }, + { url = "https://files.pythonhosted.org/packages/d7/8b/e7f9dfde869825489382bc0d512c15e96d3964180c9499efcec72e85db7e/frozenlist-1.7.0-cp313-cp313-win32.whl", hash = "sha256:5fc4df05a6591c7768459caba1b342d9ec23fa16195e744939ba5914596ae3e1", size = 39169, upload-time = "2025-06-09T23:01:35.503Z" }, + { url = "https://files.pythonhosted.org/packages/35/89/a487a98d94205d85745080a37860ff5744b9820a2c9acbcdd9440bfddf98/frozenlist-1.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:52109052b9791a3e6b5d1b65f4b909703984b770694d3eb64fad124c835d7cba", size = 43219, upload-time = "2025-06-09T23:01:36.784Z" }, + { url = "https://files.pythonhosted.org/packages/56/d5/5c4cf2319a49eddd9dd7145e66c4866bdc6f3dbc67ca3d59685149c11e0d/frozenlist-1.7.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:a6f86e4193bb0e235ef6ce3dde5cbabed887e0b11f516ce8a0f4d3b33078ec2d", size = 84345, upload-time = "2025-06-09T23:01:38.295Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7d/ec2c1e1dc16b85bc9d526009961953df9cec8481b6886debb36ec9107799/frozenlist-1.7.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:82d664628865abeb32d90ae497fb93df398a69bb3434463d172b80fc25b0dd7d", size = 48880, upload-time = "2025-06-09T23:01:39.887Z" }, + { url = "https://files.pythonhosted.org/packages/69/86/f9596807b03de126e11e7d42ac91e3d0b19a6599c714a1989a4e85eeefc4/frozenlist-1.7.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:912a7e8375a1c9a68325a902f3953191b7b292aa3c3fb0d71a216221deca460b", size = 48498, upload-time = "2025-06-09T23:01:41.318Z" }, + { url = "https://files.pythonhosted.org/packages/5e/cb/df6de220f5036001005f2d726b789b2c0b65f2363b104bbc16f5be8084f8/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9537c2777167488d539bc5de2ad262efc44388230e5118868e172dd4a552b146", size = 292296, upload-time = "2025-06-09T23:01:42.685Z" }, + { url = "https://files.pythonhosted.org/packages/83/1f/de84c642f17c8f851a2905cee2dae401e5e0daca9b5ef121e120e19aa825/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f34560fb1b4c3e30ba35fa9a13894ba39e5acfc5f60f57d8accde65f46cc5e74", size = 273103, upload-time = "2025-06-09T23:01:44.166Z" }, + { url = "https://files.pythonhosted.org/packages/88/3c/c840bfa474ba3fa13c772b93070893c6e9d5c0350885760376cbe3b6c1b3/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:acd03d224b0175f5a850edc104ac19040d35419eddad04e7cf2d5986d98427f1", size = 292869, upload-time = "2025-06-09T23:01:45.681Z" }, + { url = "https://files.pythonhosted.org/packages/a6/1c/3efa6e7d5a39a1d5ef0abeb51c48fb657765794a46cf124e5aca2c7a592c/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2038310bc582f3d6a09b3816ab01737d60bf7b1ec70f5356b09e84fb7408ab1", size = 291467, upload-time = "2025-06-09T23:01:47.234Z" }, + { url = "https://files.pythonhosted.org/packages/4f/00/d5c5e09d4922c395e2f2f6b79b9a20dab4b67daaf78ab92e7729341f61f6/frozenlist-1.7.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b8c05e4c8e5f36e5e088caa1bf78a687528f83c043706640a92cb76cd6999384", size = 266028, 
upload-time = "2025-06-09T23:01:48.819Z" }, + { url = "https://files.pythonhosted.org/packages/4e/27/72765be905619dfde25a7f33813ac0341eb6b076abede17a2e3fbfade0cb/frozenlist-1.7.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:765bb588c86e47d0b68f23c1bee323d4b703218037765dcf3f25c838c6fecceb", size = 284294, upload-time = "2025-06-09T23:01:50.394Z" }, + { url = "https://files.pythonhosted.org/packages/88/67/c94103a23001b17808eb7dd1200c156bb69fb68e63fcf0693dde4cd6228c/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:32dc2e08c67d86d0969714dd484fd60ff08ff81d1a1e40a77dd34a387e6ebc0c", size = 281898, upload-time = "2025-06-09T23:01:52.234Z" }, + { url = "https://files.pythonhosted.org/packages/42/34/a3e2c00c00f9e2a9db5653bca3fec306349e71aff14ae45ecc6d0951dd24/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:c0303e597eb5a5321b4de9c68e9845ac8f290d2ab3f3e2c864437d3c5a30cd65", size = 290465, upload-time = "2025-06-09T23:01:53.788Z" }, + { url = "https://files.pythonhosted.org/packages/bb/73/f89b7fbce8b0b0c095d82b008afd0590f71ccb3dee6eee41791cf8cd25fd/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:a47f2abb4e29b3a8d0b530f7c3598badc6b134562b1a5caee867f7c62fee51e3", size = 266385, upload-time = "2025-06-09T23:01:55.769Z" }, + { url = "https://files.pythonhosted.org/packages/cd/45/e365fdb554159462ca12df54bc59bfa7a9a273ecc21e99e72e597564d1ae/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:3d688126c242a6fabbd92e02633414d40f50bb6002fa4cf995a1d18051525657", size = 288771, upload-time = "2025-06-09T23:01:57.4Z" }, + { url = "https://files.pythonhosted.org/packages/00/11/47b6117002a0e904f004d70ec5194fe9144f117c33c851e3d51c765962d0/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:4e7e9652b3d367c7bd449a727dc79d5043f48b88d0cbfd4f9f1060cf2b414104", size = 288206, upload-time = "2025-06-09T23:01:58.936Z" }, + { url = "https://files.pythonhosted.org/packages/40/37/5f9f3c3fd7f7746082ec67bcdc204db72dad081f4f83a503d33220a92973/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1a85e345b4c43db8b842cab1feb41be5cc0b10a1830e6295b69d7310f99becaf", size = 282620, upload-time = "2025-06-09T23:02:00.493Z" }, + { url = "https://files.pythonhosted.org/packages/0b/31/8fbc5af2d183bff20f21aa743b4088eac4445d2bb1cdece449ae80e4e2d1/frozenlist-1.7.0-cp313-cp313t-win32.whl", hash = "sha256:3a14027124ddb70dfcee5148979998066897e79f89f64b13328595c4bdf77c81", size = 43059, upload-time = "2025-06-09T23:02:02.072Z" }, + { url = "https://files.pythonhosted.org/packages/bb/ed/41956f52105b8dbc26e457c5705340c67c8cc2b79f394b79bffc09d0e938/frozenlist-1.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3bf8010d71d4507775f658e9823210b7427be36625b387221642725b515dcf3e", size = 47516, upload-time = "2025-06-09T23:02:03.779Z" }, + { url = "https://files.pythonhosted.org/packages/ee/45/b82e3c16be2182bff01179db177fe144d58b5dc787a7d4492c6ed8b9317f/frozenlist-1.7.0-py3-none-any.whl", hash = "sha256:9a5af342e34f7e97caf8c995864c7a396418ae2859cc6fdf1b1073020d516a7e", size = 13106, upload-time = "2025-06-09T23:02:34.204Z" }, +] + [[package]] name = "greenlet" version = "3.2.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/3f/74/907bb43af91782e0366b0960af62a8ce1f9398e4291cac7beaeffbee0c04/greenlet-3.2.1.tar.gz", hash = "sha256:9f4dd4b4946b14bb3bf038f81e1d2e535b7d94f1b2a59fdba1293cd9c1a0a4d7", size = 
184475, upload-time = "2025-04-22T14:40:18.206Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/f0/d1/e4777b188a04726f6cf69047830d37365b9191017f54caf2f7af336a6f18/greenlet-3.2.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:0ba2811509a30e5f943be048895a983a8daf0b9aa0ac0ead526dfb5d987d80ea", size = 270381, upload-time = "2025-04-22T14:25:43.69Z" }, + { url = "https://files.pythonhosted.org/packages/59/e7/b5b738f5679247ddfcf2179c38945519668dced60c3164c20d55c1a7bb4a/greenlet-3.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4245246e72352b150a1588d43ddc8ab5e306bef924c26571aafafa5d1aaae4e8", size = 637195, upload-time = "2025-04-22T14:53:44.563Z" }, + { url = "https://files.pythonhosted.org/packages/6c/9f/57968c88a5f6bc371364baf983a2e5549cca8f503bfef591b6dd81332cbc/greenlet-3.2.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7abc0545d8e880779f0c7ce665a1afc3f72f0ca0d5815e2b006cafc4c1cc5840", size = 651381, upload-time = "2025-04-22T14:54:59.439Z" }, + { url = "https://files.pythonhosted.org/packages/40/81/1533c9a458e9f2ebccb3ae22f1463b2093b0eb448a88aac36182f1c2cd3d/greenlet-3.2.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6dcc6d604a6575c6225ac0da39df9335cc0c6ac50725063fa90f104f3dbdb2c9", size = 646110, upload-time = "2025-04-22T15:04:35.739Z" }, + { url = "https://files.pythonhosted.org/packages/06/66/25f7e4b1468ebe4a520757f2e41c2a36a2f49a12e963431b82e9f98df2a0/greenlet-3.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2273586879affca2d1f414709bb1f61f0770adcabf9eda8ef48fd90b36f15d12", size = 648070, upload-time = "2025-04-22T14:27:05.976Z" }, + { url = "https://files.pythonhosted.org/packages/d7/4c/49d366565c4c4d29e6f666287b9e2f471a66c3a3d8d5066692e347f09e27/greenlet-3.2.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ff38c869ed30fff07f1452d9a204ece1ec6d3c0870e0ba6e478ce7c1515acf22", size = 603816, upload-time = "2025-04-22T14:25:57.224Z" }, + { url = "https://files.pythonhosted.org/packages/04/15/1612bb61506f44b6b8b6bebb6488702b1fe1432547e95dda57874303a1f5/greenlet-3.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e934591a7a4084fa10ee5ef50eb9d2ac8c4075d5c9cf91128116b5dca49d43b1", size = 1119572, upload-time = "2025-04-22T14:58:58.277Z" }, + { url = "https://files.pythonhosted.org/packages/cc/2f/002b99dacd1610e825876f5cbbe7f86740aa2a6b76816e5eca41c8457e85/greenlet-3.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:063bcf7f8ee28eb91e7f7a8148c65a43b73fbdc0064ab693e024b5a940070145", size = 1147442, upload-time = "2025-04-22T14:28:11.243Z" }, + { url = "https://files.pythonhosted.org/packages/c0/ba/82a2c3b9868644ee6011da742156247070f30e952f4d33f33857458450f2/greenlet-3.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7132e024ebeeeabbe661cf8878aac5d2e643975c4feae833142592ec2f03263d", size = 296207, upload-time = "2025-04-22T14:54:40.531Z" }, { url = "https://files.pythonhosted.org/packages/77/2a/581b3808afec55b2db838742527c40b4ce68b9b64feedff0fd0123f4b19a/greenlet-3.2.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:e1967882f0c42eaf42282a87579685c8673c51153b845fde1ee81be720ae27ac", size = 269119, upload-time = "2025-04-22T14:25:01.798Z" }, { url = "https://files.pythonhosted.org/packages/b0/f3/1c4e27fbdc84e13f05afc2baf605e704668ffa26e73a43eca93e1120813e/greenlet-3.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e77ae69032a95640a5fe8c857ec7bee569a0997e809570f4c92048691ce4b437", size = 637314, upload-time = "2025-04-22T14:53:46.214Z" }, { url = "https://files.pythonhosted.org/packages/fc/1a/9fc43cb0044f425f7252da9847893b6de4e3b20c0a748bce7ab3f063d5bc/greenlet-3.2.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3227c6ec1149d4520bc99edac3b9bc8358d0034825f3ca7572165cb502d8f29a", size = 651421, upload-time = "2025-04-22T14:55:00.852Z" }, @@ -550,6 +723,69 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/35/1407fb0b2f5b07b50cbaf97fce09ad87d3bfefbf64f7171a8651cd8d2f68/kombu-5.5.3-py3-none-any.whl", hash = "sha256:5b0dbceb4edee50aa464f59469d34b97864be09111338cfb224a10b6a163909b", size = 209921, upload-time = "2025-04-16T12:46:15.139Z" }, ] +[[package]] +name = "multidict" +version = "6.6.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/69/7f/0652e6ed47ab288e3756ea9c0df8b14950781184d4bd7883f4d87dd41245/multidict-6.6.4.tar.gz", hash = "sha256:d2d4e4787672911b48350df02ed3fa3fffdc2f2e8ca06dd6afdf34189b76a9dd", size = 101843, upload-time = "2025-08-11T12:08:48.217Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/f6/512ffd8fd8b37fb2680e5ac35d788f1d71bbaf37789d21a820bdc441e565/multidict-6.6.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0ffb87be160942d56d7b87b0fdf098e81ed565add09eaa1294268c7f3caac4c8", size = 76516, upload-time = "2025-08-11T12:06:53.393Z" }, + { url = "https://files.pythonhosted.org/packages/99/58/45c3e75deb8855c36bd66cc1658007589662ba584dbf423d01df478dd1c5/multidict-6.6.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d191de6cbab2aff5de6c5723101705fd044b3e4c7cfd587a1929b5028b9714b3", size = 45394, upload-time = "2025-08-11T12:06:54.555Z" }, + { url = "https://files.pythonhosted.org/packages/fd/ca/e8c4472a93a26e4507c0b8e1f0762c0d8a32de1328ef72fd704ef9cc5447/multidict-6.6.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:38a0956dd92d918ad5feff3db8fcb4a5eb7dba114da917e1a88475619781b57b", size = 43591, upload-time = "2025-08-11T12:06:55.672Z" }, + { url = "https://files.pythonhosted.org/packages/05/51/edf414f4df058574a7265034d04c935aa84a89e79ce90fcf4df211f47b16/multidict-6.6.4-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:6865f6d3b7900ae020b495d599fcf3765653bc927951c1abb959017f81ae8287", size = 237215, upload-time = "2025-08-11T12:06:57.213Z" }, + { url = "https://files.pythonhosted.org/packages/c8/45/8b3d6dbad8cf3252553cc41abea09ad527b33ce47a5e199072620b296902/multidict-6.6.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a2088c126b6f72db6c9212ad827d0ba088c01d951cee25e758c450da732c138", size = 258299, upload-time = "2025-08-11T12:06:58.946Z" }, + { url = "https://files.pythonhosted.org/packages/3c/e8/8ca2e9a9f5a435fc6db40438a55730a4bf4956b554e487fa1b9ae920f825/multidict-6.6.4-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0f37bed7319b848097085d7d48116f545985db988e2256b2e6f00563a3416ee6", size = 242357, upload-time = "2025-08-11T12:07:00.301Z" }, + { url = "https://files.pythonhosted.org/packages/0f/84/80c77c99df05a75c28490b2af8f7cba2a12621186e0a8b0865d8e745c104/multidict-6.6.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:01368e3c94032ba6ca0b78e7ccb099643466cf24f8dc8eefcfdc0571d56e58f9", size = 268369, upload-time = 
"2025-08-11T12:07:01.638Z" }, + { url = "https://files.pythonhosted.org/packages/0d/e9/920bfa46c27b05fb3e1ad85121fd49f441492dca2449c5bcfe42e4565d8a/multidict-6.6.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8fe323540c255db0bffee79ad7f048c909f2ab0edb87a597e1c17da6a54e493c", size = 269341, upload-time = "2025-08-11T12:07:02.943Z" }, + { url = "https://files.pythonhosted.org/packages/af/65/753a2d8b05daf496f4a9c367fe844e90a1b2cac78e2be2c844200d10cc4c/multidict-6.6.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8eb3025f17b0a4c3cd08cda49acf312a19ad6e8a4edd9dbd591e6506d999402", size = 256100, upload-time = "2025-08-11T12:07:04.564Z" }, + { url = "https://files.pythonhosted.org/packages/09/54/655be13ae324212bf0bc15d665a4e34844f34c206f78801be42f7a0a8aaa/multidict-6.6.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bbc14f0365534d35a06970d6a83478b249752e922d662dc24d489af1aa0d1be7", size = 253584, upload-time = "2025-08-11T12:07:05.914Z" }, + { url = "https://files.pythonhosted.org/packages/5c/74/ab2039ecc05264b5cec73eb018ce417af3ebb384ae9c0e9ed42cb33f8151/multidict-6.6.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:75aa52fba2d96bf972e85451b99d8e19cc37ce26fd016f6d4aa60da9ab2b005f", size = 251018, upload-time = "2025-08-11T12:07:08.301Z" }, + { url = "https://files.pythonhosted.org/packages/af/0a/ccbb244ac848e56c6427f2392741c06302bbfba49c0042f1eb3c5b606497/multidict-6.6.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4fefd4a815e362d4f011919d97d7b4a1e566f1dde83dc4ad8cfb5b41de1df68d", size = 251477, upload-time = "2025-08-11T12:07:10.248Z" }, + { url = "https://files.pythonhosted.org/packages/0e/b0/0ed49bba775b135937f52fe13922bc64a7eaf0a3ead84a36e8e4e446e096/multidict-6.6.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:db9801fe021f59a5b375ab778973127ca0ac52429a26e2fd86aa9508f4d26eb7", size = 263575, upload-time = "2025-08-11T12:07:11.928Z" }, + { url = "https://files.pythonhosted.org/packages/3e/d9/7fb85a85e14de2e44dfb6a24f03c41e2af8697a6df83daddb0e9b7569f73/multidict-6.6.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a650629970fa21ac1fb06ba25dabfc5b8a2054fcbf6ae97c758aa956b8dba802", size = 259649, upload-time = "2025-08-11T12:07:13.244Z" }, + { url = "https://files.pythonhosted.org/packages/03/9e/b3a459bcf9b6e74fa461a5222a10ff9b544cb1cd52fd482fb1b75ecda2a2/multidict-6.6.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:452ff5da78d4720d7516a3a2abd804957532dd69296cb77319c193e3ffb87e24", size = 251505, upload-time = "2025-08-11T12:07:14.57Z" }, + { url = "https://files.pythonhosted.org/packages/86/a2/8022f78f041dfe6d71e364001a5cf987c30edfc83c8a5fb7a3f0974cff39/multidict-6.6.4-cp312-cp312-win32.whl", hash = "sha256:8c2fcb12136530ed19572bbba61b407f655e3953ba669b96a35036a11a485793", size = 41888, upload-time = "2025-08-11T12:07:15.904Z" }, + { url = "https://files.pythonhosted.org/packages/c7/eb/d88b1780d43a56db2cba24289fa744a9d216c1a8546a0dc3956563fd53ea/multidict-6.6.4-cp312-cp312-win_amd64.whl", hash = "sha256:047d9425860a8c9544fed1b9584f0c8bcd31bcde9568b047c5e567a1025ecd6e", size = 46072, upload-time = "2025-08-11T12:07:17.045Z" }, + { url = "https://files.pythonhosted.org/packages/9f/16/b929320bf5750e2d9d4931835a4c638a19d2494a5b519caaaa7492ebe105/multidict-6.6.4-cp312-cp312-win_arm64.whl", hash = "sha256:14754eb72feaa1e8ae528468f24250dd997b8e2188c3d2f593f9eba259e4b364", size = 43222, upload-time = "2025-08-11T12:07:18.328Z" }, + { url = 
"https://files.pythonhosted.org/packages/3a/5d/e1db626f64f60008320aab00fbe4f23fc3300d75892a3381275b3d284580/multidict-6.6.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f46a6e8597f9bd71b31cc708195d42b634c8527fecbcf93febf1052cacc1f16e", size = 75848, upload-time = "2025-08-11T12:07:19.912Z" }, + { url = "https://files.pythonhosted.org/packages/4c/aa/8b6f548d839b6c13887253af4e29c939af22a18591bfb5d0ee6f1931dae8/multidict-6.6.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:22e38b2bc176c5eb9c0a0e379f9d188ae4cd8b28c0f53b52bce7ab0a9e534657", size = 45060, upload-time = "2025-08-11T12:07:21.163Z" }, + { url = "https://files.pythonhosted.org/packages/eb/c6/f5e97e5d99a729bc2aa58eb3ebfa9f1e56a9b517cc38c60537c81834a73f/multidict-6.6.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5df8afd26f162da59e218ac0eefaa01b01b2e6cd606cffa46608f699539246da", size = 43269, upload-time = "2025-08-11T12:07:22.392Z" }, + { url = "https://files.pythonhosted.org/packages/dc/31/d54eb0c62516776f36fe67f84a732f97e0b0e12f98d5685bebcc6d396910/multidict-6.6.4-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:49517449b58d043023720aa58e62b2f74ce9b28f740a0b5d33971149553d72aa", size = 237158, upload-time = "2025-08-11T12:07:23.636Z" }, + { url = "https://files.pythonhosted.org/packages/c4/1c/8a10c1c25b23156e63b12165a929d8eb49a6ed769fdbefb06e6f07c1e50d/multidict-6.6.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ae9408439537c5afdca05edd128a63f56a62680f4b3c234301055d7a2000220f", size = 257076, upload-time = "2025-08-11T12:07:25.049Z" }, + { url = "https://files.pythonhosted.org/packages/ad/86/90e20b5771d6805a119e483fd3d1e8393e745a11511aebca41f0da38c3e2/multidict-6.6.4-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:87a32d20759dc52a9e850fe1061b6e41ab28e2998d44168a8a341b99ded1dba0", size = 240694, upload-time = "2025-08-11T12:07:26.458Z" }, + { url = "https://files.pythonhosted.org/packages/e7/49/484d3e6b535bc0555b52a0a26ba86e4d8d03fd5587d4936dc59ba7583221/multidict-6.6.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:52e3c8d43cdfff587ceedce9deb25e6ae77daba560b626e97a56ddcad3756879", size = 266350, upload-time = "2025-08-11T12:07:27.94Z" }, + { url = "https://files.pythonhosted.org/packages/bf/b4/aa4c5c379b11895083d50021e229e90c408d7d875471cb3abf721e4670d6/multidict-6.6.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ad8850921d3a8d8ff6fbef790e773cecfc260bbfa0566998980d3fa8f520bc4a", size = 267250, upload-time = "2025-08-11T12:07:29.303Z" }, + { url = "https://files.pythonhosted.org/packages/80/e5/5e22c5bf96a64bdd43518b1834c6d95a4922cc2066b7d8e467dae9b6cee6/multidict-6.6.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:497a2954adc25c08daff36f795077f63ad33e13f19bfff7736e72c785391534f", size = 254900, upload-time = "2025-08-11T12:07:30.764Z" }, + { url = "https://files.pythonhosted.org/packages/17/38/58b27fed927c07035abc02befacab42491e7388ca105e087e6e0215ead64/multidict-6.6.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:024ce601f92d780ca1617ad4be5ac15b501cc2414970ffa2bb2bbc2bd5a68fa5", size = 252355, upload-time = "2025-08-11T12:07:32.205Z" }, + { url = 
"https://files.pythonhosted.org/packages/d0/a1/dad75d23a90c29c02b5d6f3d7c10ab36c3197613be5d07ec49c7791e186c/multidict-6.6.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:a693fc5ed9bdd1c9e898013e0da4dcc640de7963a371c0bd458e50e046bf6438", size = 250061, upload-time = "2025-08-11T12:07:33.623Z" }, + { url = "https://files.pythonhosted.org/packages/b8/1a/ac2216b61c7f116edab6dc3378cca6c70dc019c9a457ff0d754067c58b20/multidict-6.6.4-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:190766dac95aab54cae5b152a56520fd99298f32a1266d66d27fdd1b5ac00f4e", size = 249675, upload-time = "2025-08-11T12:07:34.958Z" }, + { url = "https://files.pythonhosted.org/packages/d4/79/1916af833b800d13883e452e8e0977c065c4ee3ab7a26941fbfdebc11895/multidict-6.6.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:34d8f2a5ffdceab9dcd97c7a016deb2308531d5f0fced2bb0c9e1df45b3363d7", size = 261247, upload-time = "2025-08-11T12:07:36.588Z" }, + { url = "https://files.pythonhosted.org/packages/c5/65/d1f84fe08ac44a5fc7391cbc20a7cedc433ea616b266284413fd86062f8c/multidict-6.6.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:59e8d40ab1f5a8597abcef00d04845155a5693b5da00d2c93dbe88f2050f2812", size = 257960, upload-time = "2025-08-11T12:07:39.735Z" }, + { url = "https://files.pythonhosted.org/packages/13/b5/29ec78057d377b195ac2c5248c773703a6b602e132a763e20ec0457e7440/multidict-6.6.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:467fe64138cfac771f0e949b938c2e1ada2b5af22f39692aa9258715e9ea613a", size = 250078, upload-time = "2025-08-11T12:07:41.525Z" }, + { url = "https://files.pythonhosted.org/packages/c4/0e/7e79d38f70a872cae32e29b0d77024bef7834b0afb406ddae6558d9e2414/multidict-6.6.4-cp313-cp313-win32.whl", hash = "sha256:14616a30fe6d0a48d0a48d1a633ab3b8bec4cf293aac65f32ed116f620adfd69", size = 41708, upload-time = "2025-08-11T12:07:43.405Z" }, + { url = "https://files.pythonhosted.org/packages/9d/34/746696dffff742e97cd6a23da953e55d0ea51fa601fa2ff387b3edcfaa2c/multidict-6.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:40cd05eaeb39e2bc8939451f033e57feaa2ac99e07dbca8afe2be450a4a3b6cf", size = 45912, upload-time = "2025-08-11T12:07:45.082Z" }, + { url = "https://files.pythonhosted.org/packages/c7/87/3bac136181e271e29170d8d71929cdeddeb77f3e8b6a0c08da3a8e9da114/multidict-6.6.4-cp313-cp313-win_arm64.whl", hash = "sha256:f6eb37d511bfae9e13e82cb4d1af36b91150466f24d9b2b8a9785816deb16605", size = 43076, upload-time = "2025-08-11T12:07:46.746Z" }, + { url = "https://files.pythonhosted.org/packages/64/94/0a8e63e36c049b571c9ae41ee301ada29c3fee9643d9c2548d7d558a1d99/multidict-6.6.4-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:6c84378acd4f37d1b507dfa0d459b449e2321b3ba5f2338f9b085cf7a7ba95eb", size = 82812, upload-time = "2025-08-11T12:07:48.402Z" }, + { url = "https://files.pythonhosted.org/packages/25/1a/be8e369dfcd260d2070a67e65dd3990dd635cbd735b98da31e00ea84cd4e/multidict-6.6.4-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0e0558693063c75f3d952abf645c78f3c5dfdd825a41d8c4d8156fc0b0da6e7e", size = 48313, upload-time = "2025-08-11T12:07:49.679Z" }, + { url = "https://files.pythonhosted.org/packages/26/5a/dd4ade298674b2f9a7b06a32c94ffbc0497354df8285f27317c66433ce3b/multidict-6.6.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3f8e2384cb83ebd23fd07e9eada8ba64afc4c759cd94817433ab8c81ee4b403f", size = 46777, upload-time = "2025-08-11T12:07:51.318Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/db/98aa28bc7e071bfba611ac2ae803c24e96dd3a452b4118c587d3d872c64c/multidict-6.6.4-cp313-cp313t-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:f996b87b420995a9174b2a7c1a8daf7db4750be6848b03eb5e639674f7963773", size = 229321, upload-time = "2025-08-11T12:07:52.965Z" }, + { url = "https://files.pythonhosted.org/packages/c7/bc/01ddda2a73dd9d167bd85d0e8ef4293836a8f82b786c63fb1a429bc3e678/multidict-6.6.4-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cc356250cffd6e78416cf5b40dc6a74f1edf3be8e834cf8862d9ed5265cf9b0e", size = 249954, upload-time = "2025-08-11T12:07:54.423Z" }, + { url = "https://files.pythonhosted.org/packages/06/78/6b7c0f020f9aa0acf66d0ab4eb9f08375bac9a50ff5e3edb1c4ccd59eafc/multidict-6.6.4-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:dadf95aa862714ea468a49ad1e09fe00fcc9ec67d122f6596a8d40caf6cec7d0", size = 228612, upload-time = "2025-08-11T12:07:55.914Z" }, + { url = "https://files.pythonhosted.org/packages/00/44/3faa416f89b2d5d76e9d447296a81521e1c832ad6e40b92f990697b43192/multidict-6.6.4-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7dd57515bebffd8ebd714d101d4c434063322e4fe24042e90ced41f18b6d3395", size = 257528, upload-time = "2025-08-11T12:07:57.371Z" }, + { url = "https://files.pythonhosted.org/packages/05/5f/77c03b89af0fcb16f018f668207768191fb9dcfb5e3361a5e706a11db2c9/multidict-6.6.4-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:967af5f238ebc2eb1da4e77af5492219fbd9b4b812347da39a7b5f5c72c0fa45", size = 256329, upload-time = "2025-08-11T12:07:58.844Z" }, + { url = "https://files.pythonhosted.org/packages/cf/e9/ed750a2a9afb4f8dc6f13dc5b67b514832101b95714f1211cd42e0aafc26/multidict-6.6.4-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a4c6875c37aae9794308ec43e3530e4aa0d36579ce38d89979bbf89582002bb", size = 247928, upload-time = "2025-08-11T12:08:01.037Z" }, + { url = "https://files.pythonhosted.org/packages/1f/b5/e0571bc13cda277db7e6e8a532791d4403dacc9850006cb66d2556e649c0/multidict-6.6.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:7f683a551e92bdb7fac545b9c6f9fa2aebdeefa61d607510b3533286fcab67f5", size = 245228, upload-time = "2025-08-11T12:08:02.96Z" }, + { url = "https://files.pythonhosted.org/packages/f3/a3/69a84b0eccb9824491f06368f5b86e72e4af54c3067c37c39099b6687109/multidict-6.6.4-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:3ba5aaf600edaf2a868a391779f7a85d93bed147854925f34edd24cc70a3e141", size = 235869, upload-time = "2025-08-11T12:08:04.746Z" }, + { url = "https://files.pythonhosted.org/packages/a9/9d/28802e8f9121a6a0804fa009debf4e753d0a59969ea9f70be5f5fdfcb18f/multidict-6.6.4-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:580b643b7fd2c295d83cad90d78419081f53fd532d1f1eb67ceb7060f61cff0d", size = 243446, upload-time = "2025-08-11T12:08:06.332Z" }, + { url = "https://files.pythonhosted.org/packages/38/ea/6c98add069b4878c1d66428a5f5149ddb6d32b1f9836a826ac764b9940be/multidict-6.6.4-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:37b7187197da6af3ee0b044dbc9625afd0c885f2800815b228a0e70f9a7f473d", size = 252299, upload-time = "2025-08-11T12:08:07.931Z" }, + { url = 
"https://files.pythonhosted.org/packages/3a/09/8fe02d204473e14c0af3affd50af9078839dfca1742f025cca765435d6b4/multidict-6.6.4-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:e1b93790ed0bc26feb72e2f08299691ceb6da5e9e14a0d13cc74f1869af327a0", size = 246926, upload-time = "2025-08-11T12:08:09.467Z" }, + { url = "https://files.pythonhosted.org/packages/37/3d/7b1e10d774a6df5175ecd3c92bff069e77bed9ec2a927fdd4ff5fe182f67/multidict-6.6.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:a506a77ddee1efcca81ecbeae27ade3e09cdf21a8ae854d766c2bb4f14053f92", size = 243383, upload-time = "2025-08-11T12:08:10.981Z" }, + { url = "https://files.pythonhosted.org/packages/50/b0/a6fae46071b645ae98786ab738447de1ef53742eaad949f27e960864bb49/multidict-6.6.4-cp313-cp313t-win32.whl", hash = "sha256:f93b2b2279883d1d0a9e1bd01f312d6fc315c5e4c1f09e112e4736e2f650bc4e", size = 47775, upload-time = "2025-08-11T12:08:12.439Z" }, + { url = "https://files.pythonhosted.org/packages/b2/0a/2436550b1520091af0600dff547913cb2d66fbac27a8c33bc1b1bccd8d98/multidict-6.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:6d46a180acdf6e87cc41dc15d8f5c2986e1e8739dc25dbb7dac826731ef381a4", size = 53100, upload-time = "2025-08-11T12:08:13.823Z" }, + { url = "https://files.pythonhosted.org/packages/97/ea/43ac51faff934086db9c072a94d327d71b7d8b40cd5dcb47311330929ef0/multidict-6.6.4-cp313-cp313t-win_arm64.whl", hash = "sha256:756989334015e3335d087a27331659820d53ba432befdef6a718398b0a8493ad", size = 45501, upload-time = "2025-08-11T12:08:15.173Z" }, + { url = "https://files.pythonhosted.org/packages/fd/69/b547032297c7e63ba2af494edba695d781af8a0c6e89e4d06cf848b21d80/multidict-6.6.4-py3-none-any.whl", hash = "sha256:27d8f8e125c07cb954e54d75d04905a9bba8a439c1d84aca94949d4d03d8601c", size = 12313, upload-time = "2025-08-11T12:08:46.891Z" }, +] + [[package]] name = "nodeenv" version = "1.9.1" @@ -605,12 +841,81 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl", hash = "sha256:52742911fde84e2d423e2f9a4cf1de7d7ac4e51958f648d9540e0fb8db077b07", size = 387810, upload-time = "2025-04-15T09:18:44.753Z" }, ] +[[package]] +name = "propcache" +version = "0.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a6/16/43264e4a779dd8588c21a70f0709665ee8f611211bdd2c87d952cfa7c776/propcache-0.3.2.tar.gz", hash = "sha256:20d7d62e4e7ef05f221e0db2856b979540686342e7dd9973b815599c7057e168", size = 44139, upload-time = "2025-06-09T22:56:06.081Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/42/9ca01b0a6f48e81615dca4765a8f1dd2c057e0540f6116a27dc5ee01dfb6/propcache-0.3.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:8de106b6c84506b31c27168582cd3cb3000a6412c16df14a8628e5871ff83c10", size = 73674, upload-time = "2025-06-09T22:54:30.551Z" }, + { url = "https://files.pythonhosted.org/packages/af/6e/21293133beb550f9c901bbece755d582bfaf2176bee4774000bd4dd41884/propcache-0.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:28710b0d3975117239c76600ea351934ac7b5ff56e60953474342608dbbb6154", size = 43570, upload-time = "2025-06-09T22:54:32.296Z" }, + { url = "https://files.pythonhosted.org/packages/0c/c8/0393a0a3a2b8760eb3bde3c147f62b20044f0ddac81e9d6ed7318ec0d852/propcache-0.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce26862344bdf836650ed2487c3d724b00fbfec4233a1013f597b78c1cb73615", size = 43094, upload-time = "2025-06-09T22:54:33.929Z" }, + { url = 
"https://files.pythonhosted.org/packages/37/2c/489afe311a690399d04a3e03b069225670c1d489eb7b044a566511c1c498/propcache-0.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bca54bd347a253af2cf4544bbec232ab982f4868de0dd684246b67a51bc6b1db", size = 226958, upload-time = "2025-06-09T22:54:35.186Z" }, + { url = "https://files.pythonhosted.org/packages/9d/ca/63b520d2f3d418c968bf596839ae26cf7f87bead026b6192d4da6a08c467/propcache-0.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:55780d5e9a2ddc59711d727226bb1ba83a22dd32f64ee15594b9392b1f544eb1", size = 234894, upload-time = "2025-06-09T22:54:36.708Z" }, + { url = "https://files.pythonhosted.org/packages/11/60/1d0ed6fff455a028d678df30cc28dcee7af77fa2b0e6962ce1df95c9a2a9/propcache-0.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:035e631be25d6975ed87ab23153db6a73426a48db688070d925aa27e996fe93c", size = 233672, upload-time = "2025-06-09T22:54:38.062Z" }, + { url = "https://files.pythonhosted.org/packages/37/7c/54fd5301ef38505ab235d98827207176a5c9b2aa61939b10a460ca53e123/propcache-0.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee6f22b6eaa39297c751d0e80c0d3a454f112f5c6481214fcf4c092074cecd67", size = 224395, upload-time = "2025-06-09T22:54:39.634Z" }, + { url = "https://files.pythonhosted.org/packages/ee/1a/89a40e0846f5de05fdc6779883bf46ba980e6df4d2ff8fb02643de126592/propcache-0.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ca3aee1aa955438c4dba34fc20a9f390e4c79967257d830f137bd5a8a32ed3b", size = 212510, upload-time = "2025-06-09T22:54:41.565Z" }, + { url = "https://files.pythonhosted.org/packages/5e/33/ca98368586c9566a6b8d5ef66e30484f8da84c0aac3f2d9aec6d31a11bd5/propcache-0.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7a4f30862869fa2b68380d677cc1c5fcf1e0f2b9ea0cf665812895c75d0ca3b8", size = 222949, upload-time = "2025-06-09T22:54:43.038Z" }, + { url = "https://files.pythonhosted.org/packages/ba/11/ace870d0aafe443b33b2f0b7efdb872b7c3abd505bfb4890716ad7865e9d/propcache-0.3.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b77ec3c257d7816d9f3700013639db7491a434644c906a2578a11daf13176251", size = 217258, upload-time = "2025-06-09T22:54:44.376Z" }, + { url = "https://files.pythonhosted.org/packages/5b/d2/86fd6f7adffcfc74b42c10a6b7db721d1d9ca1055c45d39a1a8f2a740a21/propcache-0.3.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:cab90ac9d3f14b2d5050928483d3d3b8fb6b4018893fc75710e6aa361ecb2474", size = 213036, upload-time = "2025-06-09T22:54:46.243Z" }, + { url = "https://files.pythonhosted.org/packages/07/94/2d7d1e328f45ff34a0a284cf5a2847013701e24c2a53117e7c280a4316b3/propcache-0.3.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:0b504d29f3c47cf6b9e936c1852246c83d450e8e063d50562115a6be6d3a2535", size = 227684, upload-time = "2025-06-09T22:54:47.63Z" }, + { url = "https://files.pythonhosted.org/packages/b7/05/37ae63a0087677e90b1d14710e532ff104d44bc1efa3b3970fff99b891dc/propcache-0.3.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:ce2ac2675a6aa41ddb2a0c9cbff53780a617ac3d43e620f8fd77ba1c84dcfc06", size = 234562, upload-time = "2025-06-09T22:54:48.982Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7c/3f539fcae630408d0bd8bf3208b9a647ccad10976eda62402a80adf8fc34/propcache-0.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:62b4239611205294cc433845b914131b2a1f03500ff3c1ed093ed216b82621e1", size = 222142, upload-time = 
"2025-06-09T22:54:50.424Z" }, + { url = "https://files.pythonhosted.org/packages/7c/d2/34b9eac8c35f79f8a962546b3e97e9d4b990c420ee66ac8255d5d9611648/propcache-0.3.2-cp312-cp312-win32.whl", hash = "sha256:df4a81b9b53449ebc90cc4deefb052c1dd934ba85012aa912c7ea7b7e38b60c1", size = 37711, upload-time = "2025-06-09T22:54:52.072Z" }, + { url = "https://files.pythonhosted.org/packages/19/61/d582be5d226cf79071681d1b46b848d6cb03d7b70af7063e33a2787eaa03/propcache-0.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:7046e79b989d7fe457bb755844019e10f693752d169076138abf17f31380800c", size = 41479, upload-time = "2025-06-09T22:54:53.234Z" }, + { url = "https://files.pythonhosted.org/packages/dc/d1/8c747fafa558c603c4ca19d8e20b288aa0c7cda74e9402f50f31eb65267e/propcache-0.3.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ca592ed634a73ca002967458187109265e980422116c0a107cf93d81f95af945", size = 71286, upload-time = "2025-06-09T22:54:54.369Z" }, + { url = "https://files.pythonhosted.org/packages/61/99/d606cb7986b60d89c36de8a85d58764323b3a5ff07770a99d8e993b3fa73/propcache-0.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9ecb0aad4020e275652ba3975740f241bd12a61f1a784df044cf7477a02bc252", size = 42425, upload-time = "2025-06-09T22:54:55.642Z" }, + { url = "https://files.pythonhosted.org/packages/8c/96/ef98f91bbb42b79e9bb82bdd348b255eb9d65f14dbbe3b1594644c4073f7/propcache-0.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7f08f1cc28bd2eade7a8a3d2954ccc673bb02062e3e7da09bc75d843386b342f", size = 41846, upload-time = "2025-06-09T22:54:57.246Z" }, + { url = "https://files.pythonhosted.org/packages/5b/ad/3f0f9a705fb630d175146cd7b1d2bf5555c9beaed54e94132b21aac098a6/propcache-0.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1a342c834734edb4be5ecb1e9fb48cb64b1e2320fccbd8c54bf8da8f2a84c33", size = 208871, upload-time = "2025-06-09T22:54:58.975Z" }, + { url = "https://files.pythonhosted.org/packages/3a/38/2085cda93d2c8b6ec3e92af2c89489a36a5886b712a34ab25de9fbca7992/propcache-0.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8a544caaae1ac73f1fecfae70ded3e93728831affebd017d53449e3ac052ac1e", size = 215720, upload-time = "2025-06-09T22:55:00.471Z" }, + { url = "https://files.pythonhosted.org/packages/61/c1/d72ea2dc83ac7f2c8e182786ab0fc2c7bd123a1ff9b7975bee671866fe5f/propcache-0.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:310d11aa44635298397db47a3ebce7db99a4cc4b9bbdfcf6c98a60c8d5261cf1", size = 215203, upload-time = "2025-06-09T22:55:01.834Z" }, + { url = "https://files.pythonhosted.org/packages/af/81/b324c44ae60c56ef12007105f1460d5c304b0626ab0cc6b07c8f2a9aa0b8/propcache-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c1396592321ac83157ac03a2023aa6cc4a3cc3cfdecb71090054c09e5a7cce3", size = 206365, upload-time = "2025-06-09T22:55:03.199Z" }, + { url = "https://files.pythonhosted.org/packages/09/73/88549128bb89e66d2aff242488f62869014ae092db63ccea53c1cc75a81d/propcache-0.3.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8cabf5b5902272565e78197edb682017d21cf3b550ba0460ee473753f28d23c1", size = 196016, upload-time = "2025-06-09T22:55:04.518Z" }, + { url = "https://files.pythonhosted.org/packages/b9/3f/3bdd14e737d145114a5eb83cb172903afba7242f67c5877f9909a20d948d/propcache-0.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0a2f2235ac46a7aa25bdeb03a9e7060f6ecbd213b1f9101c43b3090ffb971ef6", size = 205596, 
upload-time = "2025-06-09T22:55:05.942Z" }, + { url = "https://files.pythonhosted.org/packages/0f/ca/2f4aa819c357d3107c3763d7ef42c03980f9ed5c48c82e01e25945d437c1/propcache-0.3.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:92b69e12e34869a6970fd2f3da91669899994b47c98f5d430b781c26f1d9f387", size = 200977, upload-time = "2025-06-09T22:55:07.792Z" }, + { url = "https://files.pythonhosted.org/packages/cd/4a/e65276c7477533c59085251ae88505caf6831c0e85ff8b2e31ebcbb949b1/propcache-0.3.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:54e02207c79968ebbdffc169591009f4474dde3b4679e16634d34c9363ff56b4", size = 197220, upload-time = "2025-06-09T22:55:09.173Z" }, + { url = "https://files.pythonhosted.org/packages/7c/54/fc7152e517cf5578278b242396ce4d4b36795423988ef39bb8cd5bf274c8/propcache-0.3.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4adfb44cb588001f68c5466579d3f1157ca07f7504fc91ec87862e2b8e556b88", size = 210642, upload-time = "2025-06-09T22:55:10.62Z" }, + { url = "https://files.pythonhosted.org/packages/b9/80/abeb4a896d2767bf5f1ea7b92eb7be6a5330645bd7fb844049c0e4045d9d/propcache-0.3.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:fd3e6019dc1261cd0291ee8919dd91fbab7b169bb76aeef6c716833a3f65d206", size = 212789, upload-time = "2025-06-09T22:55:12.029Z" }, + { url = "https://files.pythonhosted.org/packages/b3/db/ea12a49aa7b2b6d68a5da8293dcf50068d48d088100ac016ad92a6a780e6/propcache-0.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4c181cad81158d71c41a2bce88edce078458e2dd5ffee7eddd6b05da85079f43", size = 205880, upload-time = "2025-06-09T22:55:13.45Z" }, + { url = "https://files.pythonhosted.org/packages/d1/e5/9076a0bbbfb65d1198007059c65639dfd56266cf8e477a9707e4b1999ff4/propcache-0.3.2-cp313-cp313-win32.whl", hash = "sha256:8a08154613f2249519e549de2330cf8e2071c2887309a7b07fb56098f5170a02", size = 37220, upload-time = "2025-06-09T22:55:15.284Z" }, + { url = "https://files.pythonhosted.org/packages/d3/f5/b369e026b09a26cd77aa88d8fffd69141d2ae00a2abaaf5380d2603f4b7f/propcache-0.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e41671f1594fc4ab0a6dec1351864713cb3a279910ae8b58f884a88a0a632c05", size = 40678, upload-time = "2025-06-09T22:55:16.445Z" }, + { url = "https://files.pythonhosted.org/packages/a4/3a/6ece377b55544941a08d03581c7bc400a3c8cd3c2865900a68d5de79e21f/propcache-0.3.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:9a3cf035bbaf035f109987d9d55dc90e4b0e36e04bbbb95af3055ef17194057b", size = 76560, upload-time = "2025-06-09T22:55:17.598Z" }, + { url = "https://files.pythonhosted.org/packages/0c/da/64a2bb16418740fa634b0e9c3d29edff1db07f56d3546ca2d86ddf0305e1/propcache-0.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:156c03d07dc1323d8dacaa221fbe028c5c70d16709cdd63502778e6c3ccca1b0", size = 44676, upload-time = "2025-06-09T22:55:18.922Z" }, + { url = "https://files.pythonhosted.org/packages/36/7b/f025e06ea51cb72c52fb87e9b395cced02786610b60a3ed51da8af017170/propcache-0.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74413c0ba02ba86f55cf60d18daab219f7e531620c15f1e23d95563f505efe7e", size = 44701, upload-time = "2025-06-09T22:55:20.106Z" }, + { url = "https://files.pythonhosted.org/packages/a4/00/faa1b1b7c3b74fc277f8642f32a4c72ba1d7b2de36d7cdfb676db7f4303e/propcache-0.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f066b437bb3fa39c58ff97ab2ca351db465157d68ed0440abecb21715eb24b28", size = 276934, upload-time = "2025-06-09T22:55:21.5Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/ab/935beb6f1756e0476a4d5938ff44bf0d13a055fed880caf93859b4f1baf4/propcache-0.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1304b085c83067914721e7e9d9917d41ad87696bf70f0bc7dee450e9c71ad0a", size = 278316, upload-time = "2025-06-09T22:55:22.918Z" }, + { url = "https://files.pythonhosted.org/packages/f8/9d/994a5c1ce4389610838d1caec74bdf0e98b306c70314d46dbe4fcf21a3e2/propcache-0.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ab50cef01b372763a13333b4e54021bdcb291fc9a8e2ccb9c2df98be51bcde6c", size = 282619, upload-time = "2025-06-09T22:55:24.651Z" }, + { url = "https://files.pythonhosted.org/packages/2b/00/a10afce3d1ed0287cef2e09506d3be9822513f2c1e96457ee369adb9a6cd/propcache-0.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fad3b2a085ec259ad2c2842666b2a0a49dea8463579c606426128925af1ed725", size = 265896, upload-time = "2025-06-09T22:55:26.049Z" }, + { url = "https://files.pythonhosted.org/packages/2e/a8/2aa6716ffa566ca57c749edb909ad27884680887d68517e4be41b02299f3/propcache-0.3.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:261fa020c1c14deafd54c76b014956e2f86991af198c51139faf41c4d5e83892", size = 252111, upload-time = "2025-06-09T22:55:27.381Z" }, + { url = "https://files.pythonhosted.org/packages/36/4f/345ca9183b85ac29c8694b0941f7484bf419c7f0fea2d1e386b4f7893eed/propcache-0.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:46d7f8aa79c927e5f987ee3a80205c987717d3659f035c85cf0c3680526bdb44", size = 268334, upload-time = "2025-06-09T22:55:28.747Z" }, + { url = "https://files.pythonhosted.org/packages/3e/ca/fcd54f78b59e3f97b3b9715501e3147f5340167733d27db423aa321e7148/propcache-0.3.2-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:6d8f3f0eebf73e3c0ff0e7853f68be638b4043c65a70517bb575eff54edd8dbe", size = 255026, upload-time = "2025-06-09T22:55:30.184Z" }, + { url = "https://files.pythonhosted.org/packages/8b/95/8e6a6bbbd78ac89c30c225210a5c687790e532ba4088afb8c0445b77ef37/propcache-0.3.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:03c89c1b14a5452cf15403e291c0ccd7751d5b9736ecb2c5bab977ad6c5bcd81", size = 250724, upload-time = "2025-06-09T22:55:31.646Z" }, + { url = "https://files.pythonhosted.org/packages/ee/b0/0dd03616142baba28e8b2d14ce5df6631b4673850a3d4f9c0f9dd714a404/propcache-0.3.2-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:0cc17efde71e12bbaad086d679ce575268d70bc123a5a71ea7ad76f70ba30bba", size = 268868, upload-time = "2025-06-09T22:55:33.209Z" }, + { url = "https://files.pythonhosted.org/packages/c5/98/2c12407a7e4fbacd94ddd32f3b1e3d5231e77c30ef7162b12a60e2dd5ce3/propcache-0.3.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:acdf05d00696bc0447e278bb53cb04ca72354e562cf88ea6f9107df8e7fd9770", size = 271322, upload-time = "2025-06-09T22:55:35.065Z" }, + { url = "https://files.pythonhosted.org/packages/35/91/9cb56efbb428b006bb85db28591e40b7736847b8331d43fe335acf95f6c8/propcache-0.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4445542398bd0b5d32df908031cb1b30d43ac848e20470a878b770ec2dcc6330", size = 265778, upload-time = "2025-06-09T22:55:36.45Z" }, + { url = "https://files.pythonhosted.org/packages/9a/4c/b0fe775a2bdd01e176b14b574be679d84fc83958335790f7c9a686c1f468/propcache-0.3.2-cp313-cp313t-win32.whl", hash = "sha256:f86e5d7cd03afb3a1db8e9f9f6eff15794e79e791350ac48a8c924e6f439f394", size = 41175, upload-time = "2025-06-09T22:55:38.436Z" 
}, + { url = "https://files.pythonhosted.org/packages/a4/ff/47f08595e3d9b5e149c150f88d9714574f1a7cbd89fe2817158a952674bf/propcache-0.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:9704bedf6e7cbe3c65eca4379a9b53ee6a83749f047808cbb5044d40d7d72198", size = 44857, upload-time = "2025-06-09T22:55:39.687Z" }, + { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, +] + [[package]] name = "psycopg2-binary" version = "2.9.10" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/cb/0e/bdc8274dc0585090b4e3432267d7be4dfbfd8971c0fa59167c711105a6bf/psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2", size = 385764, upload-time = "2024-10-16T11:24:58.126Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/49/7d/465cc9795cf76f6d329efdafca74693714556ea3891813701ac1fee87545/psycopg2_binary-2.9.10-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:880845dfe1f85d9d5f7c412efea7a08946a46894537e4e5d091732eb1d34d9a0", size = 3044771, upload-time = "2024-10-16T11:20:35.234Z" }, + { url = "https://files.pythonhosted.org/packages/8b/31/6d225b7b641a1a2148e3ed65e1aa74fc86ba3fee850545e27be9e1de893d/psycopg2_binary-2.9.10-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9440fa522a79356aaa482aa4ba500b65f28e5d0e63b801abf6aa152a29bd842a", size = 3275336, upload-time = "2024-10-16T11:20:38.742Z" }, + { url = "https://files.pythonhosted.org/packages/30/b7/a68c2b4bff1cbb1728e3ec864b2d92327c77ad52edcd27922535a8366f68/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3923c1d9870c49a2d44f795df0c889a22380d36ef92440ff618ec315757e539", size = 2851637, upload-time = "2024-10-16T11:20:42.145Z" }, + { url = "https://files.pythonhosted.org/packages/0b/b1/cfedc0e0e6f9ad61f8657fd173b2f831ce261c02a08c0b09c652b127d813/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b2c956c028ea5de47ff3a8d6b3cc3330ab45cf0b7c3da35a2d6ff8420896526", size = 3082097, upload-time = "2024-10-16T11:20:46.185Z" }, + { url = "https://files.pythonhosted.org/packages/18/ed/0a8e4153c9b769f59c02fb5e7914f20f0b2483a19dae7bf2db54b743d0d0/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f758ed67cab30b9a8d2833609513ce4d3bd027641673d4ebc9c067e4d208eec1", size = 3264776, upload-time = "2024-10-16T11:20:50.879Z" }, + { url = "https://files.pythonhosted.org/packages/10/db/d09da68c6a0cdab41566b74e0a6068a425f077169bed0946559b7348ebe9/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cd9b4f2cfab88ed4a9106192de509464b75a906462fb846b936eabe45c2063e", size = 3020968, upload-time = "2024-10-16T11:20:56.819Z" }, + { url = "https://files.pythonhosted.org/packages/94/28/4d6f8c255f0dfffb410db2b3f9ac5218d959a66c715c34cac31081e19b95/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dc08420625b5a20b53551c50deae6e231e6371194fa0651dbe0fb206452ae1f", size = 2872334, upload-time = "2024-10-16T11:21:02.411Z" }, + { url = "https://files.pythonhosted.org/packages/05/f7/20d7bf796593c4fea95e12119d6cc384ff1f6141a24fbb7df5a668d29d29/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_i686.whl", hash = 
"sha256:d7cd730dfa7c36dbe8724426bf5612798734bff2d3c3857f36f2733f5bfc7c00", size = 2822722, upload-time = "2024-10-16T11:21:09.01Z" }, + { url = "https://files.pythonhosted.org/packages/4d/e4/0c407ae919ef626dbdb32835a03b6737013c3cc7240169843965cada2bdf/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:155e69561d54d02b3c3209545fb08938e27889ff5a10c19de8d23eb5a41be8a5", size = 2920132, upload-time = "2024-10-16T11:21:16.339Z" }, + { url = "https://files.pythonhosted.org/packages/2d/70/aa69c9f69cf09a01da224909ff6ce8b68faeef476f00f7ec377e8f03be70/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3cc28a6fd5a4a26224007712e79b81dbaee2ffb90ff406256158ec4d7b52b47", size = 2959312, upload-time = "2024-10-16T11:21:25.584Z" }, + { url = "https://files.pythonhosted.org/packages/d3/bd/213e59854fafe87ba47814bf413ace0dcee33a89c8c8c814faca6bc7cf3c/psycopg2_binary-2.9.10-cp312-cp312-win32.whl", hash = "sha256:ec8a77f521a17506a24a5f626cb2aee7850f9b69a0afe704586f63a464f3cd64", size = 1025191, upload-time = "2024-10-16T11:21:29.912Z" }, + { url = "https://files.pythonhosted.org/packages/92/29/06261ea000e2dc1e22907dbbc483a1093665509ea586b29b8986a0e56733/psycopg2_binary-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:18c5ee682b9c6dd3696dad6e54cc7ff3a1a9020df6a5c0f861ef8bfd338c3ca0", size = 1164031, upload-time = "2024-10-16T11:21:34.211Z" }, { url = "https://files.pythonhosted.org/packages/3e/30/d41d3ba765609c0763505d565c4d12d8f3c79793f0d0f044ff5a28bf395b/psycopg2_binary-2.9.10-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:26540d4a9a4e2b096f1ff9cce51253d0504dca5a85872c7f7be23be5a53eb18d", size = 3044699, upload-time = "2024-10-16T11:21:42.841Z" }, { url = "https://files.pythonhosted.org/packages/35/44/257ddadec7ef04536ba71af6bc6a75ec05c5343004a7ec93006bee66c0bc/psycopg2_binary-2.9.10-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e217ce4d37667df0bc1c397fdcd8de5e81018ef305aed9415c3b093faaeb10fb", size = 3275245, upload-time = "2024-10-16T11:21:51.989Z" }, { url = "https://files.pythonhosted.org/packages/1b/11/48ea1cd11de67f9efd7262085588790a95d9dfcd9b8a687d46caf7305c1a/psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:245159e7ab20a71d989da00f280ca57da7641fa2cdcf71749c193cea540a74f7", size = 2851631, upload-time = "2024-10-16T11:21:57.584Z" }, @@ -705,6 +1010,15 @@ version = "6.0.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload-time = "2024-08-06T20:33:50.674Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873, upload-time = "2024-08-06T20:32:25.131Z" }, + { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302, upload-time = "2024-08-06T20:32:26.511Z" }, + { url = 
"https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154, upload-time = "2024-08-06T20:32:28.363Z" }, + { url = "https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223, upload-time = "2024-08-06T20:32:30.058Z" }, + { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542, upload-time = "2024-08-06T20:32:31.881Z" }, + { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164, upload-time = "2024-08-06T20:32:37.083Z" }, + { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611, upload-time = "2024-08-06T20:32:38.898Z" }, + { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591, upload-time = "2024-08-06T20:32:40.241Z" }, + { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338, upload-time = "2024-08-06T20:32:41.93Z" }, { url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309, upload-time = "2024-08-06T20:32:43.4Z" }, { url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679, upload-time = "2024-08-06T20:32:44.801Z" }, { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428, upload-time = "2024-08-06T20:32:46.432Z" }, @@ -843,3 +1157,68 @@ sdist = { url = "https://files.pythonhosted.org/packages/8a/98/2d9906746cdc6a6ef wheels = [ { url = "https://files.pythonhosted.org/packages/0b/2c/87f3254fd8ffd29e4c02732eee68a83a1d3c346ae39bc6822dcbcb697f2b/wheel-0.45.1-py3-none-any.whl", hash = "sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248", size = 72494, upload-time = "2024-11-23T00:18:21.207Z" }, 
] + +[[package]] +name = "yarl" +version = "1.20.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3c/fb/efaa23fa4e45537b827620f04cf8f3cd658b76642205162e072703a5b963/yarl-1.20.1.tar.gz", hash = "sha256:d017a4997ee50c91fd5466cef416231bb82177b93b029906cefc542ce14c35ac", size = 186428, upload-time = "2025-06-10T00:46:09.923Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/9a/cb7fad7d73c69f296eda6815e4a2c7ed53fc70c2f136479a91c8e5fbdb6d/yarl-1.20.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdcc4cd244e58593a4379fe60fdee5ac0331f8eb70320a24d591a3be197b94a9", size = 133667, upload-time = "2025-06-10T00:43:44.369Z" }, + { url = "https://files.pythonhosted.org/packages/67/38/688577a1cb1e656e3971fb66a3492501c5a5df56d99722e57c98249e5b8a/yarl-1.20.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b29a2c385a5f5b9c7d9347e5812b6f7ab267193c62d282a540b4fc528c8a9d2a", size = 91025, upload-time = "2025-06-10T00:43:46.295Z" }, + { url = "https://files.pythonhosted.org/packages/50/ec/72991ae51febeb11a42813fc259f0d4c8e0507f2b74b5514618d8b640365/yarl-1.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1112ae8154186dfe2de4732197f59c05a83dc814849a5ced892b708033f40dc2", size = 89709, upload-time = "2025-06-10T00:43:48.22Z" }, + { url = "https://files.pythonhosted.org/packages/99/da/4d798025490e89426e9f976702e5f9482005c548c579bdae792a4c37769e/yarl-1.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:90bbd29c4fe234233f7fa2b9b121fb63c321830e5d05b45153a2ca68f7d310ee", size = 352287, upload-time = "2025-06-10T00:43:49.924Z" }, + { url = "https://files.pythonhosted.org/packages/1a/26/54a15c6a567aac1c61b18aa0f4b8aa2e285a52d547d1be8bf48abe2b3991/yarl-1.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:680e19c7ce3710ac4cd964e90dad99bf9b5029372ba0c7cbfcd55e54d90ea819", size = 345429, upload-time = "2025-06-10T00:43:51.7Z" }, + { url = "https://files.pythonhosted.org/packages/d6/95/9dcf2386cb875b234353b93ec43e40219e14900e046bf6ac118f94b1e353/yarl-1.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4a979218c1fdb4246a05efc2cc23859d47c89af463a90b99b7c56094daf25a16", size = 365429, upload-time = "2025-06-10T00:43:53.494Z" }, + { url = "https://files.pythonhosted.org/packages/91/b2/33a8750f6a4bc224242a635f5f2cff6d6ad5ba651f6edcccf721992c21a0/yarl-1.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:255b468adf57b4a7b65d8aad5b5138dce6a0752c139965711bdcb81bc370e1b6", size = 363862, upload-time = "2025-06-10T00:43:55.766Z" }, + { url = "https://files.pythonhosted.org/packages/98/28/3ab7acc5b51f4434b181b0cee8f1f4b77a65919700a355fb3617f9488874/yarl-1.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a97d67108e79cfe22e2b430d80d7571ae57d19f17cda8bb967057ca8a7bf5bfd", size = 355616, upload-time = "2025-06-10T00:43:58.056Z" }, + { url = "https://files.pythonhosted.org/packages/36/a3/f666894aa947a371724ec7cd2e5daa78ee8a777b21509b4252dd7bd15e29/yarl-1.20.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8570d998db4ddbfb9a590b185a0a33dbf8aafb831d07a5257b4ec9948df9cb0a", size = 339954, upload-time = "2025-06-10T00:43:59.773Z" }, + { url = 
"https://files.pythonhosted.org/packages/f1/81/5f466427e09773c04219d3450d7a1256138a010b6c9f0af2d48565e9ad13/yarl-1.20.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:97c75596019baae7c71ccf1d8cc4738bc08134060d0adfcbe5642f778d1dca38", size = 365575, upload-time = "2025-06-10T00:44:02.051Z" }, + { url = "https://files.pythonhosted.org/packages/2e/e3/e4b0ad8403e97e6c9972dd587388940a032f030ebec196ab81a3b8e94d31/yarl-1.20.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:1c48912653e63aef91ff988c5432832692ac5a1d8f0fb8a33091520b5bbe19ef", size = 365061, upload-time = "2025-06-10T00:44:04.196Z" }, + { url = "https://files.pythonhosted.org/packages/ac/99/b8a142e79eb86c926f9f06452eb13ecb1bb5713bd01dc0038faf5452e544/yarl-1.20.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4c3ae28f3ae1563c50f3d37f064ddb1511ecc1d5584e88c6b7c63cf7702a6d5f", size = 364142, upload-time = "2025-06-10T00:44:06.527Z" }, + { url = "https://files.pythonhosted.org/packages/34/f2/08ed34a4a506d82a1a3e5bab99ccd930a040f9b6449e9fd050320e45845c/yarl-1.20.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c5e9642f27036283550f5f57dc6156c51084b458570b9d0d96100c8bebb186a8", size = 381894, upload-time = "2025-06-10T00:44:08.379Z" }, + { url = "https://files.pythonhosted.org/packages/92/f8/9a3fbf0968eac704f681726eff595dce9b49c8a25cd92bf83df209668285/yarl-1.20.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:2c26b0c49220d5799f7b22c6838409ee9bc58ee5c95361a4d7831f03cc225b5a", size = 383378, upload-time = "2025-06-10T00:44:10.51Z" }, + { url = "https://files.pythonhosted.org/packages/af/85/9363f77bdfa1e4d690957cd39d192c4cacd1c58965df0470a4905253b54f/yarl-1.20.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564ab3d517e3d01c408c67f2e5247aad4019dcf1969982aba3974b4093279004", size = 374069, upload-time = "2025-06-10T00:44:12.834Z" }, + { url = "https://files.pythonhosted.org/packages/35/99/9918c8739ba271dcd935400cff8b32e3cd319eaf02fcd023d5dcd487a7c8/yarl-1.20.1-cp312-cp312-win32.whl", hash = "sha256:daea0d313868da1cf2fac6b2d3a25c6e3a9e879483244be38c8e6a41f1d876a5", size = 81249, upload-time = "2025-06-10T00:44:14.731Z" }, + { url = "https://files.pythonhosted.org/packages/eb/83/5d9092950565481b413b31a23e75dd3418ff0a277d6e0abf3729d4d1ce25/yarl-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:48ea7d7f9be0487339828a4de0360d7ce0efc06524a48e1810f945c45b813698", size = 86710, upload-time = "2025-06-10T00:44:16.716Z" }, + { url = "https://files.pythonhosted.org/packages/8a/e1/2411b6d7f769a07687acee88a062af5833cf1966b7266f3d8dfb3d3dc7d3/yarl-1.20.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:0b5ff0fbb7c9f1b1b5ab53330acbfc5247893069e7716840c8e7d5bb7355038a", size = 131811, upload-time = "2025-06-10T00:44:18.933Z" }, + { url = "https://files.pythonhosted.org/packages/b2/27/584394e1cb76fb771371770eccad35de400e7b434ce3142c2dd27392c968/yarl-1.20.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:14f326acd845c2b2e2eb38fb1346c94f7f3b01a4f5c788f8144f9b630bfff9a3", size = 90078, upload-time = "2025-06-10T00:44:20.635Z" }, + { url = "https://files.pythonhosted.org/packages/bf/9a/3246ae92d4049099f52d9b0fe3486e3b500e29b7ea872d0f152966fc209d/yarl-1.20.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f60e4ad5db23f0b96e49c018596707c3ae89f5d0bd97f0ad3684bcbad899f1e7", size = 88748, upload-time = "2025-06-10T00:44:22.34Z" }, + { url = 
"https://files.pythonhosted.org/packages/a3/25/35afe384e31115a1a801fbcf84012d7a066d89035befae7c5d4284df1e03/yarl-1.20.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:49bdd1b8e00ce57e68ba51916e4bb04461746e794e7c4d4bbc42ba2f18297691", size = 349595, upload-time = "2025-06-10T00:44:24.314Z" }, + { url = "https://files.pythonhosted.org/packages/28/2d/8aca6cb2cabc8f12efcb82749b9cefecbccfc7b0384e56cd71058ccee433/yarl-1.20.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:66252d780b45189975abfed839616e8fd2dbacbdc262105ad7742c6ae58f3e31", size = 342616, upload-time = "2025-06-10T00:44:26.167Z" }, + { url = "https://files.pythonhosted.org/packages/0b/e9/1312633d16b31acf0098d30440ca855e3492d66623dafb8e25b03d00c3da/yarl-1.20.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59174e7332f5d153d8f7452a102b103e2e74035ad085f404df2e40e663a22b28", size = 361324, upload-time = "2025-06-10T00:44:27.915Z" }, + { url = "https://files.pythonhosted.org/packages/bc/a0/688cc99463f12f7669eec7c8acc71ef56a1521b99eab7cd3abb75af887b0/yarl-1.20.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e3968ec7d92a0c0f9ac34d5ecfd03869ec0cab0697c91a45db3fbbd95fe1b653", size = 359676, upload-time = "2025-06-10T00:44:30.041Z" }, + { url = "https://files.pythonhosted.org/packages/af/44/46407d7f7a56e9a85a4c207724c9f2c545c060380718eea9088f222ba697/yarl-1.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1a4fbb50e14396ba3d375f68bfe02215d8e7bc3ec49da8341fe3157f59d2ff5", size = 352614, upload-time = "2025-06-10T00:44:32.171Z" }, + { url = "https://files.pythonhosted.org/packages/b1/91/31163295e82b8d5485d31d9cf7754d973d41915cadce070491778d9c9825/yarl-1.20.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11a62c839c3a8eac2410e951301309426f368388ff2f33799052787035793b02", size = 336766, upload-time = "2025-06-10T00:44:34.494Z" }, + { url = "https://files.pythonhosted.org/packages/b4/8e/c41a5bc482121f51c083c4c2bcd16b9e01e1cf8729e380273a952513a21f/yarl-1.20.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:041eaa14f73ff5a8986b4388ac6bb43a77f2ea09bf1913df7a35d4646db69e53", size = 364615, upload-time = "2025-06-10T00:44:36.856Z" }, + { url = "https://files.pythonhosted.org/packages/e3/5b/61a3b054238d33d70ea06ebba7e58597891b71c699e247df35cc984ab393/yarl-1.20.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:377fae2fef158e8fd9d60b4c8751387b8d1fb121d3d0b8e9b0be07d1b41e83dc", size = 360982, upload-time = "2025-06-10T00:44:39.141Z" }, + { url = "https://files.pythonhosted.org/packages/df/a3/6a72fb83f8d478cb201d14927bc8040af901811a88e0ff2da7842dd0ed19/yarl-1.20.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1c92f4390e407513f619d49319023664643d3339bd5e5a56a3bebe01bc67ec04", size = 369792, upload-time = "2025-06-10T00:44:40.934Z" }, + { url = "https://files.pythonhosted.org/packages/7c/af/4cc3c36dfc7c077f8dedb561eb21f69e1e9f2456b91b593882b0b18c19dc/yarl-1.20.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d25ddcf954df1754ab0f86bb696af765c5bfaba39b74095f27eececa049ef9a4", size = 382049, upload-time = "2025-06-10T00:44:42.854Z" }, + { url = "https://files.pythonhosted.org/packages/19/3a/e54e2c4752160115183a66dc9ee75a153f81f3ab2ba4bf79c3c53b33de34/yarl-1.20.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:909313577e9619dcff8c31a0ea2aa0a2a828341d92673015456b3ae492e7317b", size = 384774, upload-time 
= "2025-06-10T00:44:45.275Z" }, + { url = "https://files.pythonhosted.org/packages/9c/20/200ae86dabfca89060ec6447649f219b4cbd94531e425e50d57e5f5ac330/yarl-1.20.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:793fd0580cb9664548c6b83c63b43c477212c0260891ddf86809e1c06c8b08f1", size = 374252, upload-time = "2025-06-10T00:44:47.31Z" }, + { url = "https://files.pythonhosted.org/packages/83/75/11ee332f2f516b3d094e89448da73d557687f7d137d5a0f48c40ff211487/yarl-1.20.1-cp313-cp313-win32.whl", hash = "sha256:468f6e40285de5a5b3c44981ca3a319a4b208ccc07d526b20b12aeedcfa654b7", size = 81198, upload-time = "2025-06-10T00:44:49.164Z" }, + { url = "https://files.pythonhosted.org/packages/ba/ba/39b1ecbf51620b40ab402b0fc817f0ff750f6d92712b44689c2c215be89d/yarl-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:495b4ef2fea40596bfc0affe3837411d6aa3371abcf31aac0ccc4bdd64d4ef5c", size = 86346, upload-time = "2025-06-10T00:44:51.182Z" }, + { url = "https://files.pythonhosted.org/packages/43/c7/669c52519dca4c95153c8ad96dd123c79f354a376346b198f438e56ffeb4/yarl-1.20.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:f60233b98423aab21d249a30eb27c389c14929f47be8430efa7dbd91493a729d", size = 138826, upload-time = "2025-06-10T00:44:52.883Z" }, + { url = "https://files.pythonhosted.org/packages/6a/42/fc0053719b44f6ad04a75d7f05e0e9674d45ef62f2d9ad2c1163e5c05827/yarl-1.20.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:6f3eff4cc3f03d650d8755c6eefc844edde99d641d0dcf4da3ab27141a5f8ddf", size = 93217, upload-time = "2025-06-10T00:44:54.658Z" }, + { url = "https://files.pythonhosted.org/packages/4f/7f/fa59c4c27e2a076bba0d959386e26eba77eb52ea4a0aac48e3515c186b4c/yarl-1.20.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:69ff8439d8ba832d6bed88af2c2b3445977eba9a4588b787b32945871c2444e3", size = 92700, upload-time = "2025-06-10T00:44:56.784Z" }, + { url = "https://files.pythonhosted.org/packages/2f/d4/062b2f48e7c93481e88eff97a6312dca15ea200e959f23e96d8ab898c5b8/yarl-1.20.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cf34efa60eb81dd2645a2e13e00bb98b76c35ab5061a3989c7a70f78c85006d", size = 347644, upload-time = "2025-06-10T00:44:59.071Z" }, + { url = "https://files.pythonhosted.org/packages/89/47/78b7f40d13c8f62b499cc702fdf69e090455518ae544c00a3bf4afc9fc77/yarl-1.20.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:8e0fe9364ad0fddab2688ce72cb7a8e61ea42eff3c7caeeb83874a5d479c896c", size = 323452, upload-time = "2025-06-10T00:45:01.605Z" }, + { url = "https://files.pythonhosted.org/packages/eb/2b/490d3b2dc66f52987d4ee0d3090a147ea67732ce6b4d61e362c1846d0d32/yarl-1.20.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f64fbf81878ba914562c672024089e3401974a39767747691c65080a67b18c1", size = 346378, upload-time = "2025-06-10T00:45:03.946Z" }, + { url = "https://files.pythonhosted.org/packages/66/ad/775da9c8a94ce925d1537f939a4f17d782efef1f973039d821cbe4bcc211/yarl-1.20.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f6342d643bf9a1de97e512e45e4b9560a043347e779a173250824f8b254bd5ce", size = 353261, upload-time = "2025-06-10T00:45:05.992Z" }, + { url = "https://files.pythonhosted.org/packages/4b/23/0ed0922b47a4f5c6eb9065d5ff1e459747226ddce5c6a4c111e728c9f701/yarl-1.20.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56dac5f452ed25eef0f6e3c6a066c6ab68971d96a9fb441791cad0efba6140d3", size = 335987, upload-time = "2025-06-10T00:45:08.227Z" }, + 
{ url = "https://files.pythonhosted.org/packages/3e/49/bc728a7fe7d0e9336e2b78f0958a2d6b288ba89f25a1762407a222bf53c3/yarl-1.20.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7d7f497126d65e2cad8dc5f97d34c27b19199b6414a40cb36b52f41b79014be", size = 329361, upload-time = "2025-06-10T00:45:10.11Z" }, + { url = "https://files.pythonhosted.org/packages/93/8f/b811b9d1f617c83c907e7082a76e2b92b655400e61730cd61a1f67178393/yarl-1.20.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:67e708dfb8e78d8a19169818eeb5c7a80717562de9051bf2413aca8e3696bf16", size = 346460, upload-time = "2025-06-10T00:45:12.055Z" }, + { url = "https://files.pythonhosted.org/packages/70/fd/af94f04f275f95da2c3b8b5e1d49e3e79f1ed8b6ceb0f1664cbd902773ff/yarl-1.20.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:595c07bc79af2494365cc96ddeb772f76272364ef7c80fb892ef9d0649586513", size = 334486, upload-time = "2025-06-10T00:45:13.995Z" }, + { url = "https://files.pythonhosted.org/packages/84/65/04c62e82704e7dd0a9b3f61dbaa8447f8507655fd16c51da0637b39b2910/yarl-1.20.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:7bdd2f80f4a7df852ab9ab49484a4dee8030023aa536df41f2d922fd57bf023f", size = 342219, upload-time = "2025-06-10T00:45:16.479Z" }, + { url = "https://files.pythonhosted.org/packages/91/95/459ca62eb958381b342d94ab9a4b6aec1ddec1f7057c487e926f03c06d30/yarl-1.20.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:c03bfebc4ae8d862f853a9757199677ab74ec25424d0ebd68a0027e9c639a390", size = 350693, upload-time = "2025-06-10T00:45:18.399Z" }, + { url = "https://files.pythonhosted.org/packages/a6/00/d393e82dd955ad20617abc546a8f1aee40534d599ff555ea053d0ec9bf03/yarl-1.20.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:344d1103e9c1523f32a5ed704d576172d2cabed3122ea90b1d4e11fe17c66458", size = 355803, upload-time = "2025-06-10T00:45:20.677Z" }, + { url = "https://files.pythonhosted.org/packages/9e/ed/c5fb04869b99b717985e244fd93029c7a8e8febdfcffa06093e32d7d44e7/yarl-1.20.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:88cab98aa4e13e1ade8c141daeedd300a4603b7132819c484841bb7af3edce9e", size = 341709, upload-time = "2025-06-10T00:45:23.221Z" }, + { url = "https://files.pythonhosted.org/packages/24/fd/725b8e73ac2a50e78a4534ac43c6addf5c1c2d65380dd48a9169cc6739a9/yarl-1.20.1-cp313-cp313t-win32.whl", hash = "sha256:b121ff6a7cbd4abc28985b6028235491941b9fe8fe226e6fdc539c977ea1739d", size = 86591, upload-time = "2025-06-10T00:45:25.793Z" }, + { url = "https://files.pythonhosted.org/packages/94/c3/b2e9f38bc3e11191981d57ea08cab2166e74ea770024a646617c9cddd9f6/yarl-1.20.1-cp313-cp313t-win_amd64.whl", hash = "sha256:541d050a355bbbc27e55d906bc91cb6fe42f96c01413dd0f4ed5a5240513874f", size = 93003, upload-time = "2025-06-10T00:45:27.752Z" }, + { url = "https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" }, +] From cdbf6d6de11fa79c87f04519f43a41fd5e97df69 Mon Sep 17 00:00:00 2001 From: Leqi Tang Date: Sat, 4 Oct 2025 14:44:43 +0800 Subject: [PATCH 11/20] perf(spider): improve async efficency --- apps/spider/crawlers/orc.py | 254 +++++++++++++++++++++++++++--------- 1 file changed, 192 insertions(+), 62 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index 331f994..9dec887 100644 --- 
a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -148,6 +148,11 @@ def get_all_courses(self, use_cache=True, save_cache=True): courses_data, course_details, prerequisites, official_data ) + print( + f"DEBUG: Integrated data count: {len(integrated_data) if integrated_data else 0}" + ) + print(f"DEBUG: Save cache enabled: {save_cache}") + # Save to cache if save_cache and integrated_data: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") @@ -157,9 +162,9 @@ def get_all_courses(self, use_cache=True, save_cache=True): if use_official: data_sources.append("official") - cache_filename = f"courses_{'_'.join(data_sources)}_{timestamp}" + cache_filename = f"courses_{'_'.join(data_sources)}" cache_filepath = cache_manager.save_to_jsonl( - integrated_data, cache_filename + integrated_data, cache_filename, timestamp ) print(f"Data cached to: {cache_filepath}") @@ -327,25 +332,48 @@ def _get_official_website_data(self): return {} async def _get_official_website_data_async(self): - """Async version of official website data fetching with concurrency""" + """Optimized async version of official website data fetching with enhanced concurrency""" # Get all course URLs from official website official_urls = self._get_official_course_urls() + print(f"DEBUG: Found {len(official_urls)} official course URLs") + if not official_urls: logger.warning("No official course URLs found") return {} logger.info(f"Found {len(official_urls)} course URLs to crawl") - # Create aiohttp session with timeout and headers - timeout = aiohttp.ClientTimeout(total=30, connect=10) + # Optimized timeout and session settings for maximum speed + timeout = aiohttp.ClientTimeout(total=15, connect=3, sock_read=10) headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Cache-Control": "max-age=0", } - async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session: - # Create semaphore to limit concurrent requests - semaphore = asyncio.Semaphore(10) # Max 10 concurrent requests + # Create connector with balanced performance and stability settings + connector = aiohttp.TCPConnector( + limit=50, # Reduced total connection pool size + limit_per_host=25, # Reduced connections per host + ttl_dns_cache=600, # Longer DNS cache for 10 minutes + use_dns_cache=True, + keepalive_timeout=30, # Shorter keepalive to prevent hangs + enable_cleanup_closed=True, + ) + + async with aiohttp.ClientSession( + timeout=timeout, + headers=headers, + connector=connector, + read_bufsize=32768, # Smaller read buffer for stability + ) as session: + # Balanced concurrent requests for stability + semaphore = asyncio.Semaphore(20) # Reduced to 20 concurrent requests # Create tasks for all URLs tasks = [ @@ -353,111 +381,197 @@ async def _get_official_website_data_async(self): for url in official_urls ] - # Execute all tasks concurrently with progress tracking + # Execute all tasks concurrently with better progress tracking official_data = {} completed = 0 total = len(tasks) - for coro in asyncio.as_completed(tasks): - try: - course_data = await coro - completed += 1 + # Use gather for 
better performance than as_completed + try: + print( + f"DEBUG: Starting to crawl {total} URLs with 20 concurrent requests..." + ) + results = await asyncio.gather(*tasks, return_exceptions=True) - if completed % 5 == 0 or completed == total: - logger.info(f"Progress: {completed}/{total} courses crawled") + print(f"DEBUG: Received {len(results)} results") - if course_data and course_data.get("course_code"): - official_data[course_data["course_code"]] = course_data + successful = 0 + failed = 0 - except Exception as e: - logger.warning(f"Failed to crawl one course: {str(e)}") + for i, result in enumerate(results): completed += 1 + # More frequent progress reporting for better visibility + if completed % 20 == 0 or completed == total: + print( + f"DEBUG: Progress: {completed}/{total} ({completed / total * 100:.1f}%) - Success: {successful}, Failed: {failed}" + ) + logger.info( + f"Progress: {completed}/{total} courses processed ({completed / total * 100:.1f}%)" + ) + + if isinstance(result, Exception): + failed += 1 + logger.warning(f"Failed to crawl course {i + 1}: {str(result)}") + continue + + if ( + result + and isinstance(result, dict) + and result.get("course_code") + ): + successful += 1 + official_data[result["course_code"]] = result + else: + failed += 1 + + except Exception as e: + print(f"DEBUG: Batch crawling failed: {str(e)}") + logger.error(f"Batch crawling failed: {str(e)}") + return {} + + print( + f"DEBUG: Successfully extracted {len(official_data)} courses from {total} URLs" + ) + print(f"DEBUG: Final stats - Success: {successful}, Failed: {failed}") logger.info( - f"Successfully fetched official data for {len(official_data)} courses" + f"Successfully fetched official data for {len(official_data)} courses out of {total} total" ) return official_data async def _crawl_official_course_data_async(self, session, semaphore, course_url): - """Async crawl single course data from official website""" + """Ultra-optimized async crawl single course data from official website""" async with semaphore: # Limit concurrent requests try: - async with session.get(course_url) as response: - if response.status != 200: - logger.warning(f"HTTP {response.status} for {course_url}") - return None - - html_content = await response.text() - return self._parse_official_course_html(html_content, course_url) + # Balanced retry logic for stability + max_retries = 2 # Increased retries for stability + retry_delay = 1.0 # Longer retry delay + + for attempt in range(max_retries + 1): + try: + async with session.get(course_url) as response: + if response.status == 200: + html_content = await response.text() + return self._parse_official_course_html( + html_content, course_url + ) + elif response.status in [ + 429, + 503, + 502, + 504, + ]: # Retry on server errors + if attempt < max_retries: + await asyncio.sleep( + retry_delay * (attempt + 1) + ) # Exponential backoff + continue + # Don't retry on other errors, fail fast + return None + + except (aiohttp.ClientError, asyncio.TimeoutError) as e: + if attempt < max_retries: + await asyncio.sleep(retry_delay * (attempt + 1)) + continue + else: + logger.debug( + f"Failed to fetch {course_url} after {max_retries + 1} attempts: {str(e)}" + ) + return None - except asyncio.TimeoutError: - logger.warning(f"Timeout for {course_url}") return None + except Exception as e: - logger.warning(f"Error crawling {course_url}: {str(e)}") + logger.debug(f"Unexpected error fetching {course_url}: {str(e)}") return None def _parse_official_course_html(self, html_content, 
course_url): - """Parse HTML content to extract course data""" + """Ultra-fast HTML parsing for course data extraction""" try: from bs4 import BeautifulSoup - soup = BeautifulSoup(html_content, "html.parser") + # Use faster lxml parser for better performance + soup = BeautifulSoup(html_content, "lxml") course_heading = soup.find("h2") if not course_heading: + print(f"DEBUG: No h2 heading found in {course_url}") return None course_heading_text = course_heading.get_text() if not course_heading_text: + print(f"DEBUG: Empty h2 text in {course_url}") return None split_course_heading = course_heading_text.split(" – ") if len(split_course_heading) < 2: + print( + f"DEBUG: Invalid heading format '{course_heading_text}' in {course_url}" + ) return None - # Find course content sections + # Fast extraction with minimal processing text_inner_sections = soup.find_all(class_="et_pb_text_inner") if len(text_inner_sections) < 4: + print( + f"DEBUG: Insufficient text sections ({len(text_inner_sections)}) in {course_url}" + ) return None - children = list(text_inner_sections[3].children) - course_code = split_course_heading[0] course_title = split_course_heading[1] + print(f"DEBUG: Successfully parsed {course_code} - {course_title}") + + # Fast description and topics extraction description = "" course_topics = [] - official_url = course_url - - for i, child in enumerate(children): - text = child.get_text(strip=True) if hasattr(child, "get_text") else "" - if "Description:" in text: - description = ( - children[i + 2].get_text(strip=True) - if i + 2 < len(children) - and hasattr(children[i + 2], "get_text") - else "" + + # Get all text content at once for faster processing + content_section = text_inner_sections[3] + all_text = content_section.get_text(separator="\n", strip=True) + + # Simple text processing for speed + lines = [line.strip() for line in all_text.split("\n") if line.strip()] + + in_description = False + in_topics = False + + for i, line in enumerate(lines): + if "Description:" in line: + in_description = True + continue + elif "Course Topics:" in line or "Course topics:" in line: + in_description = False + in_topics = True + continue + elif ( + in_description + and line + and not any( + x in line for x in ["Course Topics", "Lectures", "Seminars"] ) - if description == "\n" or "Course Topics" in description: - description = "" - elif "Course Topics:" in text: - if i + 2 < len(children) and hasattr(children[i + 2], "find_all"): - course_topics = [ - li.get_text(strip=True) - for li in children[i + 2].find_all("li") - ] + ): + if description: + description += " " + line + else: + description = line + elif in_topics and line: + # Simple topic extraction + clean_line = line.lstrip("•-*").strip() + if clean_line and len(course_topics) < 10: # Limit for performance + course_topics.append(clean_line) return { "course_code": course_code, "course_title": course_title, - "description": description, + "description": description.strip(), "course_topics": course_topics, - "official_url": official_url, + "official_url": course_url, } - except Exception as e: - logger.warning(f"Error parsing course HTML from {course_url}: {str(e)}") + except Exception: + # Fast fail for maximum performance return None def _get_official_course_urls(self): @@ -465,7 +579,13 @@ def _get_official_course_urls(self): try: from bs4 import Tag + print(f"DEBUG: Fetching course URLs from {OFFICIAL_UNDERGRAD_URL}") soup = retrieve_soup(OFFICIAL_UNDERGRAD_URL) + + if not soup: + print("DEBUG: Failed to retrieve soup from official 
website") + return set() + linked_urls = [] for a in soup.find_all("a", href=True): @@ -476,12 +596,22 @@ def _get_official_course_urls(self): full_url = urljoin(OFFICIAL_BASE_URL, href) linked_urls.append(full_url) - return { + print(f"DEBUG: Found {len(linked_urls)} total links") + + course_urls = { linked_url for linked_url in linked_urls if self._is_official_course_url(linked_url) } + + print(f"DEBUG: Filtered to {len(course_urls)} course URLs") + if len(course_urls) > 0: + print(f"DEBUG: Sample course URL: {list(course_urls)[0]}") + + return course_urls + except Exception as e: + print(f"DEBUG: Error getting official course URLs: {str(e)}") logger.error(f"Error getting official course URLs: {str(e)}") return set() @@ -504,7 +634,7 @@ def _integrate_course_data( courses_with_prereqs = 0 # If we have course selection data, process it - if courses_data: + if courses_data and len(courses_data) > 0: courses_by_code = defaultdict(list) for course in courses_data: course_code = course.get("courseCode") @@ -535,7 +665,7 @@ def _integrate_course_data( integrated_courses.append(course_data) # If we only have official data (no course selection data), create courses from official data - elif official_data: + if (not courses_data or len(courses_data) == 0) and official_data: logger.info("Creating courses from official website data only") for course_code, official_info in official_data.items(): # Create empty main_data for courses that only exist in official website From 402515ec1c3a42c676c6fcbce1d01813290cff48 Mon Sep 17 00:00:00 2001 From: Leqi Tang Date: Sat, 4 Oct 2025 17:25:32 +0800 Subject: [PATCH 12/20] docs(crawlers): update api description --- apps/spider/crawlers/README.md | 331 +++++++++++++++++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100644 apps/spider/crawlers/README.md diff --git a/apps/spider/crawlers/README.md b/apps/spider/crawlers/README.md new file mode 100644 index 0000000..00b502b --- /dev/null +++ b/apps/spider/crawlers/README.md @@ -0,0 +1,331 @@ +# API Information + +## https://coursesel.umji.sjtu.edu.cn/jdji/tpm/findOwnCollegeCourse_JiCourse.action + +### Data Fields +- `courseCode` +- `courseId` +- `courseName` +- `courseNameEn` +- `credit` +- `hour` +- `isCompulsory` +- `status` + +### Sample Response +```json +[ + { + "courseAttribute": 0, + "courseCatalog": "", + "courseCategory": "CourseCategory.2", + "courseCode": "ECE3300J", + "courseId": "008697DB-3EE0-4330-8787-A4169688C039", + "courseName": "电磁学(2)", + "courseNameEn": "Electromagnetics II", + "courseTypeId": "86572329-1D6E-469F-AD25-DCEDDCC85F08", + "courseTypeName": "", + "credit": "4", + "creditOuter": "0", + "crossTerm": 0, + "departmentIds": "", + "departmentNames": "", + "departments": [], + "description": "Standard Course Profile Form-Ve330.pdf", + "experimentHour": "", + "faceTeachingHour": "", + "hour": "60", + "isCompulsory": 0, + "language": "en_US", + "lastUpdateDate": "2017-10-27 15:35:00", + "lastUpdateUserId": "DFF3D5F4-65C6-462C-A2E3-559FDB2FA2E2", + "memo": "", + "prerequisiteCourseCount": "", + "retakeStudyScore": 66, + "selfStudyHour": "", + "shortName": "ECE3300J", + "status": 1, + "teacherId": "", + "teacherName": "" + }, + { + "courseAttribute": 0, + "courseCatalog": "", + "courseCategory": "CourseCategory.3", + "courseCode": "ENGL1530J", + "courseId": "00EF66A8-DA13-42C4-892C-DD9A103E1058", + "courseName": "从小说到电影", + "courseNameEn": "Novel into Film", + "courseTypeId": "91EB55FA-20DF-4E04-9402-1B33A19BA55C", + "courseTypeName": "", + "credit": "3", + "creditOuter": "", 
+ "crossTerm": 0, + "departmentIds": "", + "departmentNames": "", + "departments": [], + "description": "", + "experimentHour": "", + "faceTeachingHour": "", + "hour": "45", + "isCompulsory": 0, + "language": "en_US", + "lastUpdateDate": "2019-04-16 14:39:50", + "lastUpdateUserId": "DFF3D5F4-65C6-462C-A2E3-559FDB2FA2E2", + "memo": "", + "prerequisiteCourseCount": "", + "retakeStudyScore": 66, + "selfStudyHour": "", + "shortName": "ENGL1530J", + "status": 1, + "teacherId": "", + "teacherName": "" + }, + { + "courseAttribute": 0, + "courseCatalog": "", + "courseCategory": "CourseCategory.3", + "courseCode": "VR354", + "courseId": "02120A79-97ED-45B1-91E3-F1A0F473E910", + "courseName": "认知心理学概论", + "courseNameEn": "Introduction to Cognitive Psychology", + "courseTypeId": "A54CA3AF-E010-4ABA-B89C-F19AF0E77333", + "courseTypeName": "", + "credit": "3", + "creditOuter": "3", + "crossTerm": 0, + "departmentIds": "", + "departmentNames": "", + "departments": [], + "description": "This course provides students with an overview of cognitive psychology...", + "experimentHour": "", + "faceTeachingHour": "", + "hour": "48", + "isCompulsory": 0, + "language": "en_US", + "lastUpdateDate": "2020-07-28 15:27:50", + "lastUpdateUserId": "11084", + "memo": "", + "prerequisiteCourseCount": "", + "retakeStudyScore": 66, + "selfStudyHour": "", + "shortName": "VR354", + "status": 1, + "teacherId": "", + "teacherName": "" + } +] +``` + + +## https://coursesel.umji.sjtu.edu.cn/tpm/findAll_PrerequisiteCourse.action + +### Data Fields +- courseId +- prerequisiteRule +- prerequisiteRuleDesc +### Sample +```json +[ + { + "courseId": "34F27950-50E7-4D4C-9321-2559FDE2B575", + "lastUpdateDate": "2018-04-18 16:46:48", + "lastUpdateUserId": "60366", + "prerequisiteCourseId": "38CC7A6A-5495-435B-A5AA-9534BED228CD", + "prerequisiteRule": [ + { "type": "symbol", "symbol": "(" }, + { + "type": "course", + "courseId": "561BE836-4EAC-440E-B066-C457A3AA9AE2", + "courseName": "微分方程", + "courseCode": "MATH2160J", + "option": "obtainedCredit" + }, + { "type": "symbol", "symbol": "||" }, + { + "type": "course", + "courseId": "F7AF87CF-7503-4368-83A9-6548B0C50825", + "courseName": "线性代数和微分方程A", + "courseCode": "MATH2560J", + "option": "obtainedCredit" + }, + { "type": "symbol", "symbol": "||" }, + { + "type": "course", + "courseId": "5C590497-E3FA-4128-96EF-7DC8083B43CB", + "courseName": "线性代数和微分方程B", + "courseCode": "MATH2860J", + "option": "obtainedCredit" + }, + { "type": "symbol", "symbol": ")" }, + { "type": "symbol", "symbol": "&&" }, + { "type": "symbol", "symbol": "(" }, + { + "type": "course", + "courseId": "E7070CAD-2453-45EC-B569-7FE117A97C20", + "courseName": "普通物理(2)", + "courseCode": "PHYS2400J", + "option": "obtainedCredit" + }, + { "type": "symbol", "symbol": "||" }, + { + "type": "course", + "courseId": "5E36E16A-FC86-4F1A-99B0-1C022629A584", + "courseName": "强化物理(J类)(2)", + "courseCode": "PHYS2600J", + "option": "obtainedCredit" + }, + { "type": "symbol", "symbol": ")" } + ], + "prerequisiteRuleDesc": "(MATH2160J 已获学分 || MATH2560J 已获学分 || MATH2860J 已获学分) && (PHYS2400J 已获学分 || PHYS2600J 已获学分)" + }, + { + "courseId": "1D8EC601-290F-4DBD-A26C-F93D28CFBE3A", + "lastUpdateDate": "2018-04-18 16:44:38", + "lastUpdateUserId": "60366", + "prerequisiteCourseId": "377474C3-DC59-4A09-95FD-4F0915AFF437", + "prerequisiteRule": [ + { + "type": "course", + "courseId": "78796516-F909-4A3E-A974-897F746BF6CB", + "courseName": "工程概率方法", + "courseCode": "ECE4010J", + "option": "obtainedCredit" + } + ], + "prerequisiteRuleDesc": "ECE4010J 
已获学分" + }, + { + "courseId": "EEE20D83-DC6B-4487-9327-1CB535CE5529", + "lastUpdateDate": "2018-04-18 16:47:40", + "lastUpdateUserId": "60366", + "prerequisiteCourseId": "4B428104-5701-4C5F-86CE-3BE3FC16CCCF", + "prerequisiteRule": [ + { "type": "symbol", "symbol": "(" }, + { + "type": "course", + "courseId": "274CB4C1-75A7-4DA1-807F-CCB9E7F8D038", + "courseName": "线性代数", + "courseCode": "MATH2140J", + "option": "obtainedCredit" + }, + { "type": "symbol", "symbol": "||" }, + { + "type": "course", + "courseId": "5C590497-E3FA-4128-96EF-7DC8083B43CB", + "courseName": "线性代数和微分方程B", + "courseCode": "MATH2860J", + "option": "obtainedCredit" + }, + { "type": "symbol", "symbol": ")" }, + { "type": "symbol", "symbol": "&&" }, + { + "type": "course", + "courseId": "78796516-F909-4A3E-A974-897F746BF6CB", + "courseName": "工程概率方法", + "courseCode": "ECE4010J", + "option": "obtainedCredit" + } + ], + "prerequisiteRuleDesc": "(MATH2140J 已获学分 || MATH2860J 已获学分) && ECE4010J 已获学分" + }, + { + "courseId": "B423A4E3-1FA2-4BC8-B99C-9FC19221CF59", + "lastUpdateDate": "2017-04-13 17:15:57", + "lastUpdateUserId": "5423DB2A-3C4D-4327-B6D5-0992FA888845", + "prerequisiteCourseId": "07D4D858-B585-4F03-8ABA-814034C5AA86", + "prerequisiteRule": [ + { + "type": "course", + "courseId": "C5FF8EA5-DA16-434B-A617-B63D8633A935", + "courseName": "动态系统建模分析与控制", + "courseCode": "ME3600J", + "option": "obtainedCredit" + } + ], + "prerequisiteRuleDesc": "ME3600J Obtained Credit" + }, + { + "courseId": "41586CA0-DDAF-4472-BC1C-48DE2136E7A0", + "lastUpdateDate": "2017-05-13 22:23:08", + "lastUpdateUserId": "60366", + "prerequisiteCourseId": "9D9D4F00-8B7A-4E9E-83D5-5B7003475DAB", + "prerequisiteRule": [ + { + "type": "course", + "courseId": "A8D68188-A2F8-47F0-BB35-37C4DEA63CE8", + "courseName": "德语(1)", + "courseCode": "GER1100J", + "option": "obtainedCredit" + }, + { "type": "symbol", "symbol": "||" }, + { + "type": "course", + "courseId": "A8D68188-A2F8-47F0-BB35-37C4DEA63CE8", + "courseName": "德语(1)", + "courseCode": "GER1100J", + "option": "electedCredit" + }, + { "type": "symbol", "symbol": "||" }, + { + "type": "course", + "courseId": "C20025AB-E429-47AF-852A-04F6B76FABE6", + "courseName": "德语(J类)(1)", + "courseCode": "VW100", + "option": "obtainedCredit" + } + ], + "prerequisiteRuleDesc": "GER1100J Obtained Credit || GER1100J Credits Submitted || VW100 Obtained Credit" + }, + { + "courseId": "561BE836-4EAC-440E-B066-C457A3AA9AE2", + "lastUpdateDate": "2019-05-05 16:14:47", + "lastUpdateUserId": "F75BEDAE-8658-4FB7-BA0E-9601A9306681", + "prerequisiteCourseId": "B8EDDD41-0578-4A4E-9DB2-2D794222054E", + "prerequisiteRule": [ + { + "type": "course", + "courseId": "D6318202-699F-4B25-B793-CAEF867E2359", + "courseName": "微积分(3)", + "courseCode": "MATH2150J", + "option": "obtainedCredit" + } + ], + "prerequisiteRuleDesc": "MATH2150J 已获学分" + }, + { + "courseId": "1BCE2420-FA95-44CE-AC02-E5E16AC3454F", + "lastUpdateDate": "2017-04-13 11:12:40", + "lastUpdateUserId": "5423DB2A-3C4D-4327-B6D5-0992FA888845", + "prerequisiteCourseId": "DC576796-3A7D-41CE-8C07-F7AF4BD3920C", + "prerequisiteRule": [ + { + "type": "course", + "courseId": "44994CE3-A141-4E58-94CA-8B5C8AFFA3AD", + "courseName": "工程热力学(J类)(1)", + "courseCode": "ME2350J", + "option": "obtainedCredit" + } + ], + "prerequisiteRuleDesc": "ME2350J Obtained Credit" + }, + { + "courseId": "3D57A704-3FDD-4342-A683-5BCD70DE927B", + "lastUpdateDate": "2017-03-09 16:06:49", + "lastUpdateUserId": "ADMIN", + "prerequisiteCourseId": "38332DB6-E296-4BC5-B115-56F7687F0203", + 
"prerequisiteRule": [ + { + "type": "course", + "courseId": "447EBC94-5794-53FA-E050-A8C0190176B5", + "courseName": "计算流体力学(英文班)", + "courseCode": "EP26002", + "option": "obtainedCredit" + }, + { "type": "symbol", "symbol": "<" }, + { "type": "input", "value": "5" } + ], + "prerequisiteRuleDesc": "EP26002 Obtained Credit < 5" + } +] +``` \ No newline at end of file From ead29105185f7193abbe0eec89e9f0ae8ad2a9ce Mon Sep 17 00:00:00 2001 From: Leqi Tang Date: Sat, 4 Oct 2025 20:23:28 +0800 Subject: [PATCH 13/20] refactor(spider): Separate data crawling and data importing --- .gitignore | 1 + apps/spider/crawlers/README.md | 13 +- apps/spider/crawlers/orc.py | 156 ++++--------- apps/spider/manager.py | 394 ++++++++++++++++++++++++++++++++- scripts/__init__.py | 48 ++-- 5 files changed, 466 insertions(+), 146 deletions(-) diff --git a/.gitignore b/.gitignore index 8647958..c28a32e 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ data ### Python ignores (https://github.com/github/gitignore/blob/master/Python.gitignore) # Byte-compiled / optimized / DLL files __pycache__/ +data_cache/ *.py[cod] *$py.class diff --git a/apps/spider/crawlers/README.md b/apps/spider/crawlers/README.md index 00b502b..a4b2baa 100644 --- a/apps/spider/crawlers/README.md +++ b/apps/spider/crawlers/README.md @@ -7,6 +7,7 @@ - `courseId` - `courseName` - `courseNameEn` +- `courseTypeId` - `credit` - `hour` - `isCompulsory` @@ -124,7 +125,8 @@ - courseId - prerequisiteRule - prerequisiteRuleDesc -### Sample + +### Sample Response ```json [ { @@ -328,4 +330,11 @@ "prerequisiteRuleDesc": "EP26002 Obtained Credit < 5" } ] -``` \ No newline at end of file +``` + +## https://coursesel.umji.sjtu.edu.cn/tpm/findLessonTasksPreview_ElectTurn.action + +### Data Fields +- electTurn +- courseType +- lessonTask diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index 9dec887..c5b98f0 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -79,125 +79,51 @@ def _ensure_initialized(self): self._initialized = True logger.info("Crawler initialized successfully!") - def get_all_courses(self, use_cache=True, save_cache=True): + def get_all_course_data(self, include_coursesel=True, include_official=True): """ Get all course data from multiple APIs and official website + Pure data extraction without user interaction Args: - use_cache: Whether to use cached data - save_cache: Whether to save data to cache + include_coursesel: Whether to include course selection system data + include_official: Whether to include official website data Returns: list: Course data with prerequisites, descriptions, and instructors """ - cache_manager = CourseDataCache() - - # If using cache, check for available cache files first - if use_cache: - cache_files = cache_manager.list_cache_files() - if cache_files: - print(f"Found {len(cache_files)} cache files") - choice = input("Use existing cache? 
(y/n/list): ").strip().lower() - - if choice == "list": - # Show cache file list for selection - from apps.spider.manager import interactive_cache_manager - - selected_file = interactive_cache_manager() - if selected_file: - print(f"Loading cache file: {selected_file.name}") - return cache_manager.load_from_jsonl(selected_file) - elif choice in ["y", "yes"]: - # Use the latest cache file - latest_file = cache_files[0] - print(f"Loading latest cache: {latest_file.name}") - return cache_manager.load_from_jsonl(latest_file) - - # Ask user to choose data sources - use_coursesel = self._ask_user_choice( - "Crawl course selection system data? (y/n): ", default="n" - ) - use_official = self._ask_user_choice( - "Crawl official website data? (y/n): ", default="y" - ) - courses_data = [] course_details = {} prerequisites = {} official_data = {} - if use_coursesel: + if include_coursesel: self._ensure_initialized() # Make sure crawler is initialized - print("🌐 爬取课程选择系统数据...") + logger.info("Crawling course selection system data...") # Get data from course selection APIs courses_data = self._get_lesson_tasks() course_details = self._get_course_catalog() prerequisites = self._get_prerequisites() else: - print("⏭️ 跳过课程选择系统数据") + logger.info("Skipping course selection system data") - if use_official: - print("🌐 爬取官网数据...") + if include_official: + logger.info("Crawling official website data...") # Get official website data for enhanced descriptions official_data = self._get_official_website_data() else: - print("Skipping official website data") + logger.info("Skipping official website data") # Integrate data integrated_data = self._integrate_course_data( courses_data, course_details, prerequisites, official_data ) - print( - f"DEBUG: Integrated data count: {len(integrated_data) if integrated_data else 0}" + logger.info( + f"Integrated data count: {len(integrated_data) if integrated_data else 0}" ) - print(f"DEBUG: Save cache enabled: {save_cache}") - - # Save to cache - if save_cache and integrated_data: - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - data_sources = [] - if use_coursesel: - data_sources.append("coursesel") - if use_official: - data_sources.append("official") - - cache_filename = f"courses_{'_'.join(data_sources)}" - cache_filepath = cache_manager.save_to_jsonl( - integrated_data, cache_filename, timestamp - ) - - print(f"Data cached to: {cache_filepath}") - - # Ask whether to import to database immediately - from apps.spider.manager import preview_data_before_import - - if preview_data_before_import(cache_filepath, limit=5): - print("Starting database import...") - try: - import_department(integrated_data) - print("Data import successful!") - except Exception as e: - print(f"Data import failed: {str(e)}") - print("Data saved to cache, can be imported manually later") - else: - print("Skipping database import, data saved to cache") return integrated_data - def _ask_user_choice(self, prompt, default="y"): - """Ask user for yes/no choice with default value""" - while True: - response = input(prompt).strip().lower() - if not response: - response = default.lower() - if response in ["y", "yes", "true"]: - return True - elif response in ["n", "no", "false"]: - return False - else: - print("Please enter y/yes or n/no") - def _get_current_elect_turn_id(self): """Get current election turn ID dynamically""" url = f"{BASE_URL}/tpm/findStudentElectTurns_ElectTurn.action" @@ -282,13 +208,29 @@ def _get_course_catalog(self): except Exception: return {} + def _get_prerequisites(self): + """Get 
prerequisite data with course requirements and logic""" + url = f"{BASE_URL}/tpm/findAll_PrerequisiteCourse.action" + def _get_prerequisites(self): """Get prerequisite data with course requirements and logic""" url = f"{BASE_URL}/tpm/findAll_PrerequisiteCourse.action" try: - response = self.session.post(url, params={"_t": int(time.time() * 1000)}) + logger.info(f"Requesting Prerequisites API: {url}") + logger.info(f"Session cookies: {dict(self.session.cookies)}") + response = self.session.post(url, json={}) + logger.info(f"Response status: {response.status_code}") + logger.info(f"Response headers: {dict(response.headers)}") + logger.info(f"Response content length: {len(response.content)}") + logger.info(f"Response content (first 500 chars): {response.text[:500]}") + response.raise_for_status() + + if not response.text.strip(): + logger.warning("Prerequisites API returned empty response") + return {} + data = response.json() logger.debug(f"Prerequisites API response: success={data.get('success')}") @@ -503,7 +445,7 @@ def _parse_official_course_html(self, html_content, course_url): print(f"DEBUG: Empty h2 text in {course_url}") return None - split_course_heading = course_heading_text.split(" – ") + split_course_heading = course_heading_text.split(" – ") # Using em dash if len(split_course_heading) < 2: print( f"DEBUG: Invalid heading format '{course_heading_text}' in {course_url}" @@ -558,7 +500,7 @@ def _parse_official_course_html(self, html_content, course_url): description = line elif in_topics and line: # Simple topic extraction - clean_line = line.lstrip("•-*").strip() + clean_line = line.lstrip("-*").strip() if clean_line and len(course_topics) < 10: # Limit for performance course_topics.append(clean_line) @@ -700,7 +642,7 @@ def _merge_course_sections(self, course_list): for course in course_list: teachers = course.get("lessonTaskTeam", "") if teachers: - for teacher in re.split(r"[,;,;、]", teachers): + for teacher in re.split(r"[,;]", teachers): if teacher.strip(): all_instructors.add(teacher.strip()) @@ -826,21 +768,9 @@ def _normalize_prerequisites_to_english(self, prerequisites_text): if not prerequisites_text: return "" - # Define translation mapping - translations = { - "已获学分": "Obtained Credit", - "已提交学分": "Credits Submitted", - "获得学分": "Obtained Credit", - "提交学分": "Credits Submitted", - "学分": "Credit", - } - - # Apply translations - normalized = prerequisites_text - for chinese, english in translations.items(): - normalized = normalized.replace(chinese, english) - - return normalized + # For now, return as-is since we removed Chinese translations + # Can be enhanced later to handle specific text transformations + return prerequisites_text def _extract_description(self, official_data=None): """Extract course description (only from official website)""" @@ -860,7 +790,7 @@ def _extract_instructors(self, main_data, catalog_data): teacher_name = catalog_data.get("teacherName", "") if teacher_name: - for teacher in re.split(r"[,;,;、]", teacher_name): + for teacher in re.split(r"[,;]", teacher_name): if teacher.strip() and teacher.strip() not in instructors: instructors.append(teacher.strip()) @@ -891,7 +821,7 @@ def crawl_program_urls(): global _course_data_cache crawler = _get_crawler() - courses = crawler.get_all_courses() + courses = crawler.get_all_course_data() course_urls = [] _course_data_cache = {} # Reset cache @@ -922,7 +852,7 @@ def import_department(department_data): for course_data in department_data: try: - # 验证必要字段 + # Validate required fields required_fields = 
["course_code", "course_title"] missing_fields = [ field for field in required_fields if not course_data.get(field) @@ -935,7 +865,7 @@ def import_department(department_data): error_count += 1 continue - # 准备默认值,处理可能缺失的字段 + # Prepare default values, handle potentially missing fields defaults = { "course_title": course_data.get("course_title", ""), "department": course_data.get("department", ""), @@ -947,19 +877,19 @@ def import_department(department_data): "url": course_data.get("url", ""), } - # 注意:official_url 字段不存在于Course模型中,所以不包含它 + # Note: official_url field does not exist in Course model, so it's not included - # 创建或更新课程 + # Create or update course course, created = Course.objects.update_or_create( course_code=course_data["course_code"], defaults=defaults, ) - # 处理教师信息 + # Handle instructor information instructors = course_data.get("instructors", []) if instructors: for instructor_name in instructors: - if instructor_name.strip(): # 确保教师名字不为空 + if instructor_name.strip(): # Ensure instructor name is not empty try: instructor, _ = Instructor.objects.get_or_create( name=instructor_name.strip() diff --git a/apps/spider/manager.py b/apps/spider/manager.py index 9af7151..b84aa71 100644 --- a/apps/spider/manager.py +++ b/apps/spider/manager.py @@ -27,12 +27,9 @@ def __init__(self, cache_dir=None): self.cache_dir = Path(cache_dir) self.cache_dir.mkdir(exist_ok=True) - def save_to_jsonl(self, data, data_type, timestamp=None): - """Save data to jsonl file""" - if timestamp is None: - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - - filename = f"{data_type}_{timestamp}.jsonl" + def save_to_jsonl(self, data, data_type): + """Save data to jsonl file with overwrite (no timestamp)""" + filename = f"{data_type}.jsonl" filepath = self.cache_dir / filename print(f"Saving data to: {filepath}") @@ -50,6 +47,38 @@ def save_to_jsonl(self, data, data_type, timestamp=None): print(f"Data saved to: {filepath}") return filepath + def save_coursesel_data(self, lesson_tasks, course_catalog, prerequisites): + """Save course selection system data to separate jsonl files with overwrite""" + saved_files = {} + + # Save lesson tasks data + if lesson_tasks: + filepath = self.save_to_jsonl(lesson_tasks, "coursesel_lesson_tasks") + saved_files["lesson_tasks"] = filepath + + # Save course catalog data + if course_catalog: + # Convert dict to list for consistent format + catalog_list = ( + list(course_catalog.values()) + if isinstance(course_catalog, dict) + else course_catalog + ) + filepath = self.save_to_jsonl(catalog_list, "coursesel_course_catalog") + saved_files["course_catalog"] = filepath + + # Save prerequisites data + if prerequisites: + # Convert defaultdict to regular dict, then to list + prereq_list = [] + for course_id, prereq_items in prerequisites.items(): + for item in prereq_items: + prereq_list.append(item) + filepath = self.save_to_jsonl(prereq_list, "coursesel_prerequisites") + saved_files["prerequisites"] = filepath + + return saved_files + def load_from_jsonl(self, filepath): """Load data from jsonl file""" data = [] @@ -334,9 +363,360 @@ def clean_cache(): print("Cleanup cancelled") +def interactive_spider_manager(): + """Interactive spider management system""" + import asyncio + from apps.spider.crawlers.orc import CourseSelCrawler + + print("=" * 60) + print("Interactive Spider Management System") + print("=" * 60) + + cache = CourseDataCache() + + while True: + print("\n1. Crawl data from websites") + print("2. Import data from cache files") + print("3. View cache files") + print("4. 
Clean cache files") + print("5. Exit") + + choice = input("\nSelect option (1-5): ").strip() + + if choice == "1": + # Crawling workflow + crawl_workflow(cache) + elif choice == "2": + # Import workflow + import_workflow(cache) + elif choice == "3": + # View cache files + interactive_cache_manager() + elif choice == "4": + # Clean cache files + clean_cache() + elif choice == "5": + print("Exiting...") + break + else: + print("Invalid choice, please try again") + + +def crawl_workflow(cache): + """Crawling workflow""" + import asyncio + from apps.spider.crawlers.orc import CourseSelCrawler + + print("\n" + "=" * 40) + print("Data Crawling Workflow") + print("=" * 40) + + print("\nAvailable data sources:") + print("1. Course Selection System (coursesel.umji.sjtu.edu.cn)") + print("2. Official Website (ji.sjtu.edu.cn)") + print("3. Both") + + source_choice = input("\nSelect data source (1-3): ").strip() + + if source_choice == "1": + crawl_coursesel_workflow(cache) + elif source_choice == "2": + crawl_official_workflow(cache) + elif source_choice == "3": + crawl_coursesel_workflow(cache) + print("\n" + "-" * 40) + crawl_official_workflow(cache) + else: + print("Invalid choice") + + +def crawl_coursesel_workflow(cache): + """Course selection system crawling workflow""" + import asyncio + from apps.spider.crawlers.orc import CourseSelCrawler + + print("\n" + "=" * 40) + print("Course Selection System Crawling") + print("=" * 40) + + # Request JSESSIONID first since all coursesel APIs need authentication + print("\nCourse Selection System requires authentication.") + print("Please enter your JSESSIONID cookie:") + print("(Found in browser dev tools under Network or Application tabs)") + jsessionid = input("JSESSIONID: ").strip() + + if not jsessionid: + print("JSESSIONID cannot be empty. Cancelling course selection crawling.") + return + + print("\nAvailable APIs:") + print("1. Lesson Tasks") + print("2. Course Catalog") + print("3. Prerequisites") + print("4. 
All APIs") + print("\nYou can select multiple APIs by entering numbers separated by commas") + print("For example: 1,2 for Lesson Tasks and Course Catalog") + + api_choice = input( + "\nSelect APIs to crawl (1-4 or combinations like 1,2,3): " + ).strip() + + # Parse user input to determine which APIs to crawl + selected_apis = set() + + if api_choice == "4": + selected_apis = {"1", "2", "3"} + else: + # Split by comma (both English and Chinese commas) and clean up + api_choice = api_choice.replace( + ",", "," + ) # Replace Chinese comma with English comma + choices = [choice.strip() for choice in api_choice.split(",")] + for choice in choices: + if choice in ["1", "2", "3"]: + selected_apis.add(choice) + else: + print(f"Invalid choice '{choice}', skipping...") + + if not selected_apis: + print("No valid APIs selected") + return + + print(f"\nSelected APIs: {', '.join(sorted(selected_apis))}") + + # Initialize crawler with JSESSIONID + crawler = CourseSelCrawler(jsessionid=jsessionid) + crawler._ensure_initialized() + + lesson_tasks = None + course_catalog = None + prerequisites = None + + try: + if "1" in selected_apis: + print("\n[*] Crawling Lesson Tasks...") + lesson_tasks = crawler._get_lesson_tasks() + print(f"[+] Retrieved {len(lesson_tasks)} lesson tasks") + + if "2" in selected_apis: + print("\n[*] Crawling Course Catalog...") + course_catalog = crawler._get_course_catalog() + print(f"[+] Retrieved {len(course_catalog)} courses from catalog") + + if "3" in selected_apis: + print("\n[*] Crawling Prerequisites...") + prerequisites = crawler._get_prerequisites() + print(f"[+] Retrieved prerequisites for {len(prerequisites)} courses") + + # Save data to separate jsonl files + saved_files = cache.save_coursesel_data( + lesson_tasks, course_catalog, prerequisites + ) + + print(f"\n[+] Successfully saved {len(saved_files)} files:") + for data_type, filepath in saved_files.items(): + print(f" - {data_type}: {Path(filepath).name}") + + except Exception as e: + print(f"[-] Crawling failed: {str(e)}") + import traceback + + traceback.print_exc() + + +def crawl_official_workflow(cache): + """Official website crawling workflow""" + import asyncio + from apps.spider.crawlers.orc import CourseSelCrawler + + print("\n" + "=" * 40) + print("Official Website Crawling") + print("=" * 40) + + print("\n[*] Crawling official website data...") + + crawler = CourseSelCrawler() + + try: + # Get official website data (async) + official_data = asyncio.run(crawler._get_official_website_data_async()) + print(f"[+] Retrieved {len(official_data)} courses from official website") + + # Convert to list format for saving + official_list = [] + for course_code, course_info in official_data.items(): + course_info["course_code"] = course_code + official_list.append(course_info) + + # Save to jsonl file + filepath = cache.save_to_jsonl(official_list, "official") + print(f"[+] Successfully saved to: {Path(filepath).name}") + + except Exception as e: + print(f"[-] Official website crawling failed: {str(e)}") + import traceback + + traceback.print_exc() + + +def import_workflow(cache): + """Data import workflow""" + print("\n" + "=" * 40) + print("Data Import Workflow") + print("=" * 40) + + files = cache.list_cache_files() + + if not files: + print("No cache files found. 
Please crawl data first.") + return + + print(f"\nFound {len(files)} cache files:") + + # Group files by type + file_groups = {} + for i, filepath in enumerate(files): + filename = filepath.name + if "coursesel_lesson_tasks" in filename: + file_type = "Lesson Tasks" + elif "coursesel_course_catalog" in filename: + file_type = "Course Catalog" + elif "coursesel_prerequisites" in filename: + file_type = "Prerequisites" + elif "official" in filename: + file_type = "Official Website" + else: + file_type = "Integrated" + + if file_type not in file_groups: + file_groups[file_type] = [] + file_groups[file_type].append((i, filepath)) + + # Display grouped files + for file_type, file_list in file_groups.items(): + print(f"\n{file_type}:") + for i, filepath in file_list: + info = cache.get_cache_info(filepath) + print( + f" {i + 1:2d}. {info['filename']} ({info['size']}, {info['count']} records, {info['modified']})" + ) + + print(f"\n{len(files) + 1}. Import and integrate data") + + choice = input(f"\nSelect file to view or import (1-{len(files) + 1}): ").strip() + + try: + choice_num = int(choice) + if 1 <= choice_num <= len(files): + # View file details + filepath = files[choice_num - 1] + info = cache.get_cache_info(filepath) + print(f"\nFile details: {info['filename']}") + print(f"Size: {info['size']}") + print(f"Records: {info['count']}") + print(f"Modified: {info['modified']}") + print("Preview:") + for i, item in enumerate(info["preview"]): + print( + f" Record {i + 1}: {json.dumps(item, ensure_ascii=False, indent=2)[:200]}..." + ) + + elif choice_num == len(files) + 1: + # Import and integrate data + integrate_and_import_data(cache) + else: + print("Invalid choice") + + except ValueError: + print("Invalid input") + + +def integrate_and_import_data(cache): + """Integrate data from multiple cache files and import to database""" + from apps.spider.crawlers.orc import CourseSelCrawler + + print("\n" + "=" * 40) + print("Data Integration and Import") + print("=" * 40) + + files = cache.list_cache_files() + + # Load the most recent files of each type + lesson_tasks_data = [] + course_catalog_data = {} + prerequisites_data = {} + official_data = {} + + print("\n[*] Loading cache files...") + + for filepath in files: + filename = filepath.name + data = cache.load_from_jsonl(filepath) + + if "coursesel_lesson_tasks" in filename: + lesson_tasks_data = data + print(f"[+] Loaded lesson tasks: {len(data)} records") + elif "coursesel_course_catalog" in filename: + # Convert list back to dict + course_catalog_data = { + item.get("courseId"): item for item in data if item.get("courseId") + } + print(f"[+] Loaded course catalog: {len(data)} records") + elif "coursesel_prerequisites" in filename: + # Group prerequisites by courseId + from collections import defaultdict + + prerequisites_data = defaultdict(list) + for item in data: + course_id = item.get("courseId") + if course_id: + prerequisites_data[course_id].append(item) + print(f"[+] Loaded prerequisites: {len(data)} records") + elif "official" in filename: + # Convert list back to dict + for item in data: + course_code = item.get("course_code") + if course_code: + official_data[course_code] = item + print(f"[+] Loaded official data: {len(data)} records") + + if not any( + [lesson_tasks_data, course_catalog_data, prerequisites_data, official_data] + ): + print("[-] No valid data found to integrate") + return + + print("\n[*] Integrating data...") + + # Use the crawler's integration logic + crawler = CourseSelCrawler() + integrated_data = 
crawler._integrate_course_data( + lesson_tasks_data, course_catalog_data, prerequisites_data, official_data + ) + + print(f"[+] Integrated {len(integrated_data)} course records") + + # Save integrated data + integrated_filepath = cache.save_to_jsonl(integrated_data, "integrated") + print(f"[+] Saved integrated data to: {Path(integrated_filepath).name}") + + # Ask user if they want to import to database + import_choice = input("\nImport to database? (y/n): ").strip().lower() + if import_choice in ["y", "yes"]: + try: + print("\n[*] Importing to database...") + # Here you would call the actual database import function + # For now, just print the action + print( + "[+] Data import completed (placeholder - implement actual database import)" + ) + except Exception as e: + print(f"[-] Database import failed: {str(e)}") + + if __name__ == "__main__": try: - main_menu() + interactive_spider_manager() except KeyboardInterrupt: print("\nProgram exited") except Exception as e: diff --git a/scripts/__init__.py b/scripts/__init__.py index d73d887..9741121 100644 --- a/scripts/__init__.py +++ b/scripts/__init__.py @@ -11,31 +11,31 @@ def crawl_and_import_data(): - old_task_always_eager = app.conf.task_always_eager - app.conf.task_always_eager = True - - # ORC crawling takes a long time, especially when run synchronously. - # If the ORC is not crawled, the course selection will only be limited, - # but this should not interfere with development - print("Crawling ORC. This will take a while.") - crawl_orc() - - # print("Crawling timetable") - # crawl_timetable() - - # print("Crawling medians") - # crawl_medians() - - print("Importing ORC") - _import_crawled_datas(CrawledData.ORC_DEPARTMENT_COURSES) - - # print("Importing timetable") - # _import_crawled_datas(CrawledData.COURSE_TIMETABLE) - - # print("Importing medians") - # _import_crawled_datas(CrawledData.MEDIANS) + """ + Interactive course data crawling and import using the new manager system - app.conf.task_always_eager = old_task_always_eager + This function provides an interactive workflow for: + 1. Crawling course selection system data (with JSESSIONID authentication) + 2. Crawling official website data + 3. 
Integrating and importing data to database + """ + print("=" * 60) + print("Interactive Course Data Crawling and Import") + print("=" * 60) + print("Using the new unified spider management system...") + print() + + # Import and use the new manager system + from apps.spider.manager import interactive_spider_manager + + try: + # Launch the interactive spider manager + interactive_spider_manager() + except KeyboardInterrupt: + print("\nCrawl and import cancelled by user") + except Exception as e: + print(f"Error during crawl and import: {str(e)}") + raise def _import_crawled_datas(data_type): From 1c27ad02477904a1a9b73e4812dc688dba88a409 Mon Sep 17 00:00:00 2001 From: Leqi Tang Date: Sat, 4 Oct 2025 22:09:06 +0800 Subject: [PATCH 14/20] perf(spider): improve data integration --- apps/spider/crawlers/orc.py | 237 +++++++++++++++++++++++++----------- apps/spider/manager.py | 76 +++++++++--- 2 files changed, 227 insertions(+), 86 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index c5b98f0..5c43174 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -208,10 +208,6 @@ def _get_course_catalog(self): except Exception: return {} - def _get_prerequisites(self): - """Get prerequisite data with course requirements and logic""" - url = f"{BASE_URL}/tpm/findAll_PrerequisiteCourse.action" - def _get_prerequisites(self): """Get prerequisite data with course requirements and logic""" url = f"{BASE_URL}/tpm/findAll_PrerequisiteCourse.action" @@ -562,69 +558,111 @@ def _is_official_course_url(self, candidate_url): return candidate_url.startswith(OFFICIAL_COURSE_DETAIL_URL_PREFIX) def _integrate_course_data( - self, courses_data, course_details, prerequisites, official_data=None + self, + lesson_tasks_data, + course_catalog_data, + prerequisites_data, + official_data=None, ): - """Integrate course data from multiple sources""" + """Integrate course data with course catalog as primary source""" if official_data is None: official_data = {} logger.info( - f"Starting integration with {len(courses_data)} courses, {len(prerequisites)} prereq groups, {len(official_data)} official records" + f"Starting integration with {len(lesson_tasks_data)} lesson tasks, {len(course_catalog_data)} catalog courses, {len(prerequisites_data)} prereq groups, {len(official_data)} official records" ) integrated_courses = [] courses_with_prereqs = 0 - # If we have course selection data, process it - if courses_data and len(courses_data) > 0: - courses_by_code = defaultdict(list) - for course in courses_data: + # Create index of lesson tasks by course code + lesson_tasks_by_code = defaultdict(list) + if lesson_tasks_data: + for course in lesson_tasks_data: course_code = course.get("courseCode") if course_code: - courses_by_code[course_code].append(course) + lesson_tasks_by_code[course_code].append(course) - for course_code, course_list in courses_by_code.items(): - merged = self._merge_course_sections(course_list) - if not merged: - continue - - course_id = merged.get("courseId") - catalog_info = course_details.get(course_id, {}) - prereq_info = prerequisites.get(course_id, []) - official_info = official_data.get(course_code, {}) - - if prereq_info: - courses_with_prereqs += 1 - logger.debug( - f"Course {course_code} (ID: {course_id}) has {len(prereq_info)} prereqs" - ) - - course_data = self._build_course_record( - course_code, merged, catalog_info, prereq_info, official_info - ) - - if course_data: - integrated_courses.append(course_data) - - # If we only have official data 
(no course selection data), create courses from official data - if (not courses_data or len(courses_data) == 0) and official_data: - logger.info("Creating courses from official website data only") + # Create index of official data by course code + official_by_code = {} + if official_data: for course_code, official_info in official_data.items(): - # Create empty main_data for courses that only exist in official website - empty_main_data = {} - empty_catalog_data = {} - empty_prereq_data = [] - - course_data = self._build_course_record( - course_code, - empty_main_data, - empty_catalog_data, - empty_prereq_data, - official_info, + official_by_code[course_code] = official_info + + # Create index of prerequisites by course code + prereq_by_code = {} + if prerequisites_data: + for course_id, prereq_list in prerequisites_data.items(): + # Find course code for this course_id from catalog + if isinstance(course_catalog_data, dict): + catalog_info = course_catalog_data.get(course_id) + if catalog_info: + course_code = catalog_info.get("courseCode") + if course_code: + prereq_by_code[course_code] = prereq_list + + # Get all course codes from course catalog (primary source) + all_course_codes = set() + if isinstance(course_catalog_data, dict): + # If course_catalog_data is dict {courseId: course_info} + for course_id, catalog_info in course_catalog_data.items(): + course_code = catalog_info.get("courseCode") + if course_code: + all_course_codes.add(course_code) + elif isinstance(course_catalog_data, list): + # If course_catalog_data is list [course_info, ...] + for catalog_info in course_catalog_data: + course_code = catalog_info.get("courseCode") + if course_code: + all_course_codes.add(course_code) + + # Add course codes that only exist in official data + all_course_codes.update(official_by_code.keys()) + + logger.info(f"Processing {len(all_course_codes)} unique course codes") + + # Process each course code + for course_code in all_course_codes: + # Get catalog info for this course code + catalog_info = {} + if isinstance(course_catalog_data, dict): + for course_id, info in course_catalog_data.items(): + if info.get("courseCode") == course_code: + catalog_info = info + break + elif isinstance(course_catalog_data, list): + for info in course_catalog_data: + if info.get("courseCode") == course_code: + catalog_info = info + break + + # Get data from other sources + lesson_tasks_list = lesson_tasks_by_code.get(course_code, []) + official_info = official_by_code.get(course_code, {}) + prereq_info = prereq_by_code.get(course_code, []) + + # Merge lesson tasks sections if available + merged_lesson_tasks = {} + if lesson_tasks_list: + merged_lesson_tasks = self._merge_course_sections(lesson_tasks_list) + + if prereq_info: + courses_with_prereqs += 1 + logger.debug( + f"Course {course_code} has {len(prereq_info)} prerequisites" ) - if course_data: - integrated_courses.append(course_data) + # Build course record (catalog data as base, supplemented by others) + course_data = self._build_course_record( + course_code, + merged_lesson_tasks, + catalog_info, + prereq_info, + official_info, + ) + + if course_data: + integrated_courses.append(course_data) logger.info( f"Integration complete: {courses_with_prereqs} courses have prerequisites, {len(integrated_courses)} total courses" @@ -652,7 +690,7 @@ def _merge_course_sections(self, course_list): def _build_course_record( self, course_code, main_data, catalog_data, prereq_data, official_data=None ): - """Build standardized course record with official website data 
integration""" + """Build standardized course record with official website data as primary source""" if official_data is None: official_data = {} @@ -665,11 +703,15 @@ def _build_course_record( department, number = self._parse_course_code(course_code) course_credits = self._extract_course_credits(main_data, catalog_data) prerequisites = self._build_prerequisites_string(course_code, prereq_data) - description = self._extract_description(official_data) + + # Description and topics only from official data + # If course only exists in course selection system, these will be empty + description = self._extract_description(official_data) if official_data else "" + course_topics = official_data.get("course_topics", []) if official_data else [] + official_url = official_data.get("official_url", "") if official_data else "" + + # Instructors from course selection data (more current) instructors = self._extract_instructors(main_data, catalog_data) - # Get course topics and official URL from official website data - course_topics = official_data.get("course_topics", []) - official_url = official_data.get("official_url", "") # Use official URL as primary URL, fallback to API URL if not available course_url = official_url or self._build_course_url(main_data) @@ -681,11 +723,11 @@ def _build_course_record( "number": number, "course_credits": course_credits, "pre_requisites": prerequisites, - "description": description, - "course_topics": course_topics, + "description": description, # Empty if only coursesel data + "course_topics": course_topics, # Empty if only coursesel data "instructors": instructors, "url": course_url, - "official_url": official_url, + "official_url": official_url, # Empty if only coursesel data } def _extract_course_title(self, main_data, catalog_data, official_data=None): @@ -710,11 +752,33 @@ def _parse_course_code(self, course_code): number = 0 if course_code: - # Match DEPT###(#)?J? (3 or 4 digits, J is optional) - match = re.match(r"^([A-Z]{2,4})(\d{3,4})J?$", course_code) + # Convert to uppercase for consistent matching + code_upper = course_code.upper() + + # Try standard format first: DEPT###(#)?J? 
(3 or 4 digits, J is optional) + match = re.match(r"^([A-Z]{2,4})(\d{3,4})J?$", code_upper) if match: department = match.group(1) number = int(match.group(2)) + else: + # Try alternative formats for course codes that don't follow standard pattern + # Format: Letter(s) + Numbers (e.g., C032710, F034546, X413515) + alt_match = re.match(r"^([A-Z]+)(\d+)$", code_upper) + if alt_match: + department = alt_match.group(1) + try: + number = int(alt_match.group(2)) + except ValueError: + number = 0 + else: + # For complex codes like "VE507(5002)", extract the main part + complex_match = re.match(r"^([A-Z]{2,4})(\d{3,4})", code_upper) + if complex_match: + department = complex_match.group(1) + try: + number = int(complex_match.group(2)) + except ValueError: + number = 0 return department, number @@ -768,9 +832,26 @@ def _normalize_prerequisites_to_english(self, prerequisites_text): if not prerequisites_text: return "" - # For now, return as-is since we removed Chinese translations - # Can be enhanced later to handle specific text transformations - return prerequisites_text + # Dictionary for Chinese to English translations + translations = { + "已获学分": "Obtained Credit", + "已提交学分": "Credits Submitted", + "学分": "Credit", + "先修": "Prerequisite", + "课程": "Course", + "或": "or", + "且": "and", + "以上": "above", + "学期": "Semester", + "年级": "Grade", + } + + # Apply translations + normalized_text = prerequisites_text + for chinese_term, english_term in translations.items(): + normalized_text = normalized_text.replace(chinese_term, english_term) + + return normalized_text def _extract_description(self, official_data=None): """Extract course description (only from official website)""" @@ -786,13 +867,33 @@ def _extract_instructors(self, main_data, catalog_data): if catalog_data is None: catalog_data = {} - instructors = main_data.get("all_instructors", []) - teacher_name = catalog_data.get("teacherName", "") + instructors = [] + + # Extract from lesson tasks data (main_data) + # Check for lessonTaskTeam field (string format) + lesson_task_team = main_data.get("lessonTaskTeam", "").strip() + if lesson_task_team: + instructors.append(lesson_task_team) + + # Check for firstSpeakerName field + first_speaker = main_data.get("firstSpeakerName", "").strip() + if first_speaker and first_speaker not in instructors: + instructors.append(first_speaker) + + # Check for all_instructors field (backward compatibility) + all_instructors = main_data.get("all_instructors", []) + if isinstance(all_instructors, list): + for instructor in all_instructors: + if instructor.strip() and instructor.strip() not in instructors: + instructors.append(instructor.strip()) + # Extract from catalog data (teacherName field) + teacher_name = catalog_data.get("teacherName", "").strip() if teacher_name: for teacher in re.split(r"[,;]", teacher_name): - if teacher.strip() and teacher.strip() not in instructors: - instructors.append(teacher.strip()) + teacher = teacher.strip() + if teacher and teacher not in instructors: + instructors.append(teacher) return instructors diff --git a/apps/spider/manager.py b/apps/spider/manager.py index b84aa71..e61c0ce 100644 --- a/apps/spider/manager.py +++ b/apps/spider/manager.py @@ -27,6 +27,20 @@ def __init__(self, cache_dir=None): self.cache_dir = Path(cache_dir) self.cache_dir.mkdir(exist_ok=True) + def save_to_json(self, data, data_type): + """Save data to json file with overwrite (no timestamp)""" + filename = f"{data_type}.json" + filepath = self.cache_dir / filename + + print(f"Saving data to: {filepath}") + 
print(f"Data count: {len(data) if isinstance(data, list) else 1}") + + with open(filepath, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + + print(f"Data saved to: {filepath}") + return filepath + def save_to_jsonl(self, data, data_type): """Save data to jsonl file with overwrite (no timestamp)""" filename = f"{data_type}.jsonl" @@ -48,15 +62,15 @@ def save_to_jsonl(self, data, data_type): return filepath def save_coursesel_data(self, lesson_tasks, course_catalog, prerequisites): - """Save course selection system data to separate jsonl files with overwrite""" + """Save course selection system data to separate files with overwrite""" saved_files = {} - # Save lesson tasks data + # Save lesson tasks data as JSON if lesson_tasks: - filepath = self.save_to_jsonl(lesson_tasks, "coursesel_lesson_tasks") + filepath = self.save_to_json(lesson_tasks, "coursesel_lesson_tasks") saved_files["lesson_tasks"] = filepath - # Save course catalog data + # Save course catalog data as JSONL if course_catalog: # Convert dict to list for consistent format catalog_list = ( @@ -67,7 +81,7 @@ def save_coursesel_data(self, lesson_tasks, course_catalog, prerequisites): filepath = self.save_to_jsonl(catalog_list, "coursesel_course_catalog") saved_files["course_catalog"] = filepath - # Save prerequisites data + # Save prerequisites data as JSONL if prerequisites: # Convert defaultdict to regular dict, then to list prereq_list = [] @@ -79,6 +93,11 @@ def save_coursesel_data(self, lesson_tasks, course_catalog, prerequisites): return saved_files + def load_from_json(self, filepath): + """Load data from json file""" + with open(filepath, "r", encoding="utf-8") as f: + return json.load(f) + def load_from_jsonl(self, filepath): """Load data from jsonl file""" data = [] @@ -88,16 +107,27 @@ def load_from_jsonl(self, filepath): data.append(json.loads(line)) return data + def load_data_file(self, filepath): + """Load data from either JSON or JSONL file based on extension""" + if filepath.suffix.lower() == ".json": + return self.load_from_json(filepath) + elif filepath.suffix.lower() == ".jsonl": + return self.load_from_jsonl(filepath) + else: + raise ValueError(f"Unsupported file format: {filepath.suffix}") + def list_cache_files(self): - """List all cache files""" - files = list(self.cache_dir.glob("*.jsonl")) + """List all cache files (both .json and .jsonl)""" + files = list(self.cache_dir.glob("*.jsonl")) + list( + self.cache_dir.glob("*.json") + ) files.sort(key=lambda x: x.stat().st_mtime, reverse=True) return files def get_cache_info(self, filepath): """Get cache file information""" stat = filepath.stat() - data = self.load_from_jsonl(filepath) + data = self.load_data_file(filepath) return { "filename": filepath.name, @@ -172,7 +202,7 @@ def interactive_cache_manager(): def preview_data_before_import(filepath, limit=10): """Preview data before import""" cache = CourseDataCache() - data = cache.load_from_jsonl(filepath) + data = cache.load_data_file(filepath) print("=" * 60) print(f"Data Preview ({filepath.name})") @@ -309,7 +339,7 @@ def import_from_cache(): try: cache = CourseDataCache() - data = cache.load_from_jsonl(selected_file) + data = cache.load_data_file(selected_file) # Preview and confirm import if preview_data_before_import(selected_file, limit=10): @@ -651,11 +681,17 @@ def integrate_and_import_data(cache): for filepath in files: filename = filepath.name - data = cache.load_from_jsonl(filepath) + data = cache.load_data_file(filepath) if "coursesel_lesson_tasks" in filename: 
- lesson_tasks_data = data - print(f"[+] Loaded lesson tasks: {len(data)} records") + # Handle JSON format with nested structure + if isinstance(data, dict) and "data" in data: + lesson_tasks_data = data["data"].get("lessonTasks", []) + elif isinstance(data, list): + lesson_tasks_data = data + else: + lesson_tasks_data = [] + print(f"[+] Loaded lesson tasks: {len(lesson_tasks_data)} records") elif "coursesel_course_catalog" in filename: # Convert list back to dict course_catalog_data = { @@ -705,11 +741,15 @@ def integrate_and_import_data(cache): if import_choice in ["y", "yes"]: try: print("\n[*] Importing to database...") - # Here you would call the actual database import function - # For now, just print the action - print( - "[+] Data import completed (placeholder - implement actual database import)" - ) + # Use the actual database import function + from apps.spider.crawlers.orc import import_department + + result = import_department(integrated_data) + + print(f"[+] Database import completed!") + print(f" Success: {result['success']} courses") + print(f" Errors: {result['errors']} courses") + print(f" Total processed: {result['success'] + result['errors']}") except Exception as e: print(f"[-] Database import failed: {str(e)}") From c6f71b90370c2a4ad8ba2dbd0d40841ac11aabe7 Mon Sep 17 00:00:00 2001 From: Leqi Tang Date: Sun, 12 Oct 2025 02:44:14 +0800 Subject: [PATCH 15/20] refactor(crawlers): decompose some functions into different classes --- apps/spider/README.md | 0 apps/spider/crawlers/README.md | 8 +- apps/spider/crawlers/orc.py | 1053 ++++++++++++++++++-------------- apps/spider/manager.py | 521 +++++++++++----- apps/spider/utils.py | 13 + 5 files changed, 978 insertions(+), 617 deletions(-) create mode 100644 apps/spider/README.md diff --git a/apps/spider/README.md b/apps/spider/README.md new file mode 100644 index 0000000..e69de29 diff --git a/apps/spider/crawlers/README.md b/apps/spider/crawlers/README.md index a4b2baa..7602113 100644 --- a/apps/spider/crawlers/README.md +++ b/apps/spider/crawlers/README.md @@ -335,6 +335,8 @@ ## https://coursesel.umji.sjtu.edu.cn/tpm/findLessonTasksPreview_ElectTurn.action ### Data Fields -- electTurn -- courseType -- lessonTask +- electTurn(uniquely identifying this selection round) +- courseType(maybe useful) +- lessonTask(courseCode,faculty,maxNum,termIdm,termName) +- lessonCalenderConflict(maybe useful) +- lessonCalenderWeek(couseId) \ No newline at end of file diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index 5c43174..fa2ef99 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -19,38 +19,102 @@ # Set up logger logger = logging.getLogger(__name__) -# API endpoints for course selection system -BASE_URL = "https://coursesel.umji.sjtu.edu.cn" -COURSE_DETAIL_URL_PREFIX = urllib.parse.urljoin(BASE_URL, "/course/") -# Official website endpoints for detailed course info -OFFICIAL_BASE_URL = "https://www.ji.sjtu.edu.cn/" -OFFICIAL_ORC_BASE_URL = urljoin( - OFFICIAL_BASE_URL, "/academics/courses/courses-by-number/" -) -OFFICIAL_COURSE_DETAIL_URL_PREFIX = ( - "https://www.ji.sjtu.edu.cn/academics/courses/courses-by-number/course-info/?id=" -) +class CrawlerConfig: + """ + Configuration class for all crawler components + + Contains URLs, timeouts, limits, and other configurable parameters + used across different crawler classes. 
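+
+    Minimal usage sketch (illustrative only; it assumes the aiohttp/asyncio
+    objects already used elsewhere in this module):
+
+        timeout = aiohttp.ClientTimeout(
+            total=CrawlerConfig.REQUEST_TIMEOUT,
+            connect=CrawlerConfig.CONNECT_TIMEOUT,
+            sock_read=CrawlerConfig.READ_TIMEOUT,
+        )
+        semaphore = asyncio.Semaphore(CrawlerConfig.CONCURRENT_REQUESTS)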
+ """ + + # Course Selection System API Configuration + COURSESEL_BASE_URL = "https://coursesel.umji.sjtu.edu.cn" + COURSESEL_APIS = ["lesson_tasks", "course_catalog", "prerequisites"] + + # Official Website Configuration + OFFICIAL_BASE_URL = "https://www.ji.sjtu.edu.cn/" + OFFICIAL_ORC_BASE_URL = ( + "https://www.ji.sjtu.edu.cn/academics/courses/courses-by-number/" + ) + OFFICIAL_COURSE_DETAIL_URL_PREFIX = "https://www.ji.sjtu.edu.cn/academics/courses/courses-by-number/course-info/?id=" + + # Request Configuration + REQUEST_TIMEOUT = 30 + CONNECT_TIMEOUT = 3 + READ_TIMEOUT = 10 + MAX_RETRIES = 2 + RETRY_DELAY = 1.0 + + # Concurrency Configuration + MAX_CONNECTIONS = 50 + CONNECTIONS_PER_HOST = 25 + CONCURRENT_REQUESTS = 20 + READ_BUFFER_SIZE = 32768 + + # DNS and Connection Configuration + DNS_CACHE_TTL = 600 + KEEPALIVE_TIMEOUT = 30 + + # Data Processing Configuration + DEFAULT_BATCH_SIZE = 100 + MAX_COURSE_TOPICS = 10 + + # Headers Configuration + DEFAULT_HEADERS = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "application/json, text/javascript, */*; q=0.01", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Cache-Control": "max-age=0", + } + + # Translation Configuration + CHINESE_TO_ENGLISH_TRANSLATIONS = { + "已获学分": "Obtained Credit", + "已提交学分": "Credits Submitted", + "学分": "Credit", + "先修": "Prerequisite", + "课程": "Course", + "或": "or", + "且": "and", + "以上": "above", + "学期": "Semester", + "年级": "Grade", + } + + +# Legacy constants for backward compatibility +BASE_URL = CrawlerConfig.COURSESEL_BASE_URL +COURSE_DETAIL_URL_PREFIX = urllib.parse.urljoin(BASE_URL, "/course/") +OFFICIAL_BASE_URL = CrawlerConfig.OFFICIAL_BASE_URL +OFFICIAL_ORC_BASE_URL = CrawlerConfig.OFFICIAL_ORC_BASE_URL +OFFICIAL_COURSE_DETAIL_URL_PREFIX = CrawlerConfig.OFFICIAL_COURSE_DETAIL_URL_PREFIX OFFICIAL_UNDERGRAD_URL = OFFICIAL_ORC_BASE_URL -class CourseSelCrawler: +class CourseSelAPICrawler: """ - JI SJTU Course Selection System Crawler + Course Selection System API Crawler - Crawls course data from three APIs: - 1. Lesson tasks API: course offerings and basic info - 2. Course catalog API: detailed descriptions - 3. Prerequisites API: prerequisite rules + Handles authentication and data retrieval from the JI SJTU course selection system. + Supports three main APIs: lesson tasks, course catalog, and prerequisites. 
""" def __init__(self, jsessionid=None): - """Initialize crawler with session and authentication""" + """ + Initialize the Course Selection API crawler + + Args: + jsessionid (str, optional): Session ID for authentication + """ self.session = requests.Session() self.jsessionid = jsessionid self._initialized = False - logger.info("Crawler created (not yet initialized)") + logger.info("CourseSelAPICrawler created") def _ensure_initialized(self): """Ensure crawler is properly initialized with authentication""" @@ -68,92 +132,68 @@ def _ensure_initialized(self): cookies = {"JSESSIONID": self.jsessionid} self.session.cookies.update(cookies) - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", - "Accept": "application/json, text/javascript, */*; q=0.01", - "Referer": BASE_URL, - "X-Requested-With": "XMLHttpRequest", - } + headers = CrawlerConfig.DEFAULT_HEADERS.copy() + headers.update( + { + "Referer": CrawlerConfig.COURSESEL_BASE_URL, + "X-Requested-With": "XMLHttpRequest", + } + ) self.session.headers.update(headers) self._initialized = True - logger.info("Crawler initialized successfully!") + logger.info("CourseSelAPICrawler initialized successfully") - def get_all_course_data(self, include_coursesel=True, include_official=True): + def set_session_id(self, jsessionid): """ - Get all course data from multiple APIs and official website - Pure data extraction without user interaction + Set or update the session ID Args: - include_coursesel: Whether to include course selection system data - include_official: Whether to include official website data + jsessionid (str): New session ID + """ + self.jsessionid = jsessionid + self._initialized = False + self._ensure_initialized() + + def crawl_all_apis(self, apis=None): + """ + Crawl data from all or specified APIs + + Args: + apis (list, optional): List of API names to crawl. + If None, crawl all APIs. 
Returns: - list: Course data with prerequisites, descriptions, and instructors + dict: Dictionary containing data from each API """ - courses_data = [] - course_details = {} - prerequisites = {} - official_data = {} - - if include_coursesel: - self._ensure_initialized() # Make sure crawler is initialized - logger.info("Crawling course selection system data...") - # Get data from course selection APIs - courses_data = self._get_lesson_tasks() - course_details = self._get_course_catalog() - prerequisites = self._get_prerequisites() - else: - logger.info("Skipping course selection system data") - - if include_official: - logger.info("Crawling official website data...") - # Get official website data for enhanced descriptions - official_data = self._get_official_website_data() - else: - logger.info("Skipping official website data") - - # Integrate data - integrated_data = self._integrate_course_data( - courses_data, course_details, prerequisites, official_data - ) + if apis is None: + apis = CrawlerConfig.COURSESEL_APIS - logger.info( - f"Integrated data count: {len(integrated_data) if integrated_data else 0}" - ) + self._ensure_initialized() - return integrated_data + results = {} - def _get_current_elect_turn_id(self): - """Get current election turn ID dynamically""" - url = f"{BASE_URL}/tpm/findStudentElectTurns_ElectTurn.action" + if "lesson_tasks" in apis: + results["lesson_tasks"] = self.crawl_lesson_tasks() - try: - response = self.session.get(url, params={"_t": int(time.time() * 1000)}) - response.raise_for_status() - data = response.json() + if "course_catalog" in apis: + results["course_catalog"] = self.crawl_course_catalog() - if data and isinstance(data, list) and len(data) > 0: - # Get the first (current) election turn - current_turn = data[0] - elect_turn_id = current_turn.get("electTurnId") - if elect_turn_id: - logger.debug(f"Found current electTurnId: {elect_turn_id}") - return elect_turn_id + if "prerequisites" in apis: + results["prerequisites"] = self.crawl_prerequisites() - logger.error("Could not find current electTurnId in API response") - raise ValueError( - "Unable to get current electTurnId - API returned no valid election turns" - ) - except Exception as e: - logger.error(f"Error getting electTurnId: {e}") - raise RuntimeError(f"Failed to retrieve electTurnId from API: {e}") from e + return results + + def crawl_lesson_tasks(self): + """ + Crawl lesson task data from the course selection API - def _get_lesson_tasks(self): - """Get lesson task data from course selection API""" - url = f"{BASE_URL}/tpm/findLessonTasksPreview_ElectTurn.action" + Returns: + list: List of lesson task records + """ + self._ensure_initialized() - # Get current election turn ID dynamically + url = f"{CrawlerConfig.COURSESEL_BASE_URL}/tpm/findLessonTasksPreview_ElectTurn.action" elect_turn_id = self._get_current_elect_turn_id() json_params = { @@ -179,19 +219,27 @@ def _get_lesson_tasks(self): full_url = f"{url}?jsonString={encoded_json}" try: - response = self.session.get(full_url, timeout=30) + response = self.session.get(full_url, timeout=CrawlerConfig.REQUEST_TIMEOUT) response.raise_for_status() data = response.json() if data.get("success") and "data" in data and "lessonTasks" in data["data"]: return data["data"]["lessonTasks"] return [] - except Exception: + except Exception as e: + logger.error(f"Error crawling lesson tasks: {e}") return [] - def _get_course_catalog(self): - """Get course catalog data with detailed descriptions""" - url = 
f"{BASE_URL}/jdji/tpm/findOwnCollegeCourse_JiCourse.action" + def crawl_course_catalog(self): + """ + Crawl course catalog data with detailed descriptions + + Returns: + dict: Dictionary mapping course IDs to course information + """ + self._ensure_initialized() + + url = f"{CrawlerConfig.COURSESEL_BASE_URL}/jdji/tpm/findOwnCollegeCourse_JiCourse.action" try: response = self.session.post(url, json={}) @@ -205,22 +253,26 @@ def _get_course_catalog(self): return {} return {course.get("courseId"): course for course in courses} return {} - except Exception: + except Exception as e: + logger.error(f"Error crawling course catalog: {e}") return {} - def _get_prerequisites(self): - """Get prerequisite data with course requirements and logic""" - url = f"{BASE_URL}/tpm/findAll_PrerequisiteCourse.action" + def crawl_prerequisites(self): + """ + Crawl prerequisite data with course requirements and logic + + Returns: + dict: Dictionary mapping course IDs to prerequisite lists + """ + self._ensure_initialized() + + url = ( + f"{CrawlerConfig.COURSESEL_BASE_URL}/tpm/findAll_PrerequisiteCourse.action" + ) try: logger.info(f"Requesting Prerequisites API: {url}") - logger.info(f"Session cookies: {dict(self.session.cookies)}") response = self.session.post(url, json={}) - logger.info(f"Response status: {response.status_code}") - logger.info(f"Response headers: {dict(response.headers)}") - logger.info(f"Response content length: {len(response.content)}") - logger.info(f"Response content (first 500 chars): {response.text[:500]}") - response.raise_for_status() if not response.text.strip(): @@ -229,18 +281,10 @@ def _get_prerequisites(self): data = response.json() - logger.debug(f"Prerequisites API response: success={data.get('success')}") - logger.debug( - f"Data keys: {list(data.keys()) if isinstance(data, dict) else 'Not dict'}" - ) - if data.get("success") and "data" in data: raw_prereqs = data["data"] logger.debug(f"Raw prerequisites data: {len(raw_prereqs)} items") - if raw_prereqs and len(raw_prereqs) > 0: - logger.debug(f"First prerequisite item: {raw_prereqs[0]}") - prereqs = defaultdict(list) for item in raw_prereqs: course_id = item.get("courseId") @@ -250,7 +294,7 @@ def _get_prerequisites(self): logger.debug( f"Grouped prerequisites: {len(prereqs)} course IDs have prereqs" ) - return prereqs + return dict(prereqs) else: logger.warning("Prerequisites API failed or no data") return {} @@ -258,49 +302,103 @@ def _get_prerequisites(self): logger.error(f"Prerequisites API error: {str(e)}") return {} - def _get_official_website_data(self): - """Get course data from official JI website for enhanced descriptions""" + def _get_current_elect_turn_id(self): + """ + Get current election turn ID dynamically + + Returns: + str: Current election turn ID + + Raises: + RuntimeError: If unable to retrieve election turn ID + """ + url = f"{CrawlerConfig.COURSESEL_BASE_URL}/tpm/findStudentElectTurns_ElectTurn.action" + + try: + response = self.session.get(url) + response.raise_for_status() + data = response.json() + + if data and isinstance(data, list) and len(data) > 0: + current_turn = data[0] + elect_turn_id = current_turn.get("electTurnId") + if elect_turn_id: + logger.debug(f"Found current electTurnId: {elect_turn_id}") + return elect_turn_id + + logger.error("Could not find current electTurnId in API response") + raise ValueError( + "Unable to get current electTurnId - API returned no valid election turns" + ) + except Exception as e: + logger.error(f"Error getting electTurnId: {e}") + raise RuntimeError(f"Failed 
to retrieve electTurnId from API: {e}") from e + + +class OfficialWebsiteCrawler: + """ + Official JI Website Crawler + + Handles asynchronous crawling of course data from the official JI SJTU website. + Provides detailed course descriptions and additional course information. + """ + + def __init__(self): + """Initialize the Official Website crawler""" + logger.info("OfficialWebsiteCrawler created") + + async def crawl_official_data(self): + """ + Crawl course data from the official JI website + + Returns: + dict: Dictionary mapping course codes to course information + """ logger.info("Fetching course data from official website") try: - # Run the async crawler - return asyncio.run(self._get_official_website_data_async()) + return await self._crawl_official_data_async() except Exception as e: logger.error(f"Error fetching official website data: {str(e)}") return {} - async def _get_official_website_data_async(self): - """Optimized async version of official website data fetching with enhanced concurrency""" + async def _crawl_official_data_async(self): + """ + Optimized async version of official website data fetching with enhanced concurrency + + Returns: + dict: Dictionary mapping course codes to course information + """ # Get all course URLs from official website official_urls = self._get_official_course_urls() - print(f"DEBUG: Found {len(official_urls)} official course URLs") + logger.info(f"Found {len(official_urls)} official course URLs") if not official_urls: logger.warning("No official course URLs found") return {} - logger.info(f"Found {len(official_urls)} course URLs to crawl") - - # Optimized timeout and session settings for maximum speed - timeout = aiohttp.ClientTimeout(total=15, connect=3, sock_read=10) - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1", - "Cache-Control": "max-age=0", - } + # Configure optimized timeout and session settings + timeout = aiohttp.ClientTimeout( + total=CrawlerConfig.REQUEST_TIMEOUT, + connect=CrawlerConfig.CONNECT_TIMEOUT, + sock_read=CrawlerConfig.READ_TIMEOUT, + ) + + headers = CrawlerConfig.DEFAULT_HEADERS.copy() + headers.update( + { + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" + } + ) # Create connector with balanced performance and stability settings connector = aiohttp.TCPConnector( - limit=50, # Reduced total connection pool size - limit_per_host=25, # Reduced connections per host - ttl_dns_cache=600, # Longer DNS cache for 10 minutes + limit=CrawlerConfig.MAX_CONNECTIONS, + limit_per_host=CrawlerConfig.CONNECTIONS_PER_HOST, + ttl_dns_cache=CrawlerConfig.DNS_CACHE_TTL, use_dns_cache=True, - keepalive_timeout=30, # Shorter keepalive to prevent hangs + keepalive_timeout=CrawlerConfig.KEEPALIVE_TIMEOUT, enable_cleanup_closed=True, ) @@ -308,46 +406,29 @@ async def _get_official_website_data_async(self): timeout=timeout, headers=headers, connector=connector, - read_bufsize=32768, # Smaller read buffer for stability + read_bufsize=CrawlerConfig.READ_BUFFER_SIZE, ) as session: # Balanced concurrent requests for stability - semaphore = asyncio.Semaphore(20) # Reduced to 20 concurrent requests + semaphore = asyncio.Semaphore(CrawlerConfig.CONCURRENT_REQUESTS) # Create tasks for 
all URLs tasks = [ - self._crawl_official_course_data_async(session, semaphore, url) + self._crawl_single_course_async(session, semaphore, url) for url in official_urls ] - # Execute all tasks concurrently with better progress tracking + # Execute all tasks concurrently official_data = {} - completed = 0 - total = len(tasks) - - # Use gather for better performance than as_completed try: - print( - f"DEBUG: Starting to crawl {total} URLs with 20 concurrent requests..." + logger.info( + f"Starting to crawl {len(tasks)} URLs with {CrawlerConfig.CONCURRENT_REQUESTS} concurrent requests" ) results = await asyncio.gather(*tasks, return_exceptions=True) - print(f"DEBUG: Received {len(results)} results") - successful = 0 failed = 0 for i, result in enumerate(results): - completed += 1 - - # More frequent progress reporting for better visibility - if completed % 20 == 0 or completed == total: - print( - f"DEBUG: Progress: {completed}/{total} ({completed / total * 100:.1f}%) - Success: {successful}, Failed: {failed}" - ) - logger.info( - f"Progress: {completed}/{total} courses processed ({completed / total * 100:.1f}%)" - ) - if isinstance(result, Exception): failed += 1 logger.warning(f"Failed to crawl course {i + 1}: {str(result)}") @@ -363,29 +444,32 @@ async def _get_official_website_data_async(self): else: failed += 1 + logger.info( + f"Successfully extracted {len(official_data)} courses from {len(tasks)} URLs" + ) + logger.info(f"Final stats - Success: {successful}, Failed: {failed}") + except Exception as e: - print(f"DEBUG: Batch crawling failed: {str(e)}") logger.error(f"Batch crawling failed: {str(e)}") return {} - print( - f"DEBUG: Successfully extracted {len(official_data)} courses from {total} URLs" - ) - print(f"DEBUG: Final stats - Success: {successful}, Failed: {failed}") - logger.info( - f"Successfully fetched official data for {len(official_data)} courses out of {total} total" - ) return official_data - async def _crawl_official_course_data_async(self, session, semaphore, course_url): - """Ultra-optimized async crawl single course data from official website""" - async with semaphore: # Limit concurrent requests - try: - # Balanced retry logic for stability - max_retries = 2 # Increased retries for stability - retry_delay = 1.0 # Longer retry delay + async def _crawl_single_course_async(self, session, semaphore, course_url): + """ + Crawl single course data from official website asynchronously + + Args: + session: aiohttp session + semaphore: asyncio semaphore for concurrency control + course_url (str): URL of the course page to crawl - for attempt in range(max_retries + 1): + Returns: + dict or None: Course data or None if failed + """ + async with semaphore: + try: + for attempt in range(CrawlerConfig.MAX_RETRIES + 1): try: async with session.get(course_url) as response: if response.status == 200: @@ -393,27 +477,23 @@ async def _crawl_official_course_data_async(self, session, semaphore, course_url return self._parse_official_course_html( html_content, course_url ) - elif response.status in [ - 429, - 503, - 502, - 504, - ]: # Retry on server errors - if attempt < max_retries: + elif response.status in [429, 503, 502, 504]: + if attempt < CrawlerConfig.MAX_RETRIES: await asyncio.sleep( - retry_delay * (attempt + 1) - ) # Exponential backoff + CrawlerConfig.RETRY_DELAY * (attempt + 1) + ) continue - # Don't retry on other errors, fail fast return None except (aiohttp.ClientError, asyncio.TimeoutError) as e: - if attempt < max_retries: - await asyncio.sleep(retry_delay * (attempt + 
1)) + if attempt < CrawlerConfig.MAX_RETRIES: + await asyncio.sleep( + CrawlerConfig.RETRY_DELAY * (attempt + 1) + ) continue else: logger.debug( - f"Failed to fetch {course_url} after {max_retries + 1} attempts: {str(e)}" + f"Failed to fetch {course_url} after {CrawlerConfig.MAX_RETRIES + 1} attempts: {str(e)}" ) return None @@ -424,193 +504,307 @@ async def _crawl_official_course_data_async(self, session, semaphore, course_url return None def _parse_official_course_html(self, html_content, course_url): - """Ultra-fast HTML parsing for course data extraction""" + """ + Parse HTML content to extract course data + + Args: + html_content (str): HTML content of the course page + course_url (str): URL of the course page + + Returns: + dict or None: Parsed course data or None if failed + """ try: from bs4 import BeautifulSoup - # Use faster lxml parser for better performance soup = BeautifulSoup(html_content, "lxml") course_heading = soup.find("h2") if not course_heading: - print(f"DEBUG: No h2 heading found in {course_url}") return None course_heading_text = course_heading.get_text() if not course_heading_text: - print(f"DEBUG: Empty h2 text in {course_url}") return None - split_course_heading = course_heading_text.split(" – ") # Using em dash + split_course_heading = course_heading_text.split(" – ") if len(split_course_heading) < 2: - print( - f"DEBUG: Invalid heading format '{course_heading_text}' in {course_url}" - ) return None - # Fast extraction with minimal processing text_inner_sections = soup.find_all(class_="et_pb_text_inner") if len(text_inner_sections) < 4: - print( - f"DEBUG: Insufficient text sections ({len(text_inner_sections)}) in {course_url}" - ) return None course_code = split_course_heading[0] course_title = split_course_heading[1] - print(f"DEBUG: Successfully parsed {course_code} - {course_title}") + # Extract course information including instructors + content_section = text_inner_sections[3] - # Fast description and topics extraction description = "" course_topics = [] + instructors = [] - # Get all text content at once for faster processing - content_section = text_inner_sections[3] + # Get all text content and parse it all_text = content_section.get_text(separator="\n", strip=True) - - # Simple text processing for speed lines = [line.strip() for line in all_text.split("\n") if line.strip()] - in_description = False - in_topics = False - + current_section = None for i, line in enumerate(lines): if "Description:" in line: - in_description = True + current_section = "description" continue elif "Course Topics:" in line or "Course topics:" in line: - in_description = False - in_topics = True + current_section = "topics" continue - elif ( - in_description - and line - and not any( - x in line for x in ["Course Topics", "Lectures", "Seminars"] - ) - ): - if description: - description += " " + line - else: - description = line - elif in_topics and line: - # Simple topic extraction - clean_line = line.lstrip("-*").strip() - if clean_line and len(course_topics) < 10: # Limit for performance + elif "Instructors:" in line: + current_section = "instructors" + continue + elif "Credits:" in line or "Pre-requisites:" in line: + # Stop parsing when we hit credits or prerequisites + current_section = None + continue + elif line and current_section == "description": + if not any( + x in line + for x in [ + "Course Topics", + "Instructors", + "Credits", + "Pre-requisites", + ] + ): + description = f"{description} {line}".strip() + elif line and current_section == "topics": + clean_line = 
line.lstrip("-*•").strip() + if ( + clean_line + and len(course_topics) < CrawlerConfig.MAX_COURSE_TOPICS + ): course_topics.append(clean_line) + elif line and current_section == "instructors": + # Stop if we encounter credits or prerequisites + if any( + keyword in line for keyword in ["Credits:", "Pre-requisites:"] + ): + break + + # Parse instructors - they might be separated by semicolons or commas + instructor_names = [] + for separator in [";", ","]: + if separator in line: + instructor_names = [ + name.strip() + for name in line.split(separator) + if name.strip() + ] + break + else: + instructor_names = [line.strip()] if line.strip() else [] + + instructors.extend(instructor_names) return { "course_code": course_code, "course_title": course_title, "description": description.strip(), "course_topics": course_topics, + "instructors": instructors, "official_url": course_url, } except Exception: - # Fast fail for maximum performance return None def _get_official_course_urls(self): - """Get all course URLs from official website""" + """ + Get all course URLs from official website + + Returns: + set: Set of course URLs + """ try: from bs4 import Tag - print(f"DEBUG: Fetching course URLs from {OFFICIAL_UNDERGRAD_URL}") - soup = retrieve_soup(OFFICIAL_UNDERGRAD_URL) + soup = retrieve_soup(CrawlerConfig.OFFICIAL_ORC_BASE_URL) if not soup: - print("DEBUG: Failed to retrieve soup from official website") + logger.error("Failed to retrieve soup from official website") return set() linked_urls = [] for a in soup.find_all("a", href=True): - # Check if it's a Tag element and has href attribute if isinstance(a, Tag) and a.has_attr("href"): href = a["href"] if href and isinstance(href, str): - full_url = urljoin(OFFICIAL_BASE_URL, href) + full_url = urljoin(CrawlerConfig.OFFICIAL_BASE_URL, href) linked_urls.append(full_url) - print(f"DEBUG: Found {len(linked_urls)} total links") - course_urls = { linked_url for linked_url in linked_urls if self._is_official_course_url(linked_url) } - print(f"DEBUG: Filtered to {len(course_urls)} course URLs") - if len(course_urls) > 0: - print(f"DEBUG: Sample course URL: {list(course_urls)[0]}") - + logger.info(f"Found {len(course_urls)} course URLs from official website") return course_urls except Exception as e: - print(f"DEBUG: Error getting official course URLs: {str(e)}") logger.error(f"Error getting official course URLs: {str(e)}") return set() def _is_official_course_url(self, candidate_url): - """Check if URL is a valid official course detail URL""" - return candidate_url.startswith(OFFICIAL_COURSE_DETAIL_URL_PREFIX) + """ + Check if URL is a valid official course detail URL + + Args: + candidate_url (str): URL to validate - def _integrate_course_data( + Returns: + bool: True if URL is a course detail page + """ + return candidate_url.startswith(CrawlerConfig.OFFICIAL_COURSE_DETAIL_URL_PREFIX) + + +class CourseDataIntegrator: + """ + Course Data Integrator + + Handles integration of course data from multiple sources: + - Course Selection System APIs + - Official Website data + - Data normalization and standardization + """ + + def __init__(self): + """Initialize the Course Data Integrator""" + logger.info("CourseDataIntegrator created") + + def integrate_data( self, lesson_tasks_data, course_catalog_data, prerequisites_data, official_data=None, ): - """Integrate course data with course catalog as primary source""" + """ + Integrate course data from multiple sources with course catalog as primary source + + Args: + lesson_tasks_data (list): Lesson task data 
from course selection API + course_catalog_data (dict): Course catalog data from course selection API + prerequisites_data (dict): Prerequisites data from course selection API + official_data (dict, optional): Official website data + + Returns: + list: List of integrated course records + """ if official_data is None: official_data = {} logger.info( - f"Starting integration with {len(lesson_tasks_data)} lesson tasks, {len(course_catalog_data)} catalog courses, {len(prerequisites_data)} prereq groups, {len(official_data)} official records" + f"Starting integration with {len(lesson_tasks_data)} lesson tasks, " + f"{len(course_catalog_data)} catalog courses, {len(prerequisites_data)} prereq groups, " + f"{len(official_data)} official records" ) integrated_courses = [] courses_with_prereqs = 0 - # Create index of lesson tasks by course code + # Create indexes for efficient lookup + lesson_tasks_by_code = self._create_lesson_tasks_index(lesson_tasks_data) + official_by_code = self._create_official_data_index(official_data) + prereq_by_code = self._create_prerequisites_index( + prerequisites_data, course_catalog_data + ) + + # Get all unique course codes + all_course_codes = self._get_all_course_codes( + course_catalog_data, official_by_code + ) + + logger.info(f"Processing {len(all_course_codes)} unique course codes") + + # Process each course code + for course_code in all_course_codes: + catalog_info = self._find_catalog_info(course_code, course_catalog_data) + lesson_tasks_list = lesson_tasks_by_code.get(course_code, []) + official_info = official_by_code.get(course_code, {}) + prereq_info = prereq_by_code.get(course_code, []) + + # Merge lesson tasks sections if available + merged_lesson_tasks = ( + self._merge_course_sections(lesson_tasks_list) + if lesson_tasks_list + else {} + ) + + if prereq_info: + courses_with_prereqs += 1 + logger.debug( + f"Course {course_code} has {len(prereq_info)} prerequisites" + ) + + # Build course record + course_data = self._build_course_record( + course_code, + merged_lesson_tasks, + catalog_info, + prereq_info, + official_info, + ) + + if course_data: + integrated_courses.append(course_data) + + logger.info( + f"Integration complete: {courses_with_prereqs} courses have prerequisites, " + f"{len(integrated_courses)} total courses" + ) + return integrated_courses + + def _create_lesson_tasks_index(self, lesson_tasks_data): + """Create index of lesson tasks by course code""" lesson_tasks_by_code = defaultdict(list) if lesson_tasks_data: for course in lesson_tasks_data: course_code = course.get("courseCode") if course_code: lesson_tasks_by_code[course_code].append(course) + return lesson_tasks_by_code - # Create index of official data by course code + def _create_official_data_index(self, official_data): + """Create index of official data by course code""" official_by_code = {} if official_data: for course_code, official_info in official_data.items(): official_by_code[course_code] = official_info + return official_by_code - # Create index of prerequisites by course code + def _create_prerequisites_index(self, prerequisites_data, course_catalog_data): + """Create index of prerequisites by course code""" prereq_by_code = {} if prerequisites_data: for course_id, prereq_list in prerequisites_data.items(): - # Find course code for this course_id from catalog if isinstance(course_catalog_data, dict): catalog_info = course_catalog_data.get(course_id) if catalog_info: course_code = catalog_info.get("courseCode") if course_code: prereq_by_code[course_code] = prereq_list 
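The prerequisites API is keyed by the internal `courseId`, while everything downstream is keyed by `courseCode`, so the index step is really a join through the catalog. A toy illustration of that two-step join; field names other than `courseId`/`courseCode` and all identifiers are invented for the example:

```python
from collections import defaultdict

# Toy payloads shaped roughly like the two API responses
catalog_by_id = {
    "id-001": {"courseId": "id-001", "courseCode": "VV286"},
    "id-002": {"courseId": "id-002", "courseCode": "VV186"},
}
raw_prereqs = [
    {"courseId": "id-001", "prerequisiteCourseCode": "VV186"},
    {"courseId": "id-001", "prerequisiteCourseCode": "VV156"},
]

# Group prerequisite rows by courseId, then translate the key to courseCode
# via the catalog -- the same join performed above.
prereqs_by_id = defaultdict(list)
for row in raw_prereqs:
    prereqs_by_id[row["courseId"]].append(row)

prereqs_by_code = {}
for course_id, rows in prereqs_by_id.items():
    info = catalog_by_id.get(course_id)
    if info and info.get("courseCode"):
        prereqs_by_code[info["courseCode"]] = rows

assert set(prereqs_by_code) == {"VV286"}
assert len(prereqs_by_code["VV286"]) == 2
```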
+ return prereq_by_code - # Get all course codes from course catalog (primary source) + def _get_all_course_codes(self, course_catalog_data, official_by_code): + """Get all unique course codes from all sources""" all_course_codes = set() + if isinstance(course_catalog_data, dict): - # If course_catalog_data is dict {courseId: course_info} for course_id, catalog_info in course_catalog_data.items(): course_code = catalog_info.get("courseCode") if course_code: all_course_codes.add(course_code) elif isinstance(course_catalog_data, list): - # If course_catalog_data is list [course_info, ...] for catalog_info in course_catalog_data: course_code = catalog_info.get("courseCode") if course_code: @@ -619,58 +813,33 @@ def _integrate_course_data( # Add course codes that only exist in official data all_course_codes.update(official_by_code.keys()) - logger.info(f"Processing {len(all_course_codes)} unique course codes") - - # Process each course code - for course_code in all_course_codes: - # Get catalog info for this course code - catalog_info = {} - if isinstance(course_catalog_data, dict): - for course_id, info in course_catalog_data.items(): - if info.get("courseCode") == course_code: - catalog_info = info - break - elif isinstance(course_catalog_data, list): - for info in course_catalog_data: - if info.get("courseCode") == course_code: - catalog_info = info - break - - # Get data from other sources - lesson_tasks_list = lesson_tasks_by_code.get(course_code, []) - official_info = official_by_code.get(course_code, {}) - prereq_info = prereq_by_code.get(course_code, []) - - # Merge lesson tasks sections if available - merged_lesson_tasks = {} - if lesson_tasks_list: - merged_lesson_tasks = self._merge_course_sections(lesson_tasks_list) + return all_course_codes - if prereq_info: - courses_with_prereqs += 1 - logger.debug( - f"Course {course_code} has {len(prereq_info)} prerequisites" - ) - - # Build course record (catalog data as base, supplemented by others) - course_data = self._build_course_record( - course_code, - merged_lesson_tasks, - catalog_info, - prereq_info, - official_info, - ) + def _find_catalog_info(self, course_code, course_catalog_data): + """Find catalog information for a specific course code""" + catalog_info = {} + if isinstance(course_catalog_data, dict): + for course_id, info in course_catalog_data.items(): + if info.get("courseCode") == course_code: + catalog_info = info + break + elif isinstance(course_catalog_data, list): + for info in course_catalog_data: + if info.get("courseCode") == course_code: + catalog_info = info + break + return catalog_info - if course_data: - integrated_courses.append(course_data) + def _merge_course_sections(self, course_list): + """ + Merge sections of the same course - logger.info( - f"Integration complete: {courses_with_prereqs} courses have prerequisites, {len(integrated_courses)} total courses" - ) - return integrated_courses + Args: + course_list (list): List of course sections - def _merge_course_sections(self, course_list): - """Merge sections of the same course""" + Returns: + dict: Merged course data + """ if not course_list: return {} @@ -690,7 +859,19 @@ def _merge_course_sections(self, course_list): def _build_course_record( self, course_code, main_data, catalog_data, prereq_data, official_data=None ): - """Build standardized course record with official website data as primary source""" + """ + Build standardized course record from multiple data sources + + Args: + course_code (str): Course code + main_data (dict): Main course data 
(lesson tasks) + catalog_data (dict): Catalog course data + prereq_data (list): Prerequisites data + official_data (dict, optional): Official website data + + Returns: + dict or None: Standardized course record or None if invalid + """ if official_data is None: official_data = {} @@ -705,13 +886,12 @@ def _build_course_record( prerequisites = self._build_prerequisites_string(course_code, prereq_data) # Description and topics only from official data - # If course only exists in course selection system, these will be empty description = self._extract_description(official_data) if official_data else "" course_topics = official_data.get("course_topics", []) if official_data else [] official_url = official_data.get("official_url", "") if official_data else "" - # Instructors from course selection data (more current) - instructors = self._extract_instructors(main_data, catalog_data) + # Instructors from multiple sources with priority: lesson tasks > catalog > official + instructors = self._extract_instructors(main_data, catalog_data, official_data) # Use official URL as primary URL, fallback to API URL if not available course_url = official_url or self._build_course_url(main_data) @@ -723,15 +903,25 @@ def _build_course_record( "number": number, "course_credits": course_credits, "pre_requisites": prerequisites, - "description": description, # Empty if only coursesel data - "course_topics": course_topics, # Empty if only coursesel data + "description": description, + "course_topics": course_topics, "instructors": instructors, "url": course_url, - "official_url": official_url, # Empty if only coursesel data + "official_url": official_url, } def _extract_course_title(self, main_data, catalog_data, official_data=None): - """Extract course title (prefer English name)""" + """ + Extract course title with preference for English names + + Args: + main_data (dict): Main course data + catalog_data (dict): Catalog course data + official_data (dict, optional): Official website data + + Returns: + str: Course title + """ if official_data is None: official_data = {} if main_data is None: @@ -748,11 +938,19 @@ def _extract_course_title(self, main_data, catalog_data, official_data=None): ).strip() def _parse_course_code(self, course_code): + """ + Parse course code to extract department and number + + Args: + course_code (str): Course code to parse + + Returns: + tuple: (department, number) where department is str and number is int + """ department = "" number = 0 if course_code: - # Convert to uppercase for consistent matching code_upper = course_code.upper() # Try standard format first: DEPT###(#)?J? 
(3 or 4 digits, J is optional) @@ -761,8 +959,7 @@ def _parse_course_code(self, course_code): department = match.group(1) number = int(match.group(2)) else: - # Try alternative formats for course codes that don't follow standard pattern - # Format: Letter(s) + Numbers (e.g., C032710, F034546, X413515) + # Try alternative formats for non-standard course codes alt_match = re.match(r"^([A-Z]+)(\d+)$", code_upper) if alt_match: department = alt_match.group(1) @@ -783,7 +980,16 @@ def _parse_course_code(self, course_code): return department, number def _extract_course_credits(self, main_data, catalog_data): - """Extract course credits""" + """ + Extract course credits from available data sources + + Args: + main_data (dict): Main course data + catalog_data (dict): Catalog course data + + Returns: + int: Course credits + """ if main_data is None: main_data = {} if catalog_data is None: @@ -802,7 +1008,16 @@ def _extract_course_credits(self, main_data, catalog_data): return course_credits def _build_prerequisites_string(self, course_code, prereq_data): - """Build prerequisites string from API data""" + """ + Build prerequisites string from API data + + Args: + course_code (str): Course code + prereq_data (list): Prerequisites data + + Returns: + str: Formatted prerequisites string + """ if not prereq_data: return "" @@ -828,49 +1043,65 @@ def _build_prerequisites_string(self, course_code, prereq_data): return "" def _normalize_prerequisites_to_english(self, prerequisites_text): - """Convert Chinese prerequisite terms to English""" + """ + Convert Chinese prerequisite terms to English + + Args: + prerequisites_text (str): Prerequisites text with potential Chinese terms + + Returns: + str: Normalized text with English terms + """ if not prerequisites_text: return "" - # Dictionary for Chinese to English translations - translations = { - "已获学分": "Obtained Credit", - "已提交学分": "Credits Submitted", - "学分": "Credit", - "先修": "Prerequisite", - "课程": "Course", - "或": "or", - "且": "and", - "以上": "above", - "学期": "Semester", - "年级": "Grade", - } - - # Apply translations normalized_text = prerequisites_text - for chinese_term, english_term in translations.items(): + for ( + chinese_term, + english_term, + ) in CrawlerConfig.CHINESE_TO_ENGLISH_TRANSLATIONS.items(): normalized_text = normalized_text.replace(chinese_term, english_term) return normalized_text def _extract_description(self, official_data=None): - """Extract course description (only from official website)""" + """ + Extract course description from official website data + + Args: + official_data (dict, optional): Official website data + + Returns: + str: Course description + """ if official_data is None: official_data = {} return official_data.get("description", "").strip() - def _extract_instructors(self, main_data, catalog_data): - """Extract and merge instructor information""" + def _extract_instructors(self, main_data, catalog_data, official_data=None): + """ + Extract and merge instructor information from multiple sources + Priority: 1) lesson tasks, 2) catalog data, 3) official website + + Args: + main_data (dict): Main course data + catalog_data (dict): Catalog course data + official_data (dict): Official website course data + + Returns: + list: List of instructor names + """ if main_data is None: main_data = {} if catalog_data is None: catalog_data = {} + if official_data is None: + official_data = {} instructors = [] - # Extract from lesson tasks data (main_data) - # Check for lessonTaskTeam field (string format) + # Priority 1: Extract 
from lesson tasks data lesson_task_team = main_data.get("lessonTaskTeam", "").strip() if lesson_task_team: instructors.append(lesson_task_team) @@ -887,7 +1118,7 @@ def _extract_instructors(self, main_data, catalog_data): if instructor.strip() and instructor.strip() not in instructors: instructors.append(instructor.strip()) - # Extract from catalog data (teacherName field) + # Priority 2: Extract from catalog data (teacherName field) teacher_name = catalog_data.get("teacherName", "").strip() if teacher_name: for teacher in re.split(r"[,;]", teacher_name): @@ -895,143 +1126,27 @@ def _extract_instructors(self, main_data, catalog_data): if teacher and teacher not in instructors: instructors.append(teacher) + # Priority 3: Extract from official website data (fallback) + if not instructors: # Only use if no instructors found from other sources + official_instructors = official_data.get("instructors", []) + if isinstance(official_instructors, list): + for instructor in official_instructors: + if instructor.strip() and instructor.strip() not in instructors: + instructors.append(instructor.strip()) + return instructors def _build_course_url(self, main_data): - """Build course detail page URL""" + """ + Build course detail page URL + + Args: + main_data (dict): Main course data + + Returns: + str: Course detail URL or empty string + """ if main_data is None: main_data = {} course_id = main_data.get("courseId") return f"{COURSE_DETAIL_URL_PREFIX}{course_id}" if course_id else "" - - -_crawler = None -_course_data_cache = {} - - -def _get_crawler(): - """Get crawler instance (singleton pattern)""" - global _crawler - if _crawler is None: - _crawler = CourseSelCrawler() - return _crawler - - -def crawl_program_urls(): - """Get all course URLs (legacy interface)""" - global _course_data_cache - - crawler = _get_crawler() - courses = crawler.get_all_course_data() - - course_urls = [] - _course_data_cache = {} # Reset cache - - for course in courses: - if course.get("url"): - course_urls.append(course["url"]) - _course_data_cache[course["url"]] = course - - return course_urls - - -def _crawl_course_data(course_url): - """Crawl single course data (legacy interface)""" - global _course_data_cache - - course_data = _course_data_cache.get(course_url) - if course_data: - return course_data - - return {} - - -def import_department(department_data): - """Import course data to database with improved error handling""" - success_count = 0 - error_count = 0 - - for course_data in department_data: - try: - # Validate required fields - required_fields = ["course_code", "course_title"] - missing_fields = [ - field for field in required_fields if not course_data.get(field) - ] - - if missing_fields: - logger.warning( - f"Skipping course due to missing required fields: {missing_fields}" - ) - error_count += 1 - continue - - # Prepare default values, handle potentially missing fields - defaults = { - "course_title": course_data.get("course_title", ""), - "department": course_data.get("department", ""), - "number": course_data.get("number", 0), - "course_credits": course_data.get("course_credits", 0), - "pre_requisites": course_data.get("pre_requisites", ""), - "description": course_data.get("description", ""), - "course_topics": course_data.get("course_topics", []), - "url": course_data.get("url", ""), - } - - # Note: official_url field does not exist in Course model, so it's not included - - # Create or update course - course, created = Course.objects.update_or_create( - course_code=course_data["course_code"], - 
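The `_extract_instructors` helper above is essentially a priority merge with de-duplication: the free-text team string from lesson tasks is kept whole, catalog `teacherName` entries are split on commas or semicolons, and the official-site list is consulted only when the other two yield nothing. A condensed, self-contained version of that logic; the input values are made up:

```python
import re

def merge_instructors(lesson_task_team="", teacher_name="", official=None):
    """Toy version of the priority merge: lesson tasks > catalog > official website."""
    official = official or []
    instructors = []

    def add(name):
        name = name.strip()
        if name and name not in instructors:
            instructors.append(name)

    if lesson_task_team:
        add(lesson_task_team)            # free-text team string kept as-is
    for part in re.split(r"[,;]", teacher_name or ""):
        add(part)                        # catalog names, comma/semicolon separated
    if not instructors:                  # official site only as a last resort
        for name in official:
            add(name)
    return instructors

print(merge_instructors("Jane Doe", "Jane Doe; John Roe"))   # ['Jane Doe', 'John Roe']
print(merge_instructors("", "", ["Official Only"]))          # ['Official Only']
```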
defaults=defaults, - ) - - # Handle instructor information - instructors = course_data.get("instructors", []) - if instructors: - for instructor_name in instructors: - if instructor_name.strip(): # Ensure instructor name is not empty - try: - instructor, _ = Instructor.objects.get_or_create( - name=instructor_name.strip() - ) - - offering, _ = CourseOffering.objects.get_or_create( - course=course, - term=CURRENT_TERM, - defaults={"section": 1, "period": ""}, - ) - offering.instructors.add(instructor) - except Exception as e: - logger.warning( - f"Error creating instructor {instructor_name}: {str(e)}" - ) - - success_count += 1 - if created: - logger.info(f"Created new course: {course_data['course_code']}") - else: - logger.info(f"Updated course: {course_data['course_code']}") - - except Exception as e: - error_count += 1 - course_code = course_data.get("course_code", "Unknown") - error_msg = str(e) - print(f"Error importing course {course_code}: {error_msg}") - logger.error(f"Error importing course {course_code}: {error_msg}") - - logger.info(f"Import completed: {success_count} successful, {error_count} errors") - return {"success": success_count, "errors": error_count} - - -def extract_prerequisites(pre_requisites): - """Process prerequisite string format (legacy function)""" - result = pre_requisites - - result = result.replace("Pre-requisites:", "").strip() - result = result.replace("Obtained Credit", "obtained_credit").strip() - result = result.replace("Credits Submitted", "credits_submitted").strip() - result = result.replace("&&", " && ").strip() - result = result.replace("||", " || ").strip() - - return result diff --git a/apps/spider/manager.py b/apps/spider/manager.py index e61c0ce..0936d7c 100644 --- a/apps/spider/manager.py +++ b/apps/spider/manager.py @@ -7,6 +7,7 @@ import os import sys import django +import asyncio from datetime import datetime from pathlib import Path @@ -17,6 +18,299 @@ django.setup() +class CrawlerManager: + """ + Unified crawler management system + + Coordinates between different crawler components and provides + a clean interface for data extraction and integration. 
+ """ + + def __init__(self, cache_dir=None): + self.cache = CourseDataCache(cache_dir) + self.config = None + self.api_crawler = None + self.website_crawler = None + self.integrator = None + self._initialize_components() + + def _initialize_components(self): + """Initialize crawler components with lazy loading""" + try: + from apps.spider.crawlers.orc import ( + CrawlerConfig, + CourseSelAPICrawler, + OfficialWebsiteCrawler, + CourseDataIntegrator, + ) + + self.config = CrawlerConfig() + self.website_crawler = OfficialWebsiteCrawler() + self.integrator = CourseDataIntegrator() + except ImportError as e: + print(f"Warning: Could not import crawler components: {e}") + + def create_api_crawler(self, jsessionid): + """Create API crawler with authentication""" + try: + from apps.spider.crawlers.orc import CourseSelAPICrawler + + self.api_crawler = CourseSelAPICrawler(jsessionid) + return self.api_crawler + except ImportError as e: + print(f"Error: Could not import CourseSelAPICrawler: {e}") + return None + + def crawl_coursesel_data(self, jsessionid, apis=None): + """ + Crawl data from course selection system + + Args: + jsessionid: Authentication session ID + apis: List of APIs to crawl ('lesson_tasks', 'course_catalog', 'prerequisites') + If None, crawls all APIs + + Returns: + dict: Dictionary with crawled data + """ + if apis is None: + apis = ["lesson_tasks", "course_catalog", "prerequisites"] + + api_crawler = self.create_api_crawler(jsessionid) + if not api_crawler: + return {} + + results = {} + + try: + if "lesson_tasks" in apis: + print("[*] Crawling lesson tasks...") + results["lesson_tasks"] = api_crawler.crawl_lesson_tasks() + print(f"[+] Retrieved {len(results['lesson_tasks'])} lesson tasks") + + if "course_catalog" in apis: + print("[*] Crawling course catalog...") + results["course_catalog"] = api_crawler.crawl_course_catalog() + print( + f"[+] Retrieved {len(results['course_catalog'])} courses from catalog" + ) + + if "prerequisites" in apis: + print("[*] Crawling prerequisites...") + results["prerequisites"] = api_crawler.crawl_prerequisites() + print( + f"[+] Retrieved prerequisites for {len(results['prerequisites'])} courses" + ) + + # Save to cache + if results: + saved_files = self.cache.save_coursesel_data( + results.get("lesson_tasks"), + results.get("course_catalog"), + results.get("prerequisites"), + ) + print(f"[+] Saved {len(saved_files)} cache files") + + return results + + except Exception as e: + print(f"[-] Course selection crawling failed: {e}") + return {} + + def crawl_official_data(self): + """ + Crawl data from official website + + Returns: + dict: Dictionary with official website data + """ + if not self.website_crawler: + print("[-] Official website crawler not available") + return {} + + try: + print("[*] Crawling official website...") + official_data = asyncio.run(self.website_crawler.crawl_official_data()) + print(f"[+] Retrieved {len(official_data)} courses from official website") + + # Convert to list format and save to cache + if official_data: + official_list = [] + for course_code, course_info in official_data.items(): + course_info["course_code"] = course_code + official_list.append(course_info) + + filepath = self.cache.save_to_jsonl(official_list, "official") + print(f"[+] Saved to cache: {Path(filepath).name}") + + return official_data + + except Exception as e: + print(f"[-] Official website crawling failed: {e}") + return {} + + def integrate_and_import_data(self, import_to_db=True): + """ + Integrate cached data and optionally import to 
database + + Args: + import_to_db: Whether to import integrated data to database + + Returns: + dict: Integration and import results + """ + if not self.integrator: + print("[-] Data integrator not available") + return {} + + # Load cached data + cached_data = self._load_cached_data() + if not any(cached_data.values()): + print("[-] No cached data found to integrate") + return {} + + try: + # Integrate data + print("[*] Integrating data...") + integrated_data = self.integrator.integrate_data( + cached_data["lesson_tasks"], + cached_data["course_catalog"], + cached_data["prerequisites"], + cached_data["official_data"], + ) + + print(f"[+] Integrated {len(integrated_data)} course records") + + # Save integrated data + if integrated_data: + filepath = self.cache.save_to_jsonl(integrated_data, "integrated") + print(f"[+] Saved integrated data: {Path(filepath).name}") + + # Import to database if requested + results = {"integrated_count": len(integrated_data)} + if import_to_db and integrated_data: + print("[*] Importing to database...") + import_results = self._import_to_database(integrated_data) + results.update(import_results) + + return results + + except Exception as e: + print(f"[-] Integration failed: {e}") + return {} + + def _load_cached_data(self): + """Load data from cache files""" + files = self.cache.list_cache_files() + + data = { + "lesson_tasks": [], + "course_catalog": {}, + "prerequisites": {}, + "official_data": {}, + } + + for filepath in files: + filename = filepath.name + file_data = self.cache.load_data_file(filepath) + + if "coursesel_lesson_tasks" in filename: + data["lesson_tasks"] = file_data + elif "coursesel_course_catalog" in filename: + data["course_catalog"] = { + item.get("courseId"): item + for item in file_data + if item.get("courseId") + } + elif "coursesel_prerequisites" in filename: + from collections import defaultdict + + prerequisites = defaultdict(list) + for item in file_data: + course_id = item.get("courseId") + if course_id: + prerequisites[course_id].append(item) + data["prerequisites"] = prerequisites + elif "official" in filename: + for item in file_data: + course_code = item.get("course_code") + if course_code: + data["official_data"][course_code] = item + + return data + + def _import_to_database(self, integrated_data): + """Import integrated data to database""" + try: + # Import the database models + from apps.web.models import Course, CourseOffering, Instructor + from lib.constants import CURRENT_TERM + + success_count = 0 + error_count = 0 + + for course_data in integrated_data: + try: + # Create or update course + course_defaults = { + "course_title": course_data.get("course_title", ""), + "course_credits": course_data.get("course_credits", 0), + "pre_requisites": course_data.get("pre_requisites", ""), + "description": course_data.get("description", ""), + "url": course_data.get("url", ""), + "department": course_data.get("department", ""), + "number": course_data.get("number", 0), + } + + course, _ = Course.objects.update_or_create( + course_code=course_data.get("course_code", ""), + defaults=course_defaults, + ) + + # Handle instructors + instructors = course_data.get("instructors", []) + if instructors: + course_offering, _ = CourseOffering.objects.get_or_create( + course=course, term=CURRENT_TERM + ) + + for instructor_name in instructors: + if instructor_name.strip(): + instructor, _ = Instructor.objects.get_or_create( + name=instructor_name.strip() + ) + course_offering.instructors.add(instructor) + + success_count += 1 + + except 
Exception as e: + error_count += 1 + print( + f"[-] Error importing {course_data.get('course_code', 'Unknown')}: {e}" + ) + + return { + "success": success_count, + "errors": error_count, + "total": success_count + error_count, + } + + except ImportError as e: + print(f"[-] Database import failed - missing dependencies: {e}") + return { + "success": 0, + "errors": len(integrated_data), + "total": len(integrated_data), + } + except Exception as e: + print(f"[-] Database import failed: {e}") + return { + "success": 0, + "errors": len(integrated_data), + "total": len(integrated_data), + } + + class CourseDataCache: """Course data cache manager""" @@ -285,12 +579,25 @@ def run_crawler(): print("=" * 60) try: - from apps.spider.crawlers.orc import CourseSelCrawler + manager = CrawlerManager() + + # Get JSESSIONID for course selection system + print("Course Selection System requires authentication.") + print("Please enter your JSESSIONID cookie:") + jsessionid = input("JSESSIONID: ").strip() + + if jsessionid: + # Crawl course selection data + manager.crawl_coursesel_data(jsessionid) + print("Course selection crawling completed") - crawler = CourseSelCrawler() - data = crawler.get_all_courses(use_cache=False, save_cache=True) + # Crawl official website data + manager.crawl_official_data() + print("Official website crawling completed") - print(f"Crawler execution completed, collected {len(data)} courses") + # Integrate and import data + results = manager.integrate_and_import_data(import_to_db=True) + print(f"Data integration and import completed: {results}") except Exception as e: print(f"Crawler execution failed: {str(e)}") @@ -345,14 +652,21 @@ def import_from_cache(): if preview_data_before_import(selected_file, limit=10): print("Starting database import...") - from apps.spider.crawlers.orc import import_department + # Use CrawlerManager for database import + manager = CrawlerManager() - # Use batch import and get statistics - result = import_department(data) + # Check if this is integrated data or needs integration + if "integrated" in selected_file.name: + # Direct import of integrated data + result = manager._import_to_database(data) + else: + # Single file data needs integration first + print("Single file detected, integrating with other cached data...") + result = manager.integrate_and_import_data(import_to_db=True) print("\nImport completed!") - print(f"Success: {result['success']} items") - print(f"Failed: {result['errors']} items") + print(f"Success: {result.get('success', 0)} items") + print(f"Failed: {result.get('errors', 0)} items") else: print("Import cancelled") @@ -395,9 +709,6 @@ def clean_cache(): def interactive_spider_manager(): """Interactive spider management system""" - import asyncio - from apps.spider.crawlers.orc import CourseSelCrawler - print("=" * 60) print("Interactive Spider Management System") print("=" * 60) @@ -434,9 +745,6 @@ def interactive_spider_manager(): def crawl_workflow(cache): """Crawling workflow""" - import asyncio - from apps.spider.crawlers.orc import CourseSelCrawler - print("\n" + "=" * 40) print("Data Crawling Workflow") print("=" * 40) @@ -462,9 +770,6 @@ def crawl_workflow(cache): def crawl_coursesel_workflow(cache): """Course selection system crawling workflow""" - import asyncio - from apps.spider.crawlers.orc import CourseSelCrawler - print("\n" + "=" * 40) print("Course Selection System Crawling") print("=" * 40) @@ -514,38 +819,24 @@ def crawl_coursesel_workflow(cache): print(f"\nSelected APIs: {', '.join(sorted(selected_apis))}") - # 
Initialize crawler with JSESSIONID - crawler = CourseSelCrawler(jsessionid=jsessionid) - crawler._ensure_initialized() + # Map choices to API names + api_mapping = {"1": "lesson_tasks", "2": "course_catalog", "3": "prerequisites"} - lesson_tasks = None - course_catalog = None - prerequisites = None + apis_to_crawl = [ + api_mapping[choice] for choice in selected_apis if choice in api_mapping + ] try: - if "1" in selected_apis: - print("\n[*] Crawling Lesson Tasks...") - lesson_tasks = crawler._get_lesson_tasks() - print(f"[+] Retrieved {len(lesson_tasks)} lesson tasks") - - if "2" in selected_apis: - print("\n[*] Crawling Course Catalog...") - course_catalog = crawler._get_course_catalog() - print(f"[+] Retrieved {len(course_catalog)} courses from catalog") - - if "3" in selected_apis: - print("\n[*] Crawling Prerequisites...") - prerequisites = crawler._get_prerequisites() - print(f"[+] Retrieved prerequisites for {len(prerequisites)} courses") - - # Save data to separate jsonl files - saved_files = cache.save_coursesel_data( - lesson_tasks, course_catalog, prerequisites - ) - - print(f"\n[+] Successfully saved {len(saved_files)} files:") - for data_type, filepath in saved_files.items(): - print(f" - {data_type}: {Path(filepath).name}") + # Use CrawlerManager for organized crawling + manager = CrawlerManager(cache.cache_dir) + results = manager.crawl_coursesel_data(jsessionid, apis_to_crawl) + + if results: + print(f"\n[+] Successfully crawled {len(results)} API endpoints") + for api_name, data in results.items(): + print(f" - {api_name}: {len(data) if data else 0} records") + else: + print("[-] No data was crawled") except Exception as e: print(f"[-] Crawling failed: {str(e)}") @@ -556,31 +847,23 @@ def crawl_coursesel_workflow(cache): def crawl_official_workflow(cache): """Official website crawling workflow""" - import asyncio - from apps.spider.crawlers.orc import CourseSelCrawler - print("\n" + "=" * 40) print("Official Website Crawling") print("=" * 40) print("\n[*] Crawling official website data...") - crawler = CourseSelCrawler() - try: - # Get official website data (async) - official_data = asyncio.run(crawler._get_official_website_data_async()) - print(f"[+] Retrieved {len(official_data)} courses from official website") - - # Convert to list format for saving - official_list = [] - for course_code, course_info in official_data.items(): - course_info["course_code"] = course_code - official_list.append(course_info) + # Use CrawlerManager for organized crawling + manager = CrawlerManager(cache.cache_dir) + official_data = manager.crawl_official_data() - # Save to jsonl file - filepath = cache.save_to_jsonl(official_list, "official") - print(f"[+] Successfully saved to: {Path(filepath).name}") + if official_data: + print( + f"[+] Successfully crawled {len(official_data)} courses from official website" + ) + else: + print("[-] No data was crawled from official website") except Exception as e: print(f"[-] Official website crawling failed: {str(e)}") @@ -663,95 +946,43 @@ def import_workflow(cache): def integrate_and_import_data(cache): """Integrate data from multiple cache files and import to database""" - from apps.spider.crawlers.orc import CourseSelCrawler - print("\n" + "=" * 40) print("Data Integration and Import") print("=" * 40) - files = cache.list_cache_files() - - # Load the most recent files of each type - lesson_tasks_data = [] - course_catalog_data = {} - prerequisites_data = {} - official_data = {} - - print("\n[*] Loading cache files...") - - for filepath in files: - 
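Earlier in this workflow, the API selection prompt maps the digits 1–3 onto endpoint names and has to normalize the full-width Chinese comma before splitting (a later patch in this series corrects exactly that substitution). A small, standalone version of the choice parsing; the "all" shortcut is an assumption about the surrounding prompt, which is not shown in full here:

```python
API_MAPPING = {"1": "lesson_tasks", "2": "course_catalog", "3": "prerequisites"}

def parse_api_choice(raw):
    """Turn '1,3' / '1，3' / 'all' into API names; unknown digits are ignored."""
    raw = raw.strip().lower()
    if raw in ("", "all"):
        return list(API_MAPPING.values())
    raw = raw.replace("\uff0c", ",")      # normalize the full-width (Chinese) comma
    choices = [c.strip() for c in raw.split(",")]
    return [API_MAPPING[c] for c in choices if c in API_MAPPING]

assert parse_api_choice("1,3") == ["lesson_tasks", "prerequisites"]
assert parse_api_choice("2，3") == ["course_catalog", "prerequisites"]
assert parse_api_choice("all") == ["lesson_tasks", "course_catalog", "prerequisites"]
```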
filename = filepath.name - data = cache.load_data_file(filepath) - - if "coursesel_lesson_tasks" in filename: - # Handle JSON format with nested structure - if isinstance(data, dict) and "data" in data: - lesson_tasks_data = data["data"].get("lessonTasks", []) - elif isinstance(data, list): - lesson_tasks_data = data - else: - lesson_tasks_data = [] - print(f"[+] Loaded lesson tasks: {len(lesson_tasks_data)} records") - elif "coursesel_course_catalog" in filename: - # Convert list back to dict - course_catalog_data = { - item.get("courseId"): item for item in data if item.get("courseId") - } - print(f"[+] Loaded course catalog: {len(data)} records") - elif "coursesel_prerequisites" in filename: - # Group prerequisites by courseId - from collections import defaultdict - - prerequisites_data = defaultdict(list) - for item in data: - course_id = item.get("courseId") - if course_id: - prerequisites_data[course_id].append(item) - print(f"[+] Loaded prerequisites: {len(data)} records") - elif "official" in filename: - # Convert list back to dict - for item in data: - course_code = item.get("course_code") - if course_code: - official_data[course_code] = item - print(f"[+] Loaded official data: {len(data)} records") - - if not any( - [lesson_tasks_data, course_catalog_data, prerequisites_data, official_data] - ): - print("[-] No valid data found to integrate") - return - - print("\n[*] Integrating data...") - - # Use the crawler's integration logic - crawler = CourseSelCrawler() - integrated_data = crawler._integrate_course_data( - lesson_tasks_data, course_catalog_data, prerequisites_data, official_data - ) - - print(f"[+] Integrated {len(integrated_data)} course records") + try: + # Use CrawlerManager for organized integration + manager = CrawlerManager(cache.cache_dir) - # Save integrated data - integrated_filepath = cache.save_to_jsonl(integrated_data, "integrated") - print(f"[+] Saved integrated data to: {Path(integrated_filepath).name}") + # Check if we have any cached data + files = cache.list_cache_files() + if not files: + print("[-] No cache files found to integrate") + return - # Ask user if they want to import to database - import_choice = input("\nImport to database? 
(y/n): ").strip().lower() - if import_choice in ["y", "yes"]: - try: - print("\n[*] Importing to database...") - # Use the actual database import function - from apps.spider.crawlers.orc import import_department + print(f"[*] Found {len(files)} cache files") + for filepath in files: + print(f" - {filepath.name}") + + # Integrate and import data + print("\n[*] Starting integration and import process...") + results = manager.integrate_and_import_data(import_to_db=True) + + if results: + print("\n[+] Integration and import completed!") + print(f" Integrated courses: {results.get('integrated_count', 0)}") + if "success" in results: + print(f" Database import - Success: {results['success']}") + print(f" Database import - Errors: {results['errors']}") + print(f" Database import - Total: {results['total']}") + else: + print("[-] Integration failed or no data to process") - result = import_department(integrated_data) + except Exception as e: + print(f"[-] Integration and import failed: {str(e)}") + import traceback - print(f"[+] Database import completed!") - print(f" Success: {result['success']} courses") - print(f" Errors: {result['errors']} courses") - print(f" Total processed: {result['success'] + result['errors']}") - except Exception as e: - print(f"[-] Database import failed: {str(e)}") + traceback.print_exc() if __name__ == "__main__": diff --git a/apps/spider/utils.py b/apps/spider/utils.py index 517c34b..abb45d1 100644 --- a/apps/spider/utils.py +++ b/apps/spider/utils.py @@ -35,3 +35,16 @@ def retrieve_soup(url, data=None, preprocess=lambda x: x): data = data.encode("utf-8") with urllib_request.urlopen(url, data=data) as response: return BeautifulSoup(preprocess(response.read().decode("utf-8")), "html.parser") + + +def extract_prerequisites(pre_requisites): + """Process prerequisite string format (legacy function)""" + result = pre_requisites + + result = result.replace("Pre-requisites:", "").strip() + result = result.replace("Obtained Credit", "obtained_credit").strip() + result = result.replace("Credits Submitted", "credits_submitted").strip() + result = result.replace("&&", " && ").strip() + result = result.replace("||", " || ").strip() + + return result From 78d3cb8519568bf7bf4d764bca280514f372c394 Mon Sep 17 00:00:00 2001 From: Leqi Tang Date: Sun, 12 Oct 2025 02:47:15 +0800 Subject: [PATCH 16/20] fix(spider/manager): fix the substitution --- apps/spider/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/spider/manager.py b/apps/spider/manager.py index 0936d7c..3f65c6f 100644 --- a/apps/spider/manager.py +++ b/apps/spider/manager.py @@ -804,7 +804,7 @@ def crawl_coursesel_workflow(cache): else: # Split by comma (both English and Chinese commas) and clean up api_choice = api_choice.replace( - ",", "," + ",", "," ) # Replace Chinese comma with English comma choices = [choice.strip() for choice in api_choice.split(",")] for choice in choices: From f721d0024254d90c21e6881c208c202708b0e8a3 Mon Sep 17 00:00:00 2001 From: Leqi Tang Date: Sun, 12 Oct 2025 03:11:33 +0800 Subject: [PATCH 17/20] docs(spider): update README.md for spider structure --- apps/spider/README.md | 411 ++++++++++++++++++++++++++++++++++++++++++ apps/spider/tasks.py | 253 +++++++++++++++++++++----- 2 files changed, 618 insertions(+), 46 deletions(-) diff --git a/apps/spider/README.md b/apps/spider/README.md index e69de29..cd7a9c8 100644 --- a/apps/spider/README.md +++ b/apps/spider/README.md @@ -0,0 +1,411 @@ +# Spider Module Documentation + +The Spider module is responsible for 
crawling, processing, and importing course data from multiple sources into the CourseReview system.
+
+## Architecture Overview
+
+```
+apps/spider/
+├── manager.py              # Unified crawler management (CrawlerManager)
+├── models.py               # Data models and state management
+├── tasks.py                # Asynchronous task definitions
+├── views.py                # Admin interface views
+├── utils.py                # Common utility functions
+├── crawlers/               # Crawler implementations
+│   ├── orc.py              # Main crawler system (3 spider classes and 1 integration class)
+│   ├── medians.py          # Grade medians crawler (deprecated)
+│   ├── timetable.py        # Course timetable crawler (deprecated)
+│   └── data_cache/         # Data cache directory
+├── templates/              # HTML templates for admin interface
+└── migrations/             # Database migration files
+```
+
+## Core Components
+
+### 1. CrawlerManager (manager.py)
+
+The central coordination hub for all crawling operations.
+
+**Key Methods:**
+- `crawl_coursesel_data()` - Crawls course selection API data
+- `crawl_official_data()` - Crawls official website data
+- `integrate_and_import_data()` - Integrates cached data and imports to database
+- `_load_cached_data()` - Loads data from cache files
+
+**Usage:**
+```python
+from apps.spider.manager import CrawlerManager
+
+manager = CrawlerManager()
+manager.crawl_coursesel_data(jsessionid="your_session_id")
+manager.crawl_official_data()
+manager.integrate_and_import_data()
+```
+
+### 2. Three-Class Crawler System (crawlers/orc.py)
+
+Modern modular architecture with specialized crawler classes:
+
+#### CourseSelAPICrawler
+- Crawls course selection system APIs
+- Handles lesson tasks, course catalog, and prerequisites
+- Outputs: `lesson_tasks.json`, `course_catalog.jsonl`, `prerequisites.jsonl`
+
+#### OfficialWebsiteCrawler
+- Crawls official JI website course pages
+- Extracts course descriptions, topics, and instructor information
+- Outputs: `official.jsonl`
+
+#### CourseDataIntegrator
+- Integrates data from multiple sources with priority system
+- Merges course information intelligently
+- Handles instructor extraction with fallback priorities:
+  1. Lesson tasks data (most current)
+  2. Course catalog data
+  3. Official website data (fallback)
+
+### 3. Data Models (models.py)
+
+#### CrawledData Model
+Manages the lifecycle of crawled data with approval workflow.
+
+**Key Fields:**
+- `pending_data` - Newly crawled data awaiting approval
+- `current_data` - Currently approved data
+- `data_type` - Type of crawled data (ORC_DEPARTMENT_COURSES, etc.)
+- `resource` - Resource identifier
+
+**Key Methods:**
+- `has_change()` - Detects if pending data differs from current
+- `approve_change()` - Approves pending data for import
+- `email_change()` - Sends notification emails for data changes
+
+### 4. Cache Management
+
+The system uses a sophisticated caching strategy:
+
+**Cache Directory:** `crawlers/data_cache/`
+
+**File Types:**
+- `lesson_tasks.json` - Course selection API lesson tasks
+- `course_catalog.jsonl` - Course catalog data (one JSON per line)
+- `prerequisites.jsonl` - Course prerequisite relationships
+- `official.jsonl` - Official website course data
+
+### 5. Asynchronous Tasks (tasks.py)
+
+The tasks module provides Celery-based asynchronous task processing for both scheduled automation and manual data import operations. 
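+
+The tasks listed below can also be called synchronously for quick local testing, bypassing the Celery broker entirely; this is a minimal sketch, assuming a configured Django environment:
+
+```python
+# Calling a @shared_task function directly runs it in the current process,
+# which is handy for debugging without a running worker.
+from apps.spider.tasks import crawl_official_data, integrate_and_import_data
+
+crawl_result = crawl_official_data()        # returns a status dict, e.g. {"status": "success", ...}
+import_result = integrate_and_import_data() # integrates cached files and imports them to the database
+print(crawl_result["status"], import_result["status"])
+```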
+ +#### Scheduled Crawling Tasks +- `crawl_coursesel_data(jsessionid=None)` - Crawls CourseSelection API data +- `crawl_official_data()` - Crawls official website course data +- `integrate_and_import_data()` - Integrates cached data and imports to database +- `full_crawl_and_import_workflow()` - Complete end-to-end workflow + +#### Data Import Tasks +- `import_pending_crawled_data(crawled_data_pk)` - Imports approved crawled data to database + +#### Task Architecture +The tasks serve as the automation layer that wraps the core business logic in manager.py: + +```python +# Scheduled automation (via Celery Beat) +@shared_task +def crawl_official_data(): + manager = CrawlerManager() + return manager.crawl_official_data() + +# Manual import processing +@shared_task +def import_pending_crawled_data(crawled_data_pk): + # Safely import approved data with transaction protection + crawled_data = CrawledData.objects.get(pk=crawled_data_pk) + manager = CrawlerManager() + results = manager._import_to_database(crawled_data.pending_data) + crawled_data.current_data = crawled_data.pending_data + crawled_data.save() +``` + +#### Scheduled Execution +Tasks are automatically scheduled via Celery Beat configuration in `website/celery.py`: + +```python +app.conf.beat_schedule = { + "crawl_official_data": { + "task": "apps.spider.tasks.crawl_official_data", + "schedule": crontab(minute=0, hour=1), # Daily at 1:00 AM + }, + "integrate_and_import_data": { + "task": "apps.spider.tasks.integrate_and_import_data", + "schedule": crontab(minute=0, hour=2), # Daily at 2:00 AM + }, +} +``` + +### 6. Admin Interface (views.py) + +Web-based interface for managing crawled data with approval workflow. + +#### View Functions +- `crawled_data_list(request)` - Lists all crawled data with approval status +- `crawled_data_detail(request, crawled_data_pk)` - Detailed view for individual crawled data items + +#### Access Control +Both views require superuser permissions: +```python +@staff_member_required +@user_passes_test(lambda u: u.is_superuser) +``` + +#### Data Approval Workflow + +**List View Features:** +- Displays all crawled data records sorted by update time +- Shows pending changes status for each record +- "Approve All Pending" button for batch approval +- Click-through navigation to detailed views + +**Detail View Features:** +- Shows comprehensive data differences using unified diff format +- Individual approval button for single record approval +- JSON formatted display of current vs pending data +- Navigation back to list view + +#### Approval Process Flow + +```mermaid +graph TD + A[Crawler detects data changes] --> B[Data saved as pending_data] + B --> C[Email notification sent to admins] + C --> D[Admin opens web interface] + D --> E[Reviews data differences] + E --> F[Clicks Approve button] + F --> G[approve_change method called] + G --> H[Async import task queued] + H --> I[Data imported to database] + I --> J[pending_data becomes current_data] +``` + +#### Auto-Import vs Manual Approval + +The system supports two modes via `AUTO_IMPORT_CRAWLED_DATA` setting: + +**Auto-Import Mode** (`AUTO_IMPORT_CRAWLED_DATA = True`): +- New data automatically approved and imported +- Admin interface used for monitoring and reviewing changes +- Email notifications still sent for transparency + +**Manual Approval Mode** (`AUTO_IMPORT_CRAWLED_DATA = False`): +- All data changes require explicit admin approval +- Critical for production environments requiring human oversight +- Admin interface becomes essential workflow 
component + +## Data Flow + +### Automated Workflow (Scheduled Tasks) +``` +1. Celery Beat scheduler → tasks.py (crawl_official_data, etc.) +2. Tasks call manager.py → CrawlerManager coordinates operations +3. Specialized crawlers → orc.py classes execute crawling +4. Data cached to files → data_cache/ (JSON/JSONL files) +5. Integration processing → CourseDataIntegrator merges multi-source data +6. Change detection → models.py (CrawledData.has_change()) +7. Auto-approval check → settings.AUTO_IMPORT_CRAWLED_DATA + - If True: Direct import → Step 9 + - If False: Pending approval → Step 8 +8. Admin notification → Email sent, admin uses views.py interface +9. Database import → import_pending_crawled_data task +10. Final storage → Course, Instructor, CourseOffering models +``` + +### Manual Workflow (Interactive) +``` +1. Developer runs → python apps/spider/manager.py +2. Interactive menu → Choose crawling options +3. Direct execution → CrawlerManager methods called immediately +4. Real-time feedback → Progress displayed in terminal +5. Optional import → Choose to import or just cache data +``` + +### Data Approval Workflow +``` +1. New data detected → CrawledData.handle_new_crawled_data() +2. Diff generation → Compare pending vs current data +3. Email notification → Send change summary to admins +4. Approval decision → Auto-import OR manual review +5. Manual approval → Admin uses views.py interface +6. Import execution → import_pending_crawled_data task triggered +7. Status update → pending_data becomes current_data +``` + +## Key Features + +### Multi-Source Data Integration +- **Priority-based instructor extraction**: Combines data from multiple sources with intelligent fallback +- **Comprehensive course information**: Merges descriptions, topics, credits, and prerequisites +- **Data validation**: Ensures data quality and consistency + +### Robust Error Handling +- **Graceful degradation**: System continues working even if some sources fail +- **Comprehensive logging**: Detailed logs for debugging and monitoring +- **Email notifications**: Automatic alerts for data changes and errors + +### Scalable Architecture +- **Modular design**: Easy to add new crawlers or data sources +- **Asynchronous processing**: Non-blocking operations using Celery +- **Caching system**: Efficient data storage and retrieval + +## Configuration + +### Crawler Settings +Key configuration options in `orc.py`: + +```python +class CrawlerConfig: + MAX_CONCURRENT_REQUESTS = 20 + REQUEST_DELAY = 0.1 + MAX_COURSE_TOPICS = 10 + TIMEOUT = 30 +``` + +### API Endpoints +- CourseSelection API: `https://coursesel.ji.sjtu.edu.cn/` +- Official Website: `https://www.ji.sjtu.edu.cn/academics/courses/` + +## Usage Examples + +### Manual Crawling +```python +# Complete crawling workflow +from apps.spider.manager import CrawlerManager + +manager = CrawlerManager() + +# Step 1: Crawl course selection data (requires authentication) +coursesel_data = manager.crawl_coursesel_data(jsessionid="your_session_id") +print(f"Crawled {len(coursesel_data)} courses from CourseSelection API") + +# Step 2: Crawl official website data +official_data = manager.crawl_official_data() +print(f"Crawled {len(official_data)} courses from official website") + +# Step 3: Integrate and import to database +results = manager.integrate_and_import_data() +print(f"Successfully imported {results['success']} courses") +``` + +### Interactive Interface +```bash +# Run interactive spider manager +python apps/spider/manager.py + +# Menu options: +# 1. 
Crawl data from websites
+# 2. Import data from cache files
+# 3. View cache files
+# 4. Clean cache files
+# 5. Exit
+```
+
+### Asynchronous Tasks
+```python
+# Manual task execution (for testing)
+from apps.spider.tasks import crawl_official_data, full_crawl_and_import_workflow
+
+# Individual crawling tasks
+result = crawl_official_data.delay()
+print(f"Task ID: {result.id}")
+
+# Complete workflow
+workflow_result = full_crawl_and_import_workflow.delay()
+print(f"Workflow started: {workflow_result.id}")
+
+# Check task status
+from celery.result import AsyncResult
+task_result = AsyncResult(result.id)
+print(f"Status: {task_result.status}")
+print(f"Result: {task_result.result}")
+```
+
+### Scheduled Task Management
+```bash
+# Start Celery worker (processes tasks)
+celery -A website worker --loglevel=info
+
+# Start Celery beat (schedules tasks)
+celery -A website beat --loglevel=info
+
+# Monitor tasks via Flower (optional)
+pip install flower
+celery -A website flower
+# Visit http://localhost:5555
+```
+
+### Admin Interface Usage
+```bash
+# Access via web browser
+# URL: /admin/spider/crawled-data/
+
+# Or direct URLs:
+# List view: /spider/crawled-data/
+# Detail view: /spider/crawled-data/{id}/
+```
+
+**Admin Interface Features:**
+- View all crawled data with pending change indicators
+- Compare current vs pending data with diff view
+- Approve individual records or batch approve all
+- Monitor data import status and history
+
+### API-Specific Crawling
+```python
+# Individual API crawling
+import asyncio
+
+from apps.spider.crawlers.orc import CourseSelAPICrawler, OfficialWebsiteCrawler
+
+# CourseSelection API only (synchronous methods)
+api_crawler = CourseSelAPICrawler()
+lesson_tasks = api_crawler.crawl_lesson_tasks()
+catalog_data = api_crawler.crawl_course_catalog()
+
+# Official website only (async coroutine, run via asyncio)
+website_crawler = OfficialWebsiteCrawler()
+official_data = asyncio.run(website_crawler.crawl_official_data())
+```
+
+## Deprecated Components
+
+### medians.py (Deprecated)
+- Originally crawled grade median data from Dartmouth
+- No longer relevant for JI system
+- Should be removed in future cleanup
+
+### timetable.py (Deprecated)
+- Originally crawled course timetable from Dartmouth Oracle system
+- Replaced by CourseSelection API data
+- Should be removed in future cleanup
+
+## Maintenance
+
+### Regular Tasks
+1. **Monitor cache files**: Ensure cache directory has sufficient space
+2. **Review crawled data**: Use admin interface to approve new data
+3. **Check error logs**: Monitor for crawling failures or data inconsistencies
+4. **Update configurations**: Adjust crawler settings based on performance
+
+### Troubleshooting
+- **Connection issues**: Check API endpoints and network connectivity
+- **Data parsing errors**: Verify website structure hasn't changed
+- **Import failures**: Check database constraints and data validation
+
+## Future Enhancements
+
+### Planned Improvements
+1. **Remove deprecated crawlers**: Clean up medians.py and timetable.py
+2. **Enhanced error recovery**: Implement retry mechanisms for failed requests
+3. **Real-time monitoring**: Add metrics and dashboards for crawler performance
+4. 
**API rate limiting**: Implement intelligent rate limiting for source APIs + +### Extension Points +- **New data sources**: Easy to add additional crawler classes +- **Custom data processors**: Pluggable data transformation pipeline +- **Enhanced caching**: Implement distributed caching for better performance \ No newline at end of file diff --git a/apps/spider/tasks.py b/apps/spider/tasks.py index 04b3a92..da354ea 100644 --- a/apps/spider/tasks.py +++ b/apps/spider/tasks.py @@ -2,7 +2,6 @@ from celery import shared_task from django.db import transaction -from apps.spider.crawlers import orc from apps.spider.models import CrawledData from lib import task_utils @@ -14,68 +13,230 @@ @task_utils.email_if_fails @transaction.atomic def import_pending_crawled_data(crawled_data_pk): + """ + Import pending crawled data to database + + Args: + crawled_data_pk (int): Primary key of CrawledData record to import + """ crawled_data = CrawledData.objects.select_for_update().get(pk=crawled_data_pk) # if crawled_data.data_type == CrawledData.MEDIANS: # medians.import_medians(crawled_data.pending_data) # elif if crawled_data.data_type == CrawledData.ORC_DEPARTMENT_COURSES: - orc.import_department(crawled_data.pending_data) - # else: - # assert crawled_data.data_type == CrawledData.COURSE_TIMETABLE - # timetable.import_timetable(crawled_data.pending_data) - crawled_data.current_data = crawled_data.pending_data - crawled_data.save() + # Use manager's import functionality + from apps.spider.manager import CrawlerManager + manager = CrawlerManager() -# @shared_task -# @task_utils.email_if_fails -# def crawl_medians(): -# median_page_urls = medians.crawl_median_page_urls() -# assert len(median_page_urls) == 10 -# # the registrar medians web page always keeps a list links to the past ten academic terms -# for url in median_page_urls: -# crawl_term_median_page.delay(url) -# return median_page_urls + # Import the pending data + results = manager._import_to_database(crawled_data.pending_data) + print(f"Import results: {results}") -# @shared_task -# @task_utils.email_if_fails -# def crawl_term_median_page(url): -# new_data = medians.crawl_term_medians_for_url(url) -# resource_name = "{term}_medians".format( -# term=medians.get_term_from_median_page_url(url), ) -# return CrawledData.objects.handle_new_crawled_data(new_data, resource_name, -# CrawledData.MEDIANS) + # Mark data as current after successful import + crawled_data.current_data = crawled_data.pending_data + crawled_data.save() + + return results + else: + print(f"Unsupported data type: {crawled_data.data_type}") + return False + + +@shared_task +@task_utils.email_if_fails +def crawl_coursesel_data(jsessionid=None): + """ + Scheduled task to crawl CourseSelection API data + + Args: + jsessionid (str, optional): Session ID for authentication. + If None, will need to be provided via other means. 
+ + Returns: + dict: Results of crawling operation + """ + try: + from apps.spider.manager import CrawlerManager + + manager = CrawlerManager() + + if not jsessionid: + # For automated tasks, we might need to handle authentication differently + # This could be configured via environment variables or settings + print("Warning: No jsessionid provided for automated crawling") + return { + "status": "skipped", + "message": "No authentication provided for CourseSelection API crawling", + } + + print("Starting CourseSelection API crawling...") + coursesel_data = manager.crawl_coursesel_data(jsessionid) + + result = { + "status": "success", + "courses_found": len(coursesel_data), + "message": f"Successfully crawled {len(coursesel_data)} courses from CourseSelection API", + } + print(result["message"]) + return result + + except Exception as e: + error_msg = f"CourseSelection API crawling failed: {str(e)}" + print(error_msg) + return {"status": "error", "message": error_msg} @shared_task @task_utils.email_if_fails -def crawl_orc(): - print("Starting crawl_orc") - # crawl_program_url.delay(orc.SUPPLEMENT_URL, "supplement") - program_urls = orc.crawl_program_urls() - print(f"Found {len(program_urls)} program URLs") - # assert len(program_urls) > 50 - for url in program_urls: - crawl_program_url.delay(url) - return sorted(program_urls) +def crawl_official_data(): + """ + Scheduled task to crawl official website data + + Returns: + dict: Results of crawling operation + """ + try: + from apps.spider.manager import CrawlerManager + + manager = CrawlerManager() + + print("Starting official website crawling...") + official_data = manager.crawl_official_data() + + result = { + "status": "success", + "courses_found": len(official_data), + "message": f"Successfully crawled {len(official_data)} courses from official website", + } + print(result["message"]) + return result + + except Exception as e: + error_msg = f"Official website crawling failed: {str(e)}" + print(error_msg) + return {"status": "error", "message": error_msg} @shared_task @task_utils.email_if_fails -def crawl_program_url(url, program_code=None): - # if not program_code: - # program_code = url.split("/")[-1].split("-")[0] - # assert program_code.isupper() and len(program_code) in (3, 4) - # resource_name = "{program_code}_{education_level_code}_courses".format( - # program_code=program_code.lower(), - # education_level_code=orc.get_education_level_code(url), - # ) - resource_name = "orc_department_courses" - new_data = [orc._crawl_course_data(url)] - return CrawledData.objects.handle_new_crawled_data( - new_data, resource_name, CrawledData.ORC_DEPARTMENT_COURSES - ) +def integrate_and_import_data(): + """ + Scheduled task to integrate cached data and import to database + + Returns: + dict: Results of integration and import operation + """ + try: + from apps.spider.manager import CrawlerManager + + manager = CrawlerManager() + + print("Starting data integration and import...") + results = manager.integrate_and_import_data() + + if results: + result = { + "status": "success", + "imported_courses": results.get("success", 0), + "failed_imports": results.get("errors", 0), + "total_processed": results.get("total", 0), + "message": f"Successfully imported {results.get('success', 0)} courses", + } + else: + result = {"status": "warning", "message": "No data to integrate or import"} + + print(result["message"]) + return result + except Exception as e: + error_msg = f"Data integration and import failed: {str(e)}" + print(error_msg) + return {"status": 
"error", "message": error_msg} + + +@shared_task +@task_utils.email_if_fails +def full_crawl_and_import_workflow(): + """ + Complete workflow: crawl all sources and import data + + This task orchestrates the entire crawling and import process: + 1. Crawl CourseSelection API data + 2. Crawl official website data + 3. Integrate and import all data + + Returns: + dict: Results of complete workflow + """ + workflow_results = { + "coursesel_crawl": None, + "official_crawl": None, + "integration": None, + "overall_status": "pending", + } + + try: + # Step 1: Crawl CourseSelection API + print("Step 1: Crawling CourseSelection API...") + workflow_results["coursesel_crawl"] = crawl_coursesel_data(jsessionid=None) + + # Step 2: Crawl official website + print("Step 2: Crawling official website...") + workflow_results["official_crawl"] = crawl_official_data() + + # Step 3: Integrate and import data + print("Step 3: Integrating and importing data...") + workflow_results["integration"] = integrate_and_import_data() + + # Determine overall status + all_successful = all( + result.get("status") == "success" + for result in workflow_results.values() + if result is not None + ) + + workflow_results["overall_status"] = ( + "success" if all_successful else "partial_success" + ) + + total_courses = workflow_results["integration"].get("imported_courses", 0) + print(f"Workflow completed. Total courses imported: {total_courses}") + + return workflow_results + + except Exception as e: + error_msg = f"Full crawl workflow failed: {str(e)}" + print(error_msg) + workflow_results["overall_status"] = "error" + workflow_results["error_message"] = error_msg + return workflow_results + + +# Legacy task functions - kept for compatibility but deprecated +# These should be removed in future versions + +# @shared_task +# @task_utils.email_if_fails +# def crawl_orc(): +# """DEPRECATED: Use crawl_coursesel_data and crawl_official_data instead""" +# print("WARNING: crawl_orc is deprecated. 
Use new crawler tasks instead.") +# return full_crawl_and_import_workflow() + +# @shared_task +# @task_utils.email_if_fails +# def crawl_program_url(url, program_code=None): +# """DEPRECATED: Individual URL crawling no longer supported""" +# print("WARNING: crawl_program_url is deprecated.") +# return {"status": "deprecated", "message": "This function is no longer supported"} + + +# Commented out deprecated tasks for medians and timetable +# @shared_task +# @task_utils.email_if_fails +# def crawl_medians(): +# """DEPRECATED: Medians crawling for Dartmouth - not applicable to JI""" +# pass # @shared_task # @task_utils.email_if_fails From 034852123e9d27f5c892a3880d01ae58c8a94bef Mon Sep 17 00:00:00 2001 From: Leqi Tang Date: Sun, 12 Oct 2025 04:11:57 +0800 Subject: [PATCH 18/20] refactor(ocr): delete useless functions --- apps/spider/crawlers/orc.py | 45 +++++-------------------------------- apps/spider/manager.py | 12 +++++----- scripts/__init__.py | 1 - 3 files changed, 13 insertions(+), 45 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index fa2ef99..e188fd4 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -18,6 +18,7 @@ # Set up logger logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) class CrawlerConfig: @@ -86,15 +87,6 @@ class CrawlerConfig: } -# Legacy constants for backward compatibility -BASE_URL = CrawlerConfig.COURSESEL_BASE_URL -COURSE_DETAIL_URL_PREFIX = urllib.parse.urljoin(BASE_URL, "/course/") -OFFICIAL_BASE_URL = CrawlerConfig.OFFICIAL_BASE_URL -OFFICIAL_ORC_BASE_URL = CrawlerConfig.OFFICIAL_ORC_BASE_URL -OFFICIAL_COURSE_DETAIL_URL_PREFIX = CrawlerConfig.OFFICIAL_COURSE_DETAIL_URL_PREFIX -OFFICIAL_UNDERGRAD_URL = OFFICIAL_ORC_BASE_URL - - class CourseSelAPICrawler: """ Course Selection System API Crawler @@ -155,35 +147,6 @@ def set_session_id(self, jsessionid): self._initialized = False self._ensure_initialized() - def crawl_all_apis(self, apis=None): - """ - Crawl data from all or specified APIs - - Args: - apis (list, optional): List of API names to crawl. - If None, crawl all APIs. 
- - Returns: - dict: Dictionary containing data from each API - """ - if apis is None: - apis = CrawlerConfig.COURSESEL_APIS - - self._ensure_initialized() - - results = {} - - if "lesson_tasks" in apis: - results["lesson_tasks"] = self.crawl_lesson_tasks() - - if "course_catalog" in apis: - results["course_catalog"] = self.crawl_course_catalog() - - if "prerequisites" in apis: - results["prerequisites"] = self.crawl_prerequisites() - - return results - def crawl_lesson_tasks(self): """ Crawl lesson task data from the course selection API @@ -1149,4 +1112,8 @@ def _build_course_url(self, main_data): if main_data is None: main_data = {} course_id = main_data.get("courseId") - return f"{COURSE_DETAIL_URL_PREFIX}{course_id}" if course_id else "" + return ( + f"{CrawlerConfig.COURSESEL_BASE_URL}/course/{course_id}" + if course_id + else "" + ) diff --git a/apps/spider/manager.py b/apps/spider/manager.py index 3f65c6f..9b0ae91 100644 --- a/apps/spider/manager.py +++ b/apps/spider/manager.py @@ -125,11 +125,11 @@ def crawl_official_data(self): dict: Dictionary with official website data """ if not self.website_crawler: - print("[-] Official website crawler not available") + print(MessageConstants.OFFICIAL_WEBSITE_CRAWLER_NOT_AVAILABLE) return {} try: - print("[*] Crawling official website...") + print(MessageConstants.CRAWLING_OFFICIAL_WEBSITE) official_data = asyncio.run(self.website_crawler.crawl_official_data()) print(f"[+] Retrieved {len(official_data)} courses from official website") @@ -137,16 +137,18 @@ def crawl_official_data(self): if official_data: official_list = [] for course_code, course_info in official_data.items(): - course_info["course_code"] = course_code + course_info[FieldConstants.COURSE_CODE] = course_code official_list.append(course_info) - filepath = self.cache.save_to_jsonl(official_list, "official") + filepath = self.cache.save_to_jsonl( + official_list, FileConstants.OFFICIAL + ) print(f"[+] Saved to cache: {Path(filepath).name}") return official_data except Exception as e: - print(f"[-] Official website crawling failed: {e}") + print(MessageConstants.OFFICIAL_WEBSITE_CRAWLING_FAILED.format(e)) return {} def integrate_and_import_data(self, import_to_db=True): diff --git a/scripts/__init__.py b/scripts/__init__.py index 9741121..5928603 100644 --- a/scripts/__init__.py +++ b/scripts/__init__.py @@ -3,7 +3,6 @@ from apps.spider.models import CrawledData # from apps.spider.tasks import crawl_medians, crawl_orc, crawl_timetable -from apps.spider.tasks import crawl_orc from apps.spider.utils import retrieve_soup from apps.web.models import Course, CourseOffering, Instructor from lib.constants import CURRENT_TERM From 7c52584b50a65f597d39a3d6429b795371739137 Mon Sep 17 00:00:00 2001 From: Leqi Tang Date: Sun, 12 Oct 2025 04:35:29 +0800 Subject: [PATCH 19/20] refactor(ocr): delete useless functions --- apps/spider/crawlers/orc.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/apps/spider/crawlers/orc.py b/apps/spider/crawlers/orc.py index e188fd4..e1b1fd3 100644 --- a/apps/spider/crawlers/orc.py +++ b/apps/spider/crawlers/orc.py @@ -136,17 +136,6 @@ def _ensure_initialized(self): self._initialized = True logger.info("CourseSelAPICrawler initialized successfully") - def set_session_id(self, jsessionid): - """ - Set or update the session ID - - Args: - jsessionid (str): New session ID - """ - self.jsessionid = jsessionid - self._initialized = False - self._ensure_initialized() - def crawl_lesson_tasks(self): """ Crawl lesson task data from the course selection 
API @@ -335,8 +324,6 @@ async def _crawl_official_data_async(self): # Get all course URLs from official website official_urls = self._get_official_course_urls() - logger.info(f"Found {len(official_urls)} official course URLs") - if not official_urls: logger.warning("No official course URLs found") return {} From 137aac73abb28701202704e07f1853f451f8d8aa Mon Sep 17 00:00:00 2001 From: Leqi Tang Date: Sun, 12 Oct 2025 04:47:59 +0800 Subject: [PATCH 20/20] refactor(manager): delete useless import --- apps/spider/manager.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/apps/spider/manager.py b/apps/spider/manager.py index 9b0ae91..eec784e 100644 --- a/apps/spider/manager.py +++ b/apps/spider/manager.py @@ -11,12 +11,6 @@ from datetime import datetime from pathlib import Path -# Setup Django environment -project_root = Path(__file__).parent.parent.parent -sys.path.append(str(project_root)) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "website.settings") -django.setup() - class CrawlerManager: """ @@ -125,11 +119,11 @@ def crawl_official_data(self): dict: Dictionary with official website data """ if not self.website_crawler: - print(MessageConstants.OFFICIAL_WEBSITE_CRAWLER_NOT_AVAILABLE) + print("[-] Official website crawler not available") return {} try: - print(MessageConstants.CRAWLING_OFFICIAL_WEBSITE) + print("[*] Crawling official website...") official_data = asyncio.run(self.website_crawler.crawl_official_data()) print(f"[+] Retrieved {len(official_data)} courses from official website") @@ -137,18 +131,16 @@ def crawl_official_data(self): if official_data: official_list = [] for course_code, course_info in official_data.items(): - course_info[FieldConstants.COURSE_CODE] = course_code + course_info["course_code"] = course_code official_list.append(course_info) - filepath = self.cache.save_to_jsonl( - official_list, FileConstants.OFFICIAL - ) + filepath = self.cache.save_to_jsonl(official_list, "official") print(f"[+] Saved to cache: {Path(filepath).name}") return official_data except Exception as e: - print(MessageConstants.OFFICIAL_WEBSITE_CRAWLING_FAILED.format(e)) + print(f"[-] Official website crawling failed: {e}") return {} def integrate_and_import_data(self, import_to_db=True):