diff --git a/derex/runner/project.py b/derex/runner/project.py index 25f83dbb2..6b735e4ae 100644 --- a/derex/runner/project.py +++ b/derex/runner/project.py @@ -236,6 +236,10 @@ def mongodb_db_name(self) -> str: def mongodb_user(self) -> str: return self.config.get("mongodb_user", MONGODB_ROOT_USER) + @property + def elasticsearch_prefix(self) -> str: + return self.config.get("elasticsearch_prefix", f"{self.name}") + @property def runmode(self) -> ProjectRunMode: """The run mode of this project, either debug or production. diff --git a/derex/runner/templates/Dockerfile-project.j2 b/derex/runner/templates/Dockerfile-project.j2 index 0731ac733..40b86640a 100644 --- a/derex/runner/templates/Dockerfile-project.j2 +++ b/derex/runner/templates/Dockerfile-project.j2 @@ -13,6 +13,7 @@ ENV MYSQL_PASSWORD={{ project.secret("mysql") }} ENV MONGODB_DB_NAME={{ project.mongodb_db_name }} ENV MONGODB_USER={{ project.mongodb_user }} ENV MONGODB_PASSWORD={{ project.secret("mongodb") }} +ENV ELASTICSEARCH_PREFIX={{ project.elasticsearch_prefix }} ENV DEREX_MINIO_SECRET={{ project.secret("minio") }} {%- for key, value in project.get_container_env().items() %} ENV {{ key }}={{ value | tojson }} diff --git a/derex/runner/templates/docker-compose-project.yml.j2 b/derex/runner/templates/docker-compose-project.yml.j2 index 9cf11d087..e96510eea 100644 --- a/derex/runner/templates/docker-compose-project.yml.j2 +++ b/derex/runner/templates/docker-compose-project.yml.j2 @@ -53,6 +53,7 @@ x-common: MONGODB_DB_NAME: {{ project.mongodb_db_name }} MONGODB_USER: {{ project.mongodb_user }} MONGODB_PASSWORD: {{ project.secret("mongodb") }} + ELASTICSEARCH_PREFIX: {{ project.elasticsearch_prefix }} DEREX_MINIO_SECRET: {{ project.secret("minio") }} {%- for key, value in project.get_container_env().items() %} {{ key }}: {{ value | tojson }} diff --git a/docker-definition/openedx_customizations/juniper/cms/djangoapps/contentstore/courseware_index.py 
b/docker-definition/openedx_customizations/juniper/cms/djangoapps/contentstore/courseware_index.py new file mode 100644 index 000000000..7574d6223 --- /dev/null +++ b/docker-definition/openedx_customizations/juniper/cms/djangoapps/contentstore/courseware_index.py @@ -0,0 +1,690 @@ +""" Code to allow module store to interface with courseware index """ + +import logging +import os +import re +from abc import ABCMeta, abstractmethod +from datetime import timedelta + +from django.conf import settings +from django.urls import resolve +from django.utils.translation import gettext as _ +from django.utils.translation import gettext_lazy +from eventtracking import tracker +from search.search_engine_base import SearchEngine + +from cms.djangoapps.contentstore.course_group_config import GroupConfiguration +from common.djangoapps.course_modes.models import CourseMode +from openedx.core.lib.courses import course_image_url +from xmodule.annotator_mixin import html_to_text # lint-amnesty, pylint: disable=wrong-import-order +from xmodule.library_tools import normalize_key_for_search # lint-amnesty, pylint: disable=wrong-import-order +from xmodule.modulestore import ModuleStoreEnum # lint-amnesty, pylint: disable=wrong-import-order + +# REINDEX_AGE is the default amount of time that we look back for changes +# that might have happened. If we are provided with a time at which the +# indexing is triggered, then we know it is safe to only index items +# recently changed at that time. This is the time period that represents +# how far back from the trigger point to look back in order to index +REINDEX_AGE = timedelta(0, 60) # 60 seconds + +# INDEXING_REQUEST_TIMEOUT is the number of seconds before a request is considered +# timed out for courseware indexing. 
+INDEXING_REQUEST_TIMEOUT = 60 + +log = logging.getLogger('edx.modulestore') + + +def strip_html_content_to_text(html_content): + """ Gets only the textual part for html content - useful for building text to be searched """ + # Removing HTML-encoded non-breaking space characters + text_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(html_content)) + # Removing HTML CDATA + text_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", text_content) + # Removing HTML comments + text_content = re.sub(r"<!--.*-->", "", text_content) + + return text_content + + +def indexing_is_enabled(): + """ + Checks to see if the indexing feature is enabled + """ + return settings.FEATURES.get('ENABLE_COURSEWARE_INDEX', False) + + +class SearchIndexingError(Exception): + """ Indicates some error(s) occurred during indexing """ + + def __init__(self, message, error_list): + super().__init__(message) + self.error_list = error_list + + +class SearchIndexerBase(metaclass=ABCMeta): + """ + Base class to perform indexing for courseware or library search from different modulestores + """ + + INDEX_NAME = None + ENABLE_INDEXING_KEY = None + + INDEX_EVENT = { + 'name': None, + 'category': None + } + + @classmethod + def indexing_is_enabled(cls): + """ + Checks to see if the indexing feature is enabled + """ + return settings.FEATURES.get(cls.ENABLE_INDEXING_KEY, False) + + @classmethod + @abstractmethod + def normalize_structure_key(cls, structure_key): + """ Normalizes structure key for use in indexing """ + + @classmethod + @abstractmethod + def _fetch_top_level(cls, modulestore, structure_key): + """ Fetch the item from the modulestore location """ + + @classmethod + @abstractmethod + def _get_location_info(cls, normalized_structure_key): + """ Builds location info dictionary """ + + @classmethod + def _id_modifier(cls, usage_id): + """ Modifies usage_id to submit to index """ + return usage_id + + @classmethod + def remove_deleted_items(cls, searcher, structure_key, exclude_items): + """ + remove any item that is present in 
the search index that is not present in updated list of indexed items + as we find items we can shorten the set of items to keep + """ + response = searcher.search( + field_dictionary=cls._get_location_info(structure_key), + exclude_dictionary={"id": list(exclude_items)} + ) + result_ids = [result["data"]["id"] for result in response["results"]] + searcher.remove(result_ids) + + @classmethod + def index(cls, modulestore, structure_key, triggered_at=None, reindex_age=REINDEX_AGE, timeout=INDEXING_REQUEST_TIMEOUT): # lint-amnesty, pylint: disable=line-too-long, too-many-statements + """ + Process course for indexing + + Arguments: + modulestore - modulestore object to use for operations + + structure_key (CourseKey|LibraryKey) - course or library identifier + + triggered_at (datetime) - provides time at which indexing was triggered; + useful for index updates - only things changed recently from that date + (within REINDEX_AGE above ^^) will have their index updated, others skip + updating their index but are still walked through in order to identify + which items may need to be removed from the index + If None, then a full reindex takes place + + Returns: + Number of items that have been added to the index + """ + error_list = [] + searcher = SearchEngine.get_search_engine(cls.INDEX_NAME) + if not searcher: + return + + structure_key = cls.normalize_structure_key(structure_key) + location_info = cls._get_location_info(structure_key) + + # Wrap counter in dictionary - otherwise we seem to lose scope inside the embedded function `prepare_item_index` + indexed_count = { + "count": 0 + } + + # indexed_items is a list of all the items that we wish to remain in the + # index, whether or not we are planning to actually update their index. + # This is used in order to build a query to remove those items not in this + # list - those are ready to be destroyed + indexed_items = set() + + # items_index is a list of all the items index dictionaries. 
+ # it is used to collect all indexes and index them using bulk API, + # instead of per item index API call. + items_index = [] + + def get_item_location(item): + """ + Gets the version agnostic item location + """ + return item.location.version_agnostic().replace(branch=None) + + def prepare_item_index(item, skip_index=False, groups_usage_info=None): + """ + Add this item to the items_index and indexed_items list + + Arguments: + item - item to add to index, its children will be processed recursively + + skip_index - simply walk the children in the tree, the content change is + older than the REINDEX_AGE window and would have been already indexed. + This should really only be passed from the recursive child calls when + this method has determined that it is safe to do so + + Returns: + item_content_groups - content groups assigned to indexed item + """ + item_index_dictionary = item.index_dictionary() + # if it's not indexable and it does not have children, then ignore + if not item_index_dictionary and not item.has_children: + return + + item_content_groups = None + + if item.category == "split_test": # lint-amnesty, pylint: disable=too-many-nested-blocks + split_partition = item.get_selected_partition() + for split_test_child in item.get_children(): + if split_partition: + for group in split_partition.groups: + group_id = str(group.id) + child_location = item.group_id_to_child.get(group_id, None) + if child_location == split_test_child.location: + groups_usage_info.update({ + str(get_item_location(split_test_child)): [group_id], + }) + for component in split_test_child.get_children(): + groups_usage_info.update({ + str(get_item_location(component)): [group_id] + }) + + if groups_usage_info: + item_location = get_item_location(item) + item_content_groups = groups_usage_info.get(str(item_location), None) + + item_id = str(cls._id_modifier(item.scope_ids.usage_id)) + indexed_items.add(item_id) + if item.has_children: + # determine if it's okay to skip adding the 
children herein based upon how recently any may have changed + skip_child_index = skip_index or \ + (triggered_at is not None and (triggered_at - item.subtree_edited_on) > reindex_age) + children_groups_usage = [] + for child_item in item.get_children(): + if modulestore.has_published_version(child_item): + children_groups_usage.append( + prepare_item_index( + child_item, + skip_index=skip_child_index, + groups_usage_info=groups_usage_info + ) + ) + if None in children_groups_usage: + item_content_groups = None + + if skip_index or not item_index_dictionary: + return + + item_index = {} + # if it has something to add to the index, then add it + try: + item_index.update(location_info) + item_index.update(item_index_dictionary) + item_index['id'] = item_id + if item.start: + item_index['start_date'] = item.start + item_index['content_groups'] = item_content_groups if item_content_groups else None + item_index.update(cls.supplemental_fields(item)) + items_index.append(item_index) + indexed_count["count"] += 1 + return item_content_groups + except Exception as err: # pylint: disable=broad-except + # broad exception so that index operation does not fail on one item of many + log.warning('Could not index item: %s - %r', item.location, err) + error_list.append(_('Could not index item: {}').format(item.location)) + + try: + with modulestore.branch_setting(ModuleStoreEnum.RevisionOption.published_only): + structure = cls._fetch_top_level(modulestore, structure_key) + groups_usage_info = cls.fetch_group_usage(modulestore, structure) + + # First perform any additional indexing from the structure object + cls.supplemental_index_information(modulestore, structure) + + # Now index the content + for item in structure.get_children(): + prepare_item_index(item, groups_usage_info=groups_usage_info) + searcher.index(items_index, request_timeout=timeout) + cls.remove_deleted_items(searcher, structure_key, indexed_items) + except Exception as err: # pylint: disable=broad-except + # 
broad exception so that index operation does not prevent the rest of the application from working + log.exception( + "Indexing error encountered, courseware index may be out of date %s - %r", + structure_key, + err + ) + error_list.append(_('General indexing error occurred')) + + if error_list: + raise SearchIndexingError('Error(s) present during indexing', error_list) + + return indexed_count["count"] + + @classmethod + def _do_reindex(cls, modulestore, structure_key): + """ + (Re)index all content within the given structure (course or library), + tracking the fact that a full reindex has taken place + """ + indexed_count = cls.index(modulestore, structure_key) + if indexed_count: + cls._track_index_request(cls.INDEX_EVENT['name'], cls.INDEX_EVENT['category'], indexed_count) + return indexed_count + + @classmethod + def _track_index_request(cls, event_name, category, indexed_count): + """Track content index requests. + + Arguments: + event_name (str): Name of the event to be logged. + category (str): category of indexed items + indexed_count (int): number of indexed items + Returns: + None + + """ + data = { + "indexed_count": indexed_count, + 'category': category, + } + + tracker.emit( + event_name, + data + ) + + @classmethod + def fetch_group_usage(cls, modulestore, structure): # pylint: disable=unused-argument + """ + Base implementation of fetch group usage on course/library. + """ + return None + + @classmethod + def supplemental_index_information(cls, modulestore, structure): + """ + Perform any supplemental indexing given that the structure object has + already been loaded. Base implementation performs no operation. 
+ + Arguments: + modulestore - modulestore object used during the indexing operation + structure - structure object loaded during the indexing job + + Returns: + None + """ + pass # lint-amnesty, pylint: disable=unnecessary-pass + + @classmethod + def supplemental_fields(cls, item): # pylint: disable=unused-argument + """ + Any supplemental fields that get added to the index for the specified + item. Base implementation returns an empty dictionary + """ + return {} + + +class CoursewareSearchIndexer(SearchIndexerBase): + """ + Class to perform indexing for courseware search from different modulestores + """ + if os.environ.get("ELASTICSEARCH_PREFIX"): + INDEX_NAME = os.environ["ELASTICSEARCH_PREFIX"] + "_" + "courseware_content" + else: + INDEX_NAME = "courseware_content" + + ENABLE_INDEXING_KEY = 'ENABLE_COURSEWARE_INDEX' + + INDEX_EVENT = { + 'name': 'edx.course.index.reindexed', + 'category': 'courseware_index' + } + + UNNAMED_MODULE_NAME = gettext_lazy("(Unnamed)") + + @classmethod + def normalize_structure_key(cls, structure_key): + """ Normalizes structure key for use in indexing """ + return structure_key + + @classmethod + def _fetch_top_level(cls, modulestore, structure_key): + """ Fetch the item from the modulestore location """ + return modulestore.get_course(structure_key, depth=None) + + @classmethod + def _get_location_info(cls, normalized_structure_key): + """ Builds location info dictionary """ + return {"course": str(normalized_structure_key), "org": normalized_structure_key.org} + + @classmethod + def do_course_reindex(cls, modulestore, course_key): + """ + (Re)index all content within the given course, tracking the fact that a full reindex has taken place + """ + return cls._do_reindex(modulestore, course_key) + + @classmethod + def _do_reindex(cls, modulestore, structure_key): + """ + (Re)index course content within the given structure. + + The course_info index is indexed with the courseware_content index. 
This method + helps to track the fact that course_info reindex has taken place. + """ + indexed_count = super()._do_reindex(modulestore, structure_key) + if indexed_count: + course_about = CourseAboutSearchIndexer + cls._track_index_request( + course_about.INDEX_EVENT['name'], + course_about.INDEX_EVENT['category'], + indexed_count + ) + return indexed_count + + @classmethod + def fetch_group_usage(cls, modulestore, structure): + groups_usage_dict = {} + partitions_info = GroupConfiguration.get_partitions_usage_info(modulestore, structure) + content_group_info = GroupConfiguration.get_content_groups_items_usage_info( + modulestore, + structure + ) + for group_info in (partitions_info, content_group_info): + for groups in group_info.values(): + for name, group in groups.items(): + for module in group: + view, args, kwargs = resolve(module['url']) # pylint: disable=unused-variable + usage_key_string = str(kwargs['usage_key_string']) + if groups_usage_dict.get(usage_key_string, None): + groups_usage_dict[usage_key_string].append(name) + else: + groups_usage_dict[usage_key_string] = [name] + return groups_usage_dict + + @classmethod + def supplemental_index_information(cls, modulestore, structure): + """ + Perform additional indexing from loaded structure object + """ + CourseAboutSearchIndexer.index_about_information(modulestore, structure) + + @classmethod + def supplemental_fields(cls, item): + """ + Add location path to the item object + + Once we've established the path of names, the first name is the course + name, and the next 3 names are the navigable path within the edx + application. Notice that we stop at that level because a full path to + deep children would be confusing. 
+ """ + location_path = [] + parent = item + while parent is not None: + path_component_name = parent.display_name + if not path_component_name: + path_component_name = str(cls.UNNAMED_MODULE_NAME) + location_path.append(path_component_name) + parent = parent.get_parent() + location_path.reverse() + return { + "course_name": location_path[0], + "location": location_path[1:4] + } + + +class LibrarySearchIndexer(SearchIndexerBase): + """ + Base class to perform indexing for library search from different modulestores + """ + if os.environ.get("ELASTICSEARCH_PREFIX"): + INDEX_NAME = os.environ["ELASTICSEARCH_PREFIX"] + "_" + "library_index" + else: + INDEX_NAME = "library_index" + + ENABLE_INDEXING_KEY = 'ENABLE_LIBRARY_INDEX' + + INDEX_EVENT = { + 'name': 'edx.library.index.reindexed', + 'category': 'library_index' + } + + @classmethod + def normalize_structure_key(cls, structure_key): + """ Normalizes structure key for use in indexing """ + return normalize_key_for_search(structure_key) + + @classmethod + def _fetch_top_level(cls, modulestore, structure_key): + """ Fetch the item from the modulestore location """ + return modulestore.get_library(structure_key, depth=None) + + @classmethod + def _get_location_info(cls, normalized_structure_key): + """ Builds location info dictionary """ + return {"library": str(normalized_structure_key)} + + @classmethod + def _id_modifier(cls, usage_id): + """ Modifies usage_id to submit to index """ + return usage_id.replace(library_key=(usage_id.library_key.replace(version_guid=None, branch=None))) + + @classmethod + def do_library_reindex(cls, modulestore, library_key): + """ + (Re)index all content within the given library, tracking the fact that a full reindex has taken place + """ + return cls._do_reindex(modulestore, library_key) + + +class AboutInfo: + """ About info structure to contain + 1) Property name to use + 2) Where to add in the index (using flags above) + 3) Where to source the properties value + """ + # Bitwise Flags 
for where to index the information + # + # ANALYSE - states that the property text contains content that we wish to be able to find matched within + # e.g. "joe" should yield a result for "I'd like to drink a cup of joe" + # + # PROPERTY - states that the property text should be a property of the indexed document, to be returned with the + # results: search matches will only be made on exact string matches + # e.g. "joe" will only match on "joe" + # + # We are using bitwise flags because one may want to add the property to EITHER or BOTH parts of the index + # e.g. university name is desired to be analysed, so that a search on "Oxford" will match + # property values "University of Oxford" and "Oxford Brookes University", + # but it is also a useful property, because within a (future) filtered search a user + # may have chosen to filter courses from "University of Oxford" + # + # see https://wiki.python.org/moin/BitwiseOperators for information about bitwise shift operator used below + # + ANALYSE = 1 << 0 # Add the information to the analysed content of the index + PROPERTY = 1 << 1 # Add the information as a property of the object being indexed (not analysed) + + def __init__(self, property_name, index_flags, source_from): + self.property_name = property_name + self.index_flags = index_flags + self.source_from = source_from + + def get_value(self, **kwargs): + """ get the value for this piece of information, using the correct source """ + return self.source_from(self, **kwargs) + + def from_about_dictionary(self, **kwargs): + """ gets the value from the kwargs provided 'about_dictionary' """ + about_dictionary = kwargs.get('about_dictionary', None) + if not about_dictionary: + raise ValueError("Context dictionary does not contain expected argument 'about_dictionary'") + + return about_dictionary.get(self.property_name, None) + + def from_course_property(self, **kwargs): + """ gets the value from the kwargs provided 'course' """ + course = kwargs.get('course', 
None) + if not course: + raise ValueError("Context dictionary does not contain expected argument 'course'") + + return getattr(course, self.property_name, None) + + def from_course_mode(self, **kwargs): + """ fetches the available course modes from the CourseMode model """ + course = kwargs.get('course', None) + if not course: + raise ValueError("Context dictionary does not contain expected argument 'course'") + + return [mode.slug for mode in CourseMode.modes_for_course(course.id)] + + # Source location options - either from the course or the about info + FROM_ABOUT_INFO = from_about_dictionary + FROM_COURSE_PROPERTY = from_course_property + FROM_COURSE_MODE = from_course_mode + + +class CourseAboutSearchIndexer(CoursewareSearchIndexer): + """ + Class to perform indexing of about information from course object + """ + if os.environ.get("ELASTICSEARCH_PREFIX"): + INDEX_NAME = os.environ["ELASTICSEARCH_PREFIX"] + "_" + "course_info" + else: + INDEX_NAME = "course_info" + + INDEX_EVENT = { + 'name': 'edx.course_info.index.reindexed', + 'category': 'course_info' + } + + # List of properties to add to the index - each item in the list is an instance of AboutInfo object + ABOUT_INFORMATION_TO_INCLUDE = [ + AboutInfo("advertised_start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY), + AboutInfo("announcement", AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY), + AboutInfo("end", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY), + AboutInfo("effort", AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("display_name", AboutInfo.ANALYSE, AboutInfo.FROM_COURSE_PROPERTY), + AboutInfo("overview", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("title", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("university", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("number", AboutInfo.ANALYSE | AboutInfo.PROPERTY, 
AboutInfo.FROM_COURSE_PROPERTY), + AboutInfo("short_description", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("description", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("key_dates", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("video", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("course_staff_short", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("course_staff_extended", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("requirements", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("syllabus", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("textbook", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("faq", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("more_info", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("ocw_links", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO), + AboutInfo("enrollment_start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY), + AboutInfo("enrollment_end", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY), + AboutInfo("org", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY), + AboutInfo("modes", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_MODE), + AboutInfo("language", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY), + AboutInfo("invitation_only", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY), + AboutInfo("catalog_visibility", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY), + ] + + @classmethod + def index_about_information(cls, modulestore, course): + """ + Add the given course to the course discovery index + + Arguments: + modulestore - modulestore object to use for operations + + course - course object from which to take properties, locate about information + """ + searcher = SearchEngine.get_search_engine(cls.INDEX_NAME) + if not searcher: + return + + course_id = str(course.id) + course_info = { + 'id': course_id, + 'course': course_id, + 'content': {}, + 'image_url': 
course_image_url(course), + } + + # load data for all of the 'about' modules for this course into a dictionary + about_dictionary = { + item.location.block_id: item.data + for item in modulestore.get_items(course.id, qualifiers={"category": "about"}) + } + + about_context = { + "course": course, + "about_dictionary": about_dictionary, + } + + for about_information in cls.ABOUT_INFORMATION_TO_INCLUDE: + # Broad exception handler so that a single bad property does not scupper the collection of others + try: + section_content = about_information.get_value(**about_context) + except Exception: # pylint: disable=broad-except + section_content = None + log.warning( + "Course discovery could not collect property %s for course %s", + about_information.property_name, + course_id, + exc_info=True, + ) + + if section_content: + if about_information.index_flags & AboutInfo.ANALYSE: + analyse_content = section_content + if isinstance(section_content, str): + analyse_content = strip_html_content_to_text(section_content) + course_info['content'][about_information.property_name] = analyse_content + if about_information.index_flags & AboutInfo.PROPERTY: + course_info[about_information.property_name] = section_content + + # Broad exception handler to protect around and report problems with indexing + try: + searcher.index([course_info]) + except: + log.exception( + "Course discovery indexing error encountered, course discovery index may be out of date %s", + course_id, + ) + raise + + log.debug( + "Successfully added %s course to the course discovery index", + course_id + ) + + @classmethod + def _get_location_info(cls, normalized_structure_key): + """ Builds location info dictionary """ + return {"course": str(normalized_structure_key), "org": normalized_structure_key.org} + + @classmethod + def remove_deleted_items(cls, structure_key): # lint-amnesty, pylint: disable=arguments-differ + """ Remove item from Course About Search_index """ + searcher = 
SearchEngine.get_search_engine(cls.INDEX_NAME) + if not searcher: + return + + response = searcher.search(field_dictionary=cls._get_location_info(structure_key)) + result_ids = [result["data"]["id"] for result in response["results"]] + searcher.remove(result_ids) \ No newline at end of file diff --git a/docker-definition/openedx_customizations/koa/cms/djangoapps/contentstore/courseware_index.py b/docker-definition/openedx_customizations/koa/cms/djangoapps/contentstore/courseware_index.py new file mode 100644 index 000000000..7574d6223 --- /dev/null +++ b/docker-definition/openedx_customizations/koa/cms/djangoapps/contentstore/courseware_index.py @@ -0,0 +1,690 @@ +""" Code to allow module store to interface with courseware index """ + +import logging +import os +import re +from abc import ABCMeta, abstractmethod +from datetime import timedelta + +from django.conf import settings +from django.urls import resolve +from django.utils.translation import gettext as _ +from django.utils.translation import gettext_lazy +from eventtracking import tracker +from search.search_engine_base import SearchEngine + +from cms.djangoapps.contentstore.course_group_config import GroupConfiguration +from common.djangoapps.course_modes.models import CourseMode +from openedx.core.lib.courses import course_image_url +from xmodule.annotator_mixin import html_to_text # lint-amnesty, pylint: disable=wrong-import-order +from xmodule.library_tools import normalize_key_for_search # lint-amnesty, pylint: disable=wrong-import-order +from xmodule.modulestore import ModuleStoreEnum # lint-amnesty, pylint: disable=wrong-import-order + +# REINDEX_AGE is the default amount of time that we look back for changes +# that might have happened. If we are provided with a time at which the +# indexing is triggered, then we know it is safe to only index items +# recently changed at that time. 
This is the time period that represents +# how far back from the trigger point to look back in order to index +REINDEX_AGE = timedelta(0, 60) # 60 seconds + +# INDEXING_REQUEST_TIMEOUT is the number of seconds before a request is considered +# timed out for courseware indexing. +INDEXING_REQUEST_TIMEOUT = 60 + +log = logging.getLogger('edx.modulestore') + + +def strip_html_content_to_text(html_content): + """ Gets only the textual part for html content - useful for building text to be searched """ + # Removing HTML-encoded non-breaking space characters + text_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(html_content)) + # Removing HTML CDATA + text_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", text_content) + # Removing HTML comments + text_content = re.sub(r"<!--.*-->", "", text_content) + + return text_content + + +def indexing_is_enabled(): + """ + Checks to see if the indexing feature is enabled + """ + return settings.FEATURES.get('ENABLE_COURSEWARE_INDEX', False) + + +class SearchIndexingError(Exception): + """ Indicates some error(s) occurred during indexing """ + + def __init__(self, message, error_list): + super().__init__(message) + self.error_list = error_list + + +class SearchIndexerBase(metaclass=ABCMeta): + """ + Base class to perform indexing for courseware or library search from different modulestores + """ + + INDEX_NAME = None + ENABLE_INDEXING_KEY = None + + INDEX_EVENT = { + 'name': None, + 'category': None + } + + @classmethod + def indexing_is_enabled(cls): + """ + Checks to see if the indexing feature is enabled + """ + return settings.FEATURES.get(cls.ENABLE_INDEXING_KEY, False) + + @classmethod + @abstractmethod + def normalize_structure_key(cls, structure_key): + """ Normalizes structure key for use in indexing """ + + @classmethod + @abstractmethod + def _fetch_top_level(cls, modulestore, structure_key): + """ Fetch the item from the modulestore location """ + + @classmethod + @abstractmethod + def _get_location_info(cls, normalized_structure_key): + """ Builds 
@classmethod
def index(cls, modulestore, structure_key, triggered_at=None, reindex_age=REINDEX_AGE, timeout=INDEXING_REQUEST_TIMEOUT):  # lint-amnesty, pylint: disable=line-too-long, too-many-statements
    """
    Process course for indexing

    Walks the published structure, collects one index dictionary per indexable
    item, submits them in a single bulk call and then removes from the index
    every item of this structure that was not seen during the walk.

    Arguments:
    modulestore - modulestore object to use for operations

    structure_key (CourseKey|LibraryKey) - course or library identifier

    triggered_at (datetime) - provides time at which indexing was triggered;
        useful for index updates - only things changed recently from that date
        (within REINDEX_AGE above ^^) will have their index updated, others skip
        updating their index but are still walked through in order to identify
        which items may need to be removed from the index
        If None, then a full reindex takes place

    reindex_age (timedelta) - window compared against
        `triggered_at - item.subtree_edited_on`; children of an item edited
        longer ago than this are walked but not re-submitted

    timeout - forwarded to the search engine as `request_timeout` for the bulk call

    Returns:
    Number of items that have been added to the index, or None when no search
    engine is available for `cls.INDEX_NAME`

    Raises:
    SearchIndexingError - when at least one item could not be prepared or the
    overall indexing step failed; its `error_list` holds the collected messages
    """
    error_list = []
    searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
    if not searcher:
        # No engine for this index: nothing is indexed and None (not 0) is returned
        return

    structure_key = cls.normalize_structure_key(structure_key)
    location_info = cls._get_location_info(structure_key)

    # Wrap counter in dictionary - otherwise we seem to lose scope inside the embedded function `prepare_item_index`
    indexed_count = {
        "count": 0
    }

    # indexed_items is a list of all the items that we wish to remain in the
    # index, whether or not we are planning to actually update their index.
    # This is used in order to build a query to remove those items not in this
    # list - those are ready to be destroyed
    indexed_items = set()

    # items_index is a list of all the items index dictionaries.
    # it is used to collect all indexes and index them using bulk API,
    # instead of per item index API call.
    items_index = []

    def get_item_location(item):
        """
        Gets the version agnostic item location
        """
        return item.location.version_agnostic().replace(branch=None)

    def prepare_item_index(item, skip_index=False, groups_usage_info=None):
        """
        Add this item to the items_index and indexed_items list

        Arguments:
        item - item to add to index, its children will be processed recursively

        skip_index - simply walk the children in the tree, the content change is
            older than the REINDEX_AGE window and would have been already indexed.
            This should really only be passed from the recursive child calls when
            this method has determined that it is safe to do so

        groups_usage_info - mapping of item location string to a list of group
            names/ids; it is updated in place for split_test items

        Returns:
        item_content_groups - content groups assigned to indexed item
            (None when the item is skipped, not indexable, or fails to index)
        """
        item_index_dictionary = item.index_dictionary()
        # if it's not indexable and it does not have children, then ignore
        if not item_index_dictionary and not item.has_children:
            return

        item_content_groups = None

        if item.category == "split_test":  # lint-amnesty, pylint: disable=too-many-nested-blocks
            split_partition = item.get_selected_partition()
            for split_test_child in item.get_children():
                if split_partition:
                    for group in split_partition.groups:
                        group_id = str(group.id)
                        child_location = item.group_id_to_child.get(group_id, None)
                        if child_location == split_test_child.location:
                            # Record the group for the matching child and for each of its components
                            # NOTE(review): groups_usage_info is None when fetch_group_usage returns None
                            # (the base implementation does) - this update would then fail; confirm intent
                            groups_usage_info.update({
                                str(get_item_location(split_test_child)): [group_id],
                            })
                            for component in split_test_child.get_children():
                                groups_usage_info.update({
                                    str(get_item_location(component)): [group_id]
                                })

        if groups_usage_info:
            item_location = get_item_location(item)
            item_content_groups = groups_usage_info.get(str(item_location), None)

        item_id = str(cls._id_modifier(item.scope_ids.usage_id))
        # Registered before any skip decision so the item survives remove_deleted_items
        indexed_items.add(item_id)
        if item.has_children:
            # determine if it's okay to skip adding the children herein based upon how recently any may have changed
            skip_child_index = skip_index or \
                (triggered_at is not None and (triggered_at - item.subtree_edited_on) > reindex_age)
            children_groups_usage = []
            for child_item in item.get_children():
                if modulestore.has_published_version(child_item):
                    children_groups_usage.append(
                        prepare_item_index(
                            child_item,
                            skip_index=skip_child_index,
                            groups_usage_info=groups_usage_info
                        )
                    )
            # Any child without content groups clears the groups of this item
            if None in children_groups_usage:
                item_content_groups = None

        if skip_index or not item_index_dictionary:
            return

        item_index = {}
        # if it has something to add to the index, then add it
        try:
            item_index.update(location_info)
            item_index.update(item_index_dictionary)
            item_index['id'] = item_id
            if item.start:
                item_index['start_date'] = item.start
            item_index['content_groups'] = item_content_groups if item_content_groups else None
            item_index.update(cls.supplemental_fields(item))
            items_index.append(item_index)
            indexed_count["count"] += 1
            return item_content_groups
        except Exception as err:  # pylint: disable=broad-except
            # broad exception so that index operation does not fail on one item of many
            log.warning('Could not index item: %s - %r', item.location, err)
            error_list.append(_('Could not index item: {}').format(item.location))

    try:
        # Only published content is read while the structure is walked and indexed
        with modulestore.branch_setting(ModuleStoreEnum.RevisionOption.published_only):
            structure = cls._fetch_top_level(modulestore, structure_key)
            groups_usage_info = cls.fetch_group_usage(modulestore, structure)

            # First perform any additional indexing from the structure object
            cls.supplemental_index_information(modulestore, structure)

            # Now index the content
            for item in structure.get_children():
                prepare_item_index(item, groups_usage_info=groups_usage_info)
            # One bulk submission for everything collected above
            searcher.index(items_index, request_timeout=timeout)
            cls.remove_deleted_items(searcher, structure_key, indexed_items)
    except Exception as err:  # pylint: disable=broad-except
        # broad exception so that index operation does not prevent the rest of the application from working
        log.exception(
            "Indexing error encountered, courseware index may be out of date %s - %r",
            structure_key,
            err
        )
        error_list.append(_('General indexing error occurred'))

    # Per-item and general failures are reported together, after the whole walk
    if error_list:
        raise SearchIndexingError('Error(s) present during indexing', error_list)

    return indexed_count["count"]
class CoursewareSearchIndexer(SearchIndexerBase):
    """
    Class to perform indexing for courseware search from different modulestores
    """
    # The index name is namespaced with ELASTICSEARCH_PREFIX when that variable is
    # set to a non-empty value. `.get` is used so that an unset variable falls back
    # to the plain index name instead of raising KeyError while the module is imported.
    if os.environ.get("ELASTICSEARCH_PREFIX"):
        INDEX_NAME = os.environ["ELASTICSEARCH_PREFIX"] + "_" + "courseware_content"
    else:
        INDEX_NAME = "courseware_content"

    ENABLE_INDEXING_KEY = 'ENABLE_COURSEWARE_INDEX'

    INDEX_EVENT = {
        'name': 'edx.course.index.reindexed',
        'category': 'courseware_index'
    }

    UNNAMED_MODULE_NAME = gettext_lazy("(Unnamed)")

    @classmethod
    def normalize_structure_key(cls, structure_key):
        """ Normalizes structure key for use in indexing (course keys are used unchanged) """
        return structure_key

    @classmethod
    def _fetch_top_level(cls, modulestore, structure_key):
        """ Fetch the course for the given key from the modulestore """
        return modulestore.get_course(structure_key, depth=None)

    @classmethod
    def _get_location_info(cls, normalized_structure_key):
        """ Builds location info dictionary holding the course id string and its org """
        return {"course": str(normalized_structure_key), "org": normalized_structure_key.org}

    @classmethod
    def do_course_reindex(cls, modulestore, course_key):
        """
        (Re)index all content within the given course, tracking the fact that a full reindex has taken place
        """
        return cls._do_reindex(modulestore, course_key)

    @classmethod
    def _do_reindex(cls, modulestore, structure_key):
        """
        (Re)index course content within the given structure.

        The course_info index is indexed with the courseware_content index. This method
        helps to track the fact that course_info reindex has taken place.

        Returns:
            Whatever the base `_do_reindex` returned (the indexed item count)
        """
        indexed_count = super()._do_reindex(modulestore, structure_key)
        if indexed_count:
            # Emit the course_info event as well, reusing the courseware count
            course_about = CourseAboutSearchIndexer
            cls._track_index_request(
                course_about.INDEX_EVENT['name'],
                course_about.INDEX_EVENT['category'],
                indexed_count
            )
        return indexed_count

    @classmethod
    def fetch_group_usage(cls, modulestore, structure):
        """
        Build a mapping of usage key string to the list of group names that use it.

        Both the partitions usage info and the content groups usage info reported
        by GroupConfiguration are merged into the same dictionary.
        """
        groups_usage_dict = {}
        partitions_info = GroupConfiguration.get_partitions_usage_info(modulestore, structure)
        content_group_info = GroupConfiguration.get_content_groups_items_usage_info(
            modulestore,
            structure
        )
        for group_info in (partitions_info, content_group_info):
            for groups in group_info.values():
                for name, group in groups.items():
                    for module in group:
                        # The usage key is recovered from the keyword arguments of the resolved URL
                        view, args, kwargs = resolve(module['url'])  # pylint: disable=unused-variable
                        usage_key_string = str(kwargs['usage_key_string'])
                        if groups_usage_dict.get(usage_key_string, None):
                            groups_usage_dict[usage_key_string].append(name)
                        else:
                            groups_usage_dict[usage_key_string] = [name]
        return groups_usage_dict

    @classmethod
    def supplemental_index_information(cls, modulestore, structure):
        """
        Perform additional indexing from loaded structure object
        """
        CourseAboutSearchIndexer.index_about_information(modulestore, structure)

    @classmethod
    def supplemental_fields(cls, item):
        """
        Add location path to the item object

        Once we've established the path of names, the first name is the course
        name, and the next 3 names are the navigable path within the edx
        application. Notice that we stop at that level because a full path to
        deep children would be confusing.
        """
        location_path = []
        parent = item
        # Walk up to the root, collecting display names (leaf first)
        while parent is not None:
            path_component_name = parent.display_name
            if not path_component_name:
                path_component_name = str(cls.UNNAMED_MODULE_NAME)
            location_path.append(path_component_name)
            parent = parent.get_parent()
        location_path.reverse()
        return {
            "course_name": location_path[0],
            "location": location_path[1:4]
        }
def from_about_dictionary(self, **kwargs):
    """
    Source this property's value from the 'about_dictionary' keyword argument.

    Returns the entry stored under `self.property_name`, or None when the
    dictionary has no such entry. Raises ValueError when 'about_dictionary'
    is absent from kwargs or is empty.
    """
    about_data = kwargs.get('about_dictionary')
    if about_data:
        return about_data.get(self.property_name)
    raise ValueError("Context dictionary does not contain expected argument 'about_dictionary'")
class CourseAboutSearchIndexer(CoursewareSearchIndexer):
    """
    Class to perform indexing of about information from course object
    """
    # The index name is namespaced with ELASTICSEARCH_PREFIX when that variable is
    # set to a non-empty value. `.get` is used so that an unset variable falls back
    # to the plain index name instead of raising KeyError while the module is imported.
    if os.environ.get("ELASTICSEARCH_PREFIX"):
        INDEX_NAME = os.environ["ELASTICSEARCH_PREFIX"] + "_" + "course_info"
    else:
        INDEX_NAME = "course_info"

    INDEX_EVENT = {
        'name': 'edx.course_info.index.reindexed',
        'category': 'course_info'
    }

    # List of properties to add to the index - each item in the list is an instance of AboutInfo object
    ABOUT_INFORMATION_TO_INCLUDE = [
        AboutInfo("advertised_start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("announcement", AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("end", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("effort", AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("display_name", AboutInfo.ANALYSE, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("overview", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("title", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("university", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("number", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("short_description", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("description", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("key_dates", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("video", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("course_staff_short", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("course_staff_extended", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("requirements", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("syllabus", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("textbook", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("faq", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("more_info", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("ocw_links", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("enrollment_start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("enrollment_end", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("org", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("modes", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_MODE),
        AboutInfo("language", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("invitation_only", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("catalog_visibility", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
    ]

    @classmethod
    def index_about_information(cls, modulestore, course):
        """
        Add the given course to the course discovery index

        Arguments:
        modulestore - modulestore object to use for operations

        course - course object from which to take properties, locate about information

        Returns:
        None. Nothing is indexed when no search engine is available for `cls.INDEX_NAME`.

        Raises:
        Re-raises whatever the search engine raised while indexing, after logging it.
        """
        searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
        if not searcher:
            return

        course_id = str(course.id)
        course_info = {
            'id': course_id,
            'course': course_id,
            'content': {},
            'image_url': course_image_url(course),
        }

        # load data for all of the 'about' modules for this course into a dictionary
        about_dictionary = {
            item.location.block_id: item.data
            for item in modulestore.get_items(course.id, qualifiers={"category": "about"})
        }

        about_context = {
            "course": course,
            "about_dictionary": about_dictionary,
        }

        for about_information in cls.ABOUT_INFORMATION_TO_INCLUDE:
            # Broad exception handler so that a single bad property does not scupper the collection of others.
            # `Exception` rather than a bare except, so KeyboardInterrupt/SystemExit are not swallowed here.
            try:
                section_content = about_information.get_value(**about_context)
            except Exception:  # pylint: disable=broad-except
                section_content = None
                log.warning(
                    "Course discovery could not collect property %s for course %s",
                    about_information.property_name,
                    course_id,
                    exc_info=True,
                )

            if section_content:
                if about_information.index_flags & AboutInfo.ANALYSE:
                    # Analysed content is stored under 'content'; strings are reduced to plain text first
                    analyse_content = section_content
                    if isinstance(section_content, str):
                        analyse_content = strip_html_content_to_text(section_content)
                    course_info['content'][about_information.property_name] = analyse_content
                if about_information.index_flags & AboutInfo.PROPERTY:
                    # Properties are stored unmodified at the top level of the document
                    course_info[about_information.property_name] = section_content

        # Broad exception handler to protect around and report problems with indexing
        try:
            searcher.index([course_info])
        except Exception:
            log.exception(
                "Course discovery indexing error encountered, course discovery index may be out of date %s",
                course_id,
            )
            raise

        log.debug(
            "Successfully added %s course to the course discovery index",
            course_id
        )

    @classmethod
    def _get_location_info(cls, normalized_structure_key):
        """ Builds location info dictionary holding the course id string and its org """
        return {"course": str(normalized_structure_key), "org": normalized_structure_key.org}

    @classmethod
    def remove_deleted_items(cls, structure_key):  # lint-amnesty, pylint: disable=arguments-differ
        """
        Remove item from Course About Search_index

        Looks up the search engine itself and removes every document matching the
        location info of `structure_key`; does nothing when no engine is available.
        NOTE(review): the signature differs from the base class version, which takes
        (searcher, structure_key, exclude_items) - confirm no caller reaches this
        override with the three-argument form.
        """
        searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
        if not searcher:
            return

        response = searcher.search(field_dictionary=cls._get_location_info(structure_key))
        result_ids = [result["data"]["id"] for result in response["results"]]
        searcher.remove(result_ids)
class SearchIndexingError(Exception):
    """
    Raised when one or more errors occurred during indexing.

    The individual problems are kept on `error_list`; `message` becomes the
    regular exception message.
    """

    def __init__(self, message, error_list):
        Exception.__init__(self, message)
        # Exposed as-is (no copy) so callers can inspect what went wrong
        self.error_list = error_list
@classmethod
def remove_deleted_items(cls, searcher, structure_key, exclude_items):
    """
    Purge stale entries of a structure from the search index.

    Searches for everything matching the structure's location info except the
    ids in `exclude_items` (the items that should stay), then removes whatever
    that search returned.
    """
    location_filter = cls._get_location_info(structure_key)
    ids_to_keep = list(exclude_items)
    stale_hits = searcher.search(
        field_dictionary=location_filter,
        exclude_dictionary={"id": ids_to_keep},
    )
    stale_ids = []
    for hit in stale_hits["results"]:
        stale_ids.append(hit["data"]["id"])
    searcher.remove(stale_ids)
planning to actually update their index. + # This is used in order to build a query to remove those items not in this + # list - those are ready to be destroyed + indexed_items = set() + + # items_index is a list of all the items index dictionaries. + # it is used to collect all indexes and index them using bulk API, + # instead of per item index API call. + items_index = [] + + def get_item_location(item): + """ + Gets the version agnostic item location + """ + return item.location.version_agnostic().replace(branch=None) + + def prepare_item_index(item, skip_index=False, groups_usage_info=None): + """ + Add this item to the items_index and indexed_items list + + Arguments: + item - item to add to index, its children will be processed recursively + + skip_index - simply walk the children in the tree, the content change is + older than the REINDEX_AGE window and would have been already indexed. + This should really only be passed from the recursive child calls when + this method has determined that it is safe to do so + + Returns: + item_content_groups - content groups assigned to indexed item + """ + item_index_dictionary = item.index_dictionary() + # if it's not indexable and it does not have children, then ignore + if not item_index_dictionary and not item.has_children: + return + + item_content_groups = None + + if item.category == "split_test": # lint-amnesty, pylint: disable=too-many-nested-blocks + split_partition = item.get_selected_partition() + for split_test_child in item.get_children(): + if split_partition: + for group in split_partition.groups: + group_id = str(group.id) + child_location = item.group_id_to_child.get(group_id, None) + if child_location == split_test_child.location: + groups_usage_info.update({ + str(get_item_location(split_test_child)): [group_id], + }) + for component in split_test_child.get_children(): + groups_usage_info.update({ + str(get_item_location(component)): [group_id] + }) + + if groups_usage_info: + item_location = 
get_item_location(item) + item_content_groups = groups_usage_info.get(str(item_location), None) + + item_id = str(cls._id_modifier(item.scope_ids.usage_id)) + indexed_items.add(item_id) + if item.has_children: + # determine if it's okay to skip adding the children herein based upon how recently any may have changed + skip_child_index = skip_index or \ + (triggered_at is not None and (triggered_at - item.subtree_edited_on) > reindex_age) + children_groups_usage = [] + for child_item in item.get_children(): + if modulestore.has_published_version(child_item): + children_groups_usage.append( + prepare_item_index( + child_item, + skip_index=skip_child_index, + groups_usage_info=groups_usage_info + ) + ) + if None in children_groups_usage: + item_content_groups = None + + if skip_index or not item_index_dictionary: + return + + item_index = {} + # if it has something to add to the index, then add it + try: + item_index.update(location_info) + item_index.update(item_index_dictionary) + item_index['id'] = item_id + if item.start: + item_index['start_date'] = item.start + item_index['content_groups'] = item_content_groups if item_content_groups else None + item_index.update(cls.supplemental_fields(item)) + items_index.append(item_index) + indexed_count["count"] += 1 + return item_content_groups + except Exception as err: # pylint: disable=broad-except + # broad exception so that index operation does not fail on one item of many + log.warning('Could not index item: %s - %r', item.location, err) + error_list.append(_('Could not index item: {}').format(item.location)) + + try: + with modulestore.branch_setting(ModuleStoreEnum.RevisionOption.published_only): + structure = cls._fetch_top_level(modulestore, structure_key) + groups_usage_info = cls.fetch_group_usage(modulestore, structure) + + # First perform any additional indexing from the structure object + cls.supplemental_index_information(modulestore, structure) + + # Now index the content + for item in 
@classmethod
def _do_reindex(cls, modulestore, structure_key):
    """
    Run a full (re)index of the given course or library structure.

    When the index call reports a truthy item count, a tracking event is
    emitted using this class's INDEX_EVENT name and category. The count
    returned by the index call is handed back unchanged.
    """
    total = cls.index(modulestore, structure_key)
    if not total:
        # Nothing indexed (0 or None): no tracking event
        return total
    cls._track_index_request(cls.INDEX_EVENT['name'], cls.INDEX_EVENT['category'], total)
    return total
# The index name is namespaced with ELASTICSEARCH_PREFIX when that variable is
# set to a non-empty value. `.get` is used so that an unset variable falls back
# to the plain index name instead of raising KeyError while the module is imported.
if os.environ.get("ELASTICSEARCH_PREFIX"):
    INDEX_NAME = os.environ["ELASTICSEARCH_PREFIX"] + "_" + "courseware_content"
else:
    INDEX_NAME = "courseware_content"
This method + helps to track the fact that course_info reindex has taken place. + """ + indexed_count = super()._do_reindex(modulestore, structure_key) + if indexed_count: + course_about = CourseAboutSearchIndexer + cls._track_index_request( + course_about.INDEX_EVENT['name'], + course_about.INDEX_EVENT['category'], + indexed_count + ) + return indexed_count + + @classmethod + def fetch_group_usage(cls, modulestore, structure): + groups_usage_dict = {} + partitions_info = GroupConfiguration.get_partitions_usage_info(modulestore, structure) + content_group_info = GroupConfiguration.get_content_groups_items_usage_info( + modulestore, + structure + ) + for group_info in (partitions_info, content_group_info): + for groups in group_info.values(): + for name, group in groups.items(): + for module in group: + view, args, kwargs = resolve(module['url']) # pylint: disable=unused-variable + usage_key_string = str(kwargs['usage_key_string']) + if groups_usage_dict.get(usage_key_string, None): + groups_usage_dict[usage_key_string].append(name) + else: + groups_usage_dict[usage_key_string] = [name] + return groups_usage_dict + + @classmethod + def supplemental_index_information(cls, modulestore, structure): + """ + Perform additional indexing from loaded structure object + """ + CourseAboutSearchIndexer.index_about_information(modulestore, structure) + + @classmethod + def supplemental_fields(cls, item): + """ + Add location path to the item object + + Once we've established the path of names, the first name is the course + name, and the next 3 names are the navigable path within the edx + application. Notice that we stop at that level because a full path to + deep children would be confusing. 
+ """ + location_path = [] + parent = item + while parent is not None: + path_component_name = parent.display_name + if not path_component_name: + path_component_name = str(cls.UNNAMED_MODULE_NAME) + location_path.append(path_component_name) + parent = parent.get_parent() + location_path.reverse() + return { + "course_name": location_path[0], + "location": location_path[1:4] + } + + +class LibrarySearchIndexer(SearchIndexerBase): + """ + Base class to perform indexing for library search from different modulestores + """ + if os.environ["ELASTICSEARCH_PREFIX"]: + INDEX_NAME = os.environ["ELASTICSEARCH_PREFIX"] + "_" + "library_index" + else: + INDEX_NAME = "library_index" + + ENABLE_INDEXING_KEY = 'ENABLE_LIBRARY_INDEX' + + INDEX_EVENT = { + 'name': 'edx.library.index.reindexed', + 'category': 'library_index' + } + + @classmethod + def normalize_structure_key(cls, structure_key): + """ Normalizes structure key for use in indexing """ + return normalize_key_for_search(structure_key) + + @classmethod + def _fetch_top_level(cls, modulestore, structure_key): + """ Fetch the item from the modulestore location """ + return modulestore.get_library(structure_key, depth=None) + + @classmethod + def _get_location_info(cls, normalized_structure_key): + """ Builds location info dictionary """ + return {"library": str(normalized_structure_key)} + + @classmethod + def _id_modifier(cls, usage_id): + """ Modifies usage_id to submit to index """ + return usage_id.replace(library_key=(usage_id.library_key.replace(version_guid=None, branch=None))) + + @classmethod + def do_library_reindex(cls, modulestore, library_key): + """ + (Re)index all content within the given library, tracking the fact that a full reindex has taken place + """ + return cls._do_reindex(modulestore, library_key) + + +class AboutInfo: + """ About info structure to contain + 1) Property name to use + 2) Where to add in the index (using flags above) + 3) Where to source the properties value + """ + # Bitwise Flags 
def from_about_dictionary(self, **kwargs):
    """
    Get the value from the 'about_dictionary' provided in kwargs.

    Arguments:
        **kwargs - context; must contain 'about_dictionary', a mapping that
            is looked up by this object's `property_name`

    Returns:
        the entry stored under `self.property_name`, or None when the
        dictionary has no such entry (including when it is empty)

    Raises:
        ValueError - when 'about_dictionary' is absent from kwargs (or None)
    """
    about_dictionary = kwargs.get('about_dictionary')
    # Only a missing dictionary is an error. An empty one is a valid context
    # that simply has no value for this property.
    if about_dictionary is None:
        raise ValueError("Context dictionary does not contain expected argument 'about_dictionary'")

    return about_dictionary.get(self.property_name)
class CourseAboutSearchIndexer(CoursewareSearchIndexer):
    """
    Class to perform indexing of about information from course object
    """
    # The index name is optionally namespaced with the ELASTICSEARCH_PREFIX
    # environment variable. `.get` is used so that an unset variable falls
    # back to the plain index name instead of raising KeyError at import time.
    if os.environ.get("ELASTICSEARCH_PREFIX"):
        INDEX_NAME = os.environ["ELASTICSEARCH_PREFIX"] + "_" + "course_info"
    else:
        INDEX_NAME = "course_info"

    INDEX_EVENT = {
        'name': 'edx.course_info.index.reindexed',
        'category': 'course_info'
    }

    # List of properties to add to the index - each item in the list is an instance of AboutInfo object
    ABOUT_INFORMATION_TO_INCLUDE = [
        AboutInfo("advertised_start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("announcement", AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("end", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("effort", AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("display_name", AboutInfo.ANALYSE, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("overview", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("title", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("university", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("number", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("short_description", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("description", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("key_dates", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("video", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("course_staff_short", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("course_staff_extended", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("requirements", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("syllabus", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("textbook", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("faq", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("more_info", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("ocw_links", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("enrollment_start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("enrollment_end", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("org", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("modes", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_MODE),
        AboutInfo("language", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("invitation_only", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("catalog_visibility", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
    ]

    @classmethod
    def index_about_information(cls, modulestore, course):
        """
        Add the given course to the course discovery index

        Arguments:
        modulestore - modulestore object to use for operations

        course - course object from which to take properties, locate about information

        Returns:
        None. Returns early, without indexing, when no search engine is
        available for this index.

        Raises:
        Any exception raised by the search engine while indexing is logged
        and re-raised.
        """
        searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
        if not searcher:
            return

        course_id = str(course.id)
        course_info = {
            'id': course_id,
            'course': course_id,
            'content': {},
            'image_url': course_image_url(course),
        }

        # load data for all of the 'about' modules for this course into a dictionary
        about_dictionary = {
            item.location.block_id: item.data
            for item in modulestore.get_items(course.id, qualifiers={"category": "about"})
        }

        about_context = {
            "course": course,
            "about_dictionary": about_dictionary,
        }

        for about_information in cls.ABOUT_INFORMATION_TO_INCLUDE:
            # Broad exception handler so that a single bad property does not scupper the collection of others.
            # `Exception` (not a bare except) so SystemExit/KeyboardInterrupt still propagate.
            try:
                section_content = about_information.get_value(**about_context)
            except Exception:  # pylint: disable=broad-except
                section_content = None
                log.warning(
                    "Course discovery could not collect property %s for course %s",
                    about_information.property_name,
                    course_id,
                    exc_info=True,
                )

            if section_content:
                if about_information.index_flags & AboutInfo.ANALYSE:
                    analyse_content = section_content
                    # Only string content is stripped of HTML; other values are indexed as-is
                    if isinstance(section_content, str):
                        analyse_content = strip_html_content_to_text(section_content)
                    course_info['content'][about_information.property_name] = analyse_content
                if about_information.index_flags & AboutInfo.PROPERTY:
                    course_info[about_information.property_name] = section_content

        # Broad exception handler to protect around and report problems with indexing
        try:
            searcher.index([course_info])
        except Exception:
            log.exception(
                "Course discovery indexing error encountered, course discovery index may be out of date %s",
                course_id,
            )
            raise

        log.debug(
            "Successfully added %s course to the course discovery index",
            course_id
        )

    @classmethod
    def _get_location_info(cls, normalized_structure_key):
        """ Builds location info dictionary """
        return {"course": str(normalized_structure_key), "org": normalized_structure_key.org}

    @classmethod
    def remove_deleted_items(cls, structure_key):  # lint-amnesty, pylint: disable=arguments-differ
        """
        Remove item from Course About Search_index

        Searches this class's index for documents matching the location info
        of `structure_key` and removes every result by id. Does nothing when
        no search engine is available.

        NOTE(review): this takes only `structure_key`, whereas the indexing
        routine earlier in this file calls
        `cls.remove_deleted_items(searcher, structure_key, indexed_items)`.
        Confirm that routine is never run against this class.
        """
        searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
        if not searcher:
            return

        response = searcher.search(field_dictionary=cls._get_location_info(structure_key))
        result_ids = [result["data"]["id"] for result in response["results"]]
        searcher.remove(result_ids)