diff --git a/copy_text/text_bot.py b/copy_text/text_bot.py
index 846d6146..25b5a21f 100644
--- a/copy_text/text_bot.py
+++ b/copy_text/text_bot.py
@@ -3,16 +3,13 @@
 """
 import re
+from functools import lru_cache
 from apis import mdwiki_api
 from copy_to_en.bots import text_changes  # text = text_changes.work(text)
 from copy_to_en.bots.ref import fix_ref  # text = fix_ref(first, alltext)
 from mdapi_sql import sql_for_mdwiki
 
-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()
 
 full_translate = sql_for_mdwiki.select_md_sql("select DISTINCT tt_title from translate_type where tt_full = 1;", return_dict=True)
@@ -40,16 +37,19 @@ def get_cats(alltext):
     return cats_text
 
 
+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid
 
 
-def get_un_wb_tag(alltext, x):
+@lru_cache(maxsize=128)
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
@@ -57,13 +57,6 @@ def get_un_wb_tag(alltext, x):
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    #     unlinkedwikibase = m
-    #     break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase
 
 
@@ -96,7 +89,7 @@ def get_text(x):
     # ---
     page_cats = get_cats(alltext)
     # ---
-    unlinkedwikibase = get_un_wb_tag(alltext, x)
+    unlinkedwikibase = get_un_wb_tag(x)
     # ---
     first = alltext.split("==")[0].strip()
     # ---
diff --git a/copy_to_en/mdwikicx.py b/copy_to_en/mdwikicx.py
index 856d671b..11a52b62 100644
--- a/copy_to_en/mdwikicx.py
+++ b/copy_to_en/mdwikicx.py
@@ -12,6 +12,7 @@
 import json
 import sys
 import re
+from functools import lru_cache
 from pathlib import Path
 from multiprocessing import Pool
 from apis import cat_cach
@@ -27,10 +28,6 @@
 
 Dir = Path(__file__).parent
 
-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()
 # {'RTT': 1, 'RTTCovid': 0, 'RTTHearing': 0, 'RTTOSH': 0, 'World Health Organization essential medicines': 0, 'WHRTT': 0, 'RTTILAE': 0, 'RTTDZ': 0}
 # print(mdwiki_cats)
@@ -66,16 +63,19 @@ def medwiki_cat_members(cat="Category:Mdwiki Translation Dashboard articles"):
     return cat_members
 
 
+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid
 
 
-def get_un_wb_tag(alltext, x):
+@lru_cache(maxsize=128)
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
@@ -83,13 +83,6 @@ def get_un_wb_tag(alltext, x):
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    #     unlinkedwikibase = m
-    #     break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase
 
 
@@ -115,7 +108,7 @@ def get_text(x):
     # ---
     page_cats = get_cats(alltext)
     # ---
-    unlinkedwikibase = get_un_wb_tag(alltext, x)
+    unlinkedwikibase = get_un_wb_tag(x)
     # ---
     first = alltext.split("==")[0].strip()
     # ---
@@ -147,10 +140,10 @@ def one_page(x):
     if new_title.find("/") != -1:
         new_title_all = f"Md:{x}/fulltext"
         # ---
-        alltext = text_cache.get(x)
+        alltext, _ = get_text_revid(x)
         # ---
         if alltext:
-            unlinked_tag = un_wb_tag_cache.get(x, "")
+            unlinked_tag = get_un_wb_tag(x)
             # ---
             alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
             titles[new_title_all] = alltext
diff --git a/copy_to_en/medwiki.py b/copy_to_en/medwiki.py
index fd2c0f61..fc4d13d7 100644
--- a/copy_to_en/medwiki.py
+++ b/copy_to_en/medwiki.py
@@ -14,6 +14,7 @@
 import json
 import sys
 import re
+from functools import lru_cache
 from pathlib import Path
 from multiprocessing import Pool
 from apis import cat_cach
@@ -29,10 +30,6 @@
 
 Dir = Path(__file__).parent
 
-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()
 # {'RTT': 1, 'RTTCovid': 0, 'RTTHearing': 0, 'RTTOSH': 0, 'World Health Organization essential medicines': 0, 'WHRTT': 0, 'RTTILAE': 0, 'RTTDZ': 0}
 # print(mdwiki_cats)
@@ -115,16 +112,19 @@ def medwiki_cat_members(cat="Category:Mdwiki Translation Dashboard articles"):
     return cat_members
 
 
+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid
 
 
-def get_un_wb_tag(alltext, x):
+@lru_cache(maxsize=128)
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
@@ -132,13 +132,6 @@ def get_un_wb_tag(alltext, x):
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    #     unlinkedwikibase = m
-    #     break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase
 
 
@@ -167,7 +160,7 @@ def get_text(x):
     # ---
     page_cats = get_cats(alltext)
     # ---
-    unlinkedwikibase = get_un_wb_tag(alltext, x)
+    unlinkedwikibase = get_un_wb_tag(x)
     # ---
     first = alltext.split("==")[0].strip()
     # ---
@@ -199,10 +192,10 @@ def one_page(x):
     if new_title.find("/") != -1:
         new_title_all = f"Md:{x}/fulltext"
         # ---
-        alltext = text_cache.get(x)
+        alltext, _ = get_text_revid(x)
         # ---
         if alltext:
-            unlinked_tag = un_wb_tag_cache.get(x, "")
+            unlinked_tag = get_un_wb_tag(x)
             # ---
             alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
             titles[new_title_all] = alltext
diff --git a/wprefs/bots/replace_except.py b/wprefs/bots/replace_except.py
index c33aea7f..0e7bd090 100644
--- a/wprefs/bots/replace_except.py
+++ b/wprefs/bots/replace_except.py
@@ -1,8 +1,7 @@
 import re
 import sys
 from contextlib import suppress
-
-_regex_cache = {}
+from functools import lru_cache
 
 NESTED_TEMPLATE_REGEX = re.compile(
     r"""
@@ -102,54 +101,44 @@ def _tag_pattern(tag_name: str) -> str:
     return r'<{0}(?:>|\s+[^>]*(?<!/)>)' r'[\s\S]*?' r'</{0}\s*>'.format(ignore_case(tag_name))  # start tag  # contents  # end tag
 
 
-def _create_default_regexes() -> None:
-    """Fill (and possibly overwrite) _regex_cache with default regexes."""
-    _regex_cache.update(
-        {
-            # categories
-            'category': (r'\[\[ *(?:%s)\s*:.*?\]\]', lambda site: '|'.join(site.namespaces[14])),
-            'comment': re.compile(r'<!--[\s\S]*?-->'),
-            # files
-            'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])),
-            # section headers
-            'header': re.compile(r'(?:(?<=\n)|\A)(?:<!--[^\n]*-->)*' r'=(?:[^\n]|<!--[^\n]*-->)+=' r' *(?:<!--[^\n]*--> *)*(?=\n|\Z)'),
-            # external links
-            'hyperlink': compileLinkR(),
-            # also finds links to foreign sites with preleading ":"
-            'interwiki': (r'\[\[:?(%s)\s?:[^\]]*\]\]\s*', lambda site: '|'.join(ignore_case(i) for i in site.validLanguageLinks() + list(site.family.obsolete.keys()))),
-            # Module invocations (currently only Lua)
-            'invoke': (r'\{\{\s*\#(?:%s):[\s\S]*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('invoke'))),
-            # this matches internal wikilinks, but also interwiki, categories, and
-            # images.
-            'link': re.compile(r'\[\[[^\]|]*(\|[^\]]*)?\]\]'),
-            # pagelist tag (used in Proofread extension).
-            'pagelist': re.compile(r'<{}[\s\S]*?/>'.format(ignore_case('pagelist'))),
-            # Wikibase property inclusions
-            'property': (r'\{\{\s*\#(?:%s):\s*[Pp]\d+.*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('property'))),
-            # lines that start with a colon or more will be indented
-            'startcolon': re.compile(r'(?:(?<=\n)|\A):(.*?)(?=\n|\Z)'),
-            # lines that start with a space are shown in a monospace font and
-            # have whitespace preserved.
-            'startspace': re.compile(r'(?:(?<=\n)|\A) (.*?)(?=\n|\Z)'),
-            # tables often have whitespace that is used to improve wiki
-            # source code readability.
-            # TODO: handle nested tables.
-            'table': re.compile(r'(?:(?<=\n)|\A){\|[\S\s]*?\n\|}|%s' % _tag_pattern('table')),
-            'template': NESTED_TEMPLATE_REGEX,
-        }
-    )
-
-
+# Default regex patterns - static patterns that don't require site-specific data
+_DEFAULT_REGEXES = {
+    'comment': re.compile(r'<!--[\s\S]*?-->'),
+    'header': re.compile(r'(?:(?<=\n)|\A)(?:<!--[^\n]*-->)*' r'=(?:[^\n]|<!--[^\n]*-->)+=' r' *(?:<!--[^\n]*--> *)*(?=\n|\Z)'),
+    'hyperlink': compileLinkR(),
+    'link': re.compile(r'\[\[[^\]|]*(\|[^\]]*)?\]\]'),
+    'pagelist': re.compile(r'<{}[\s\S]*?/>'.format(ignore_case('pagelist'))),
+    'startcolon': re.compile(r'(?:(?<=\n)|\A):(.*?)(?=\n|\Z)'),
+    'startspace': re.compile(r'(?:(?<=\n)|\A) (.*?)(?=\n|\Z)'),
+    'table': re.compile(r'(?:(?<=\n)|\A){\|[\S\s]*?\n\|}|%s' % _tag_pattern('table')),
+    'template': NESTED_TEMPLATE_REGEX,
+}
+
+# Patterns that require site-specific data (pattern, site_func)
+_SITE_SPECIFIC_PATTERNS = {
+    'category': (r'\[\[ *(?:%s)\s*:.*?\]\]', lambda site: '|'.join(site.namespaces[14])),
+    'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])),
+    'interwiki': (r'\[\[:?(%s)\s?:[^\]]*\]\]\s*', lambda site: '|'.join(ignore_case(i) for i in site.validLanguageLinks() + list(site.family.obsolete.keys()))),
+    'invoke': (r'\{\{\s*\#(?:%s):[\s\S]*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('invoke'))),
+    'property': (r'\{\{\s*\#(?:%s):\s*[Pp]\d+.*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('property'))),
+}
+
+
+@lru_cache(maxsize=128)
 def _tag_regex(tag_name: str):
     """Return a compiled tag regex for the given tag name."""
     return re.compile(_tag_pattern(tag_name))
 
 
+@lru_cache(maxsize=128)
+def _compile_site_regex(exc: str, site_data: str):
+    """Compile a site-specific regex pattern with the given site data."""
+    re_text, _ = _SITE_SPECIFIC_PATTERNS[exc]
+    return re.compile(re_text % site_data, re.VERBOSE)
+
+
 def _get_regexes(keys, site):
     """Fetch compiled regexes."""
-    if not _regex_cache:
-        _create_default_regexes()
-
     result = []
 
     for exc in keys:
@@ -158,25 +147,21 @@ def _get_regexes(keys, site):
             result.append(exc)
             continue
 
-        # assume the string is a reference to a standard regex above,
-        # which may not yet have a site specific re compiled.
-        if exc in _regex_cache:
-            if isinstance(_regex_cache[exc], tuple):
-                if not site and exc in ('interwiki', 'property', 'invoke', 'category', 'file'):
-                    raise ValueError(f"Site cannot be None for the '{exc}' regex")
-
-                if (exc, site) not in _regex_cache:
-                    re_text, re_var = _regex_cache[exc]
-                    _regex_cache[(exc, site)] = re.compile(re_text % re_var(site), re.VERBOSE)
-
-                result.append(_regex_cache[(exc, site)])
-            else:
-                result.append(_regex_cache[exc])
+        # Check if it's a default static regex
+        if exc in _DEFAULT_REGEXES:
+            result.append(_DEFAULT_REGEXES[exc])
+        # Check if it's a site-specific pattern
+        elif exc in _SITE_SPECIFIC_PATTERNS:
+            if not site:
+                raise ValueError(f"Site cannot be None for the '{exc}' regex")
+            # Extract site-specific data and use it as cache key
+            _, site_func = _SITE_SPECIFIC_PATTERNS[exc]
+            site_data = site_func(site)
+            result.append(_compile_site_regex(exc, site_data))
         else:
             # nowiki, noinclude, includeonly, timeline, math and other
-            # extensions
-            _regex_cache[exc] = _tag_regex(exc)
-            result.append(_regex_cache[exc])
+            # extensions - use cached tag regex
+            result.append(_tag_regex(exc))
 
         # handle aliases
         if exc == 'source':
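
A minimal standalone sketch of the lru_cache behavior this patch relies on, with a stubbed fetch in place of mdwiki_api.GetPageText (the stub and the sample title are hypothetical, for illustration only):

    from functools import lru_cache

    def fetch_page(title):
        # Hypothetical stand-in for mdwiki_api.GetPageText(title, get_revid=True).
        print(f"fetching {title} from the API")
        return f"wikitext of {title}", 12345

    @lru_cache(maxsize=128)
    def get_text_revid(title):
        # First call per title performs the fetch; later calls return the memoized tuple.
        return fetch_page(title)

    get_text_revid("Aspirin")            # prints: fetching Aspirin from the API
    get_text_revid("Aspirin")            # cache hit, no second fetch
    print(get_text_revid.cache_info())   # CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)
    get_text_revid.cache_clear()         # available if a long-running job needs to refetch

Because lru_cache keys on the function arguments, get_un_wb_tag(x) now derives alltext from the cached get_text_revid(x) instead of receiving the text as a parameter, keeping the cache key a single hashable title string; the same reasoning applies in _get_regexes, where the site object is reduced to the site_data string before it is passed to _compile_site_regex as part of its cache key.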