From b9a07428b0291949ab2a511abcf5e58cbb434f1b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Dec 2025 10:41:35 +0000
Subject: [PATCH 1/3] Initial plan


From 84257d601947478c780f5c30d4e4abae32316d20 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Dec 2025 10:49:15 +0000
Subject: [PATCH 2/3] Replace local cache dictionaries with functools lru_cache

Co-authored-by: MrIbrahem <26301308+MrIbrahem@users.noreply.github.com>
---
 copy_text/text_bot.py         |  17 +-----
 copy_to_en/mdwikicx.py        |  21 ++-----
 copy_to_en/medwiki.py         |  21 ++-----
 wprefs/bots/replace_except.py | 103 ++++++++++++++--------------------
 4 files changed, 56 insertions(+), 106 deletions(-)

diff --git a/copy_text/text_bot.py b/copy_text/text_bot.py
index 846d6146..e82fdeb3 100644
--- a/copy_text/text_bot.py
+++ b/copy_text/text_bot.py
@@ -3,16 +3,13 @@
 """
 
 import re
+from functools import lru_cache
 
 from apis import mdwiki_api
 from copy_to_en.bots import text_changes  # text = text_changes.work(text)
 from copy_to_en.bots.ref import fix_ref  # text = fix_ref(first, alltext)
 from mdapi_sql import sql_for_mdwiki
 
-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()
 
 full_translate = sql_for_mdwiki.select_md_sql("select DISTINCT tt_title from translate_type where tt_full = 1;", return_dict=True)
@@ -40,15 +37,14 @@ def get_cats(alltext):
     return cats_text
 
 
+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid
 
 
+@lru_cache(maxsize=128)
 def get_un_wb_tag(alltext, x):
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
@@ -57,13 +53,6 @@ def get_un_wb_tag(alltext, x):
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    #     unlinkedwikibase = m
-    #     break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase
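The change above is the pattern the whole series applies: a hand-maintained module-level dict cache is replaced by functools.lru_cache, which memoises on the argument values and evicts the least-recently-used entry once maxsize results are stored. A minimal, self-contained sketch of the before/after; fetch_page here is a stand-in for the real mdwiki_api.GetPageText call, not part of the patch:

    from functools import lru_cache

    def fetch_page(title: str) -> str:
        # Stand-in for the expensive call (mdwiki_api.GetPageText in the patch).
        return f"wikitext of {title}"

    # Before: a module-level dict, filled by hand and never evicted.
    _page_cache: dict = {}

    def get_page_dict(title: str) -> str:
        if title not in _page_cache:
            _page_cache[title] = fetch_page(title)
        return _page_cache[title]

    # After: lru_cache keys on the argument values and keeps at most 128 results.
    @lru_cache(maxsize=128)
    def get_page(title: str) -> str:
        return fetch_page(title)

    print(get_page("Alanine"))  # first call invokes fetch_page
    print(get_page("Alanine"))  # second call is answered from the cache

The dict version grows without bound for the lifetime of the process; the decorated version is capped at 128 entries per function.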
diff --git a/copy_to_en/mdwikicx.py b/copy_to_en/mdwikicx.py
index 856d671b..7b4d9dd8 100644
--- a/copy_to_en/mdwikicx.py
+++ b/copy_to_en/mdwikicx.py
@@ -12,6 +12,7 @@
 import json
 import sys
 import re
+from functools import lru_cache
 from pathlib import Path
 from multiprocessing import Pool
 from apis import cat_cach
@@ -27,10 +28,6 @@
 
 Dir = Path(__file__).parent
 
-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()
 # {'RTT': 1, 'RTTCovid': 0, 'RTTHearing': 0, 'RTTOSH': 0, 'World Health Organization essential medicines': 0, 'WHRTT': 0, 'RTTILAE': 0, 'RTTDZ': 0}
 # print(mdwiki_cats)
@@ -66,15 +63,14 @@ def medwiki_cat_members(cat="Category:Mdwiki Translation Dashboard articles"):
     return cat_members
 
 
+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid
 
 
+@lru_cache(maxsize=128)
 def get_un_wb_tag(alltext, x):
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
@@ -83,13 +79,6 @@ def get_un_wb_tag(alltext, x):
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    #     unlinkedwikibase = m
-    #     break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase
@@ -147,10 +136,10 @@ def one_page(x):
     if new_title.find("/") != -1:
         new_title_all = f"Md:{x}/fulltext"
     # ---
-    alltext = text_cache.get(x)
+    alltext, _ = get_text_revid(x)
     # ---
     if alltext:
-        unlinked_tag = un_wb_tag_cache.get(x, "")
+        unlinked_tag = get_un_wb_tag(alltext, x)
         # ---
         alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
         titles[new_title_all] = alltext
diff --git a/copy_to_en/medwiki.py b/copy_to_en/medwiki.py
index fd2c0f61..ee6843cb 100644
--- a/copy_to_en/medwiki.py
+++ b/copy_to_en/medwiki.py
@@ -14,6 +14,7 @@
 import json
 import sys
 import re
+from functools import lru_cache
 from pathlib import Path
 from multiprocessing import Pool
 from apis import cat_cach
@@ -29,10 +30,6 @@
 
 Dir = Path(__file__).parent
 
-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()
 # {'RTT': 1, 'RTTCovid': 0, 'RTTHearing': 0, 'RTTOSH': 0, 'World Health Organization essential medicines': 0, 'WHRTT': 0, 'RTTILAE': 0, 'RTTDZ': 0}
 # print(mdwiki_cats)
@@ -115,15 +112,14 @@ def medwiki_cat_members(cat="Category:Mdwiki Translation Dashboard articles"):
     return cat_members
 
 
+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid
 
 
+@lru_cache(maxsize=128)
 def get_un_wb_tag(alltext, x):
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
@@ -132,13 +128,6 @@ def get_un_wb_tag(alltext, x):
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    #     unlinkedwikibase = m
-    #     break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase
@@ -199,10 +188,10 @@ def one_page(x):
     if new_title.find("/") != -1:
         new_title_all = f"Md:{x}/fulltext"
     # ---
-    alltext = text_cache.get(x)
+    alltext, _ = get_text_revid(x)
     # ---
     if alltext:
-        unlinked_tag = un_wb_tag_cache.get(x, "")
+        unlinked_tag = get_un_wb_tag(alltext, x)
         # ---
         alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
         titles[new_title_all] = alltext
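One behavioural difference from the removed dicts is worth keeping in mind for long-running bots: each decorated function now carries its own bounded cache and exposes the standard cache_info() and cache_clear() helpers. A small sketch; the fetch below is stubbed out, so the numbers are illustrative only:

    from functools import lru_cache

    @lru_cache(maxsize=128)
    def get_text_revid(title: str):
        # Stubbed fetch; the real function calls mdwiki_api.GetPageText.
        return f"wikitext of {title}", 12345

    get_text_revid("Aspirin")
    get_text_revid("Aspirin")
    print(get_text_revid.cache_info())
    # CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)

    get_text_revid.cache_clear()  # e.g. before reprocessing the same titles
    print(get_text_revid.cache_info())
    # CacheInfo(hits=0, misses=0, maxsize=128, currsize=0)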
diff --git a/wprefs/bots/replace_except.py b/wprefs/bots/replace_except.py
index c33aea7f..675453b5 100644
--- a/wprefs/bots/replace_except.py
+++ b/wprefs/bots/replace_except.py
@@ -1,8 +1,7 @@
 import re
 import sys
 from contextlib import suppress
-
-_regex_cache = {}
+from functools import lru_cache
 
 NESTED_TEMPLATE_REGEX = re.compile(
     r"""
@@ -102,54 +101,44 @@ def _tag_pattern(tag_name: str) -> str:
     return (r'<{0}(?:>|\s+[^>]*(?<!/)>)'  # start tag
             r'[\s\S]*?'  # contents
             r'</{0}\s*>').format(ignore_case(tag_name))  # end tag
 
 
-def _create_default_regexes() -> None:
-    """Fill (and possibly overwrite) _regex_cache with default regexes."""
-    _regex_cache.update(
-        {
-            # categories
-            'category': (r'\[\[ *(?:%s)\s*:.*?\]\]', lambda site: '|'.join(site.namespaces[14])),
-            'comment': re.compile(r'<!--[\s\S]*?-->'),
-            # files
-            'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])),
-            # section headers
-            'header': re.compile(r'(?:(?<=\n)|\A)(?:<!--[\s\S]*?-->)*' r'=(?:[^\n]|<!--[\s\S]*?-->)+=' r' *(?:<!--[\s\S]*?--> *)*(?=\n|\Z)'),
-            # external links
-            'hyperlink': compileLinkR(),
-            # also finds links to foreign sites with preleading ":"
-            'interwiki': (r'\[\[:?(%s)\s?:[^\]]*\]\]\s*', lambda site: '|'.join(ignore_case(i) for i in site.validLanguageLinks() + list(site.family.obsolete.keys()))),
-            # Module invocations (currently only Lua)
-            'invoke': (r'\{\{\s*\#(?:%s):[\s\S]*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('invoke'))),
-            # this matches internal wikilinks, but also interwiki, categories, and
-            # images.
-            'link': re.compile(r'\[\[[^\]|]*(\|[^\]]*)?\]\]'),
-            # pagelist tag (used in Proofread extension).
-            'pagelist': re.compile(r'<{}[\s\S]*?/>'.format(ignore_case('pagelist'))),
-            # Wikibase property inclusions
-            'property': (r'\{\{\s*\#(?:%s):\s*[Pp]\d+.*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('property'))),
-            # lines that start with a colon or more will be indented
-            'startcolon': re.compile(r'(?:(?<=\n)|\A):(.*?)(?=\n|\Z)'),
-            # lines that start with a space are shown in a monospace font and
-            # have whitespace preserved.
-            'startspace': re.compile(r'(?:(?<=\n)|\A) (.*?)(?=\n|\Z)'),
-            # tables often have whitespace that is used to improve wiki
-            # source code readability.
-            # TODO: handle nested tables.
-            'table': re.compile(r'(?:(?<=\n)|\A){\|[\S\s]*?\n\|}|%s' % _tag_pattern('table')),
-            'template': NESTED_TEMPLATE_REGEX,
-        }
-    )
-
-
+# Default regex patterns - static patterns that don't require site-specific data
+_DEFAULT_REGEXES = {
+    'comment': re.compile(r'<!--[\s\S]*?-->'),
+    'header': re.compile(r'(?:(?<=\n)|\A)(?:<!--[\s\S]*?-->)*' r'=(?:[^\n]|<!--[\s\S]*?-->)+=' r' *(?:<!--[\s\S]*?--> *)*(?=\n|\Z)'),
+    'hyperlink': compileLinkR(),
+    'link': re.compile(r'\[\[[^\]|]*(\|[^\]]*)?\]\]'),
+    'pagelist': re.compile(r'<{}[\s\S]*?/>'.format(ignore_case('pagelist'))),
+    'startcolon': re.compile(r'(?:(?<=\n)|\A):(.*?)(?=\n|\Z)'),
+    'startspace': re.compile(r'(?:(?<=\n)|\A) (.*?)(?=\n|\Z)'),
+    'table': re.compile(r'(?:(?<=\n)|\A){\|[\S\s]*?\n\|}|%s' % _tag_pattern('table')),
+    'template': NESTED_TEMPLATE_REGEX,
+}
+
+# Patterns that require site-specific data (pattern, site_func)
+_SITE_SPECIFIC_PATTERNS = {
+    'category': (r'\[\[ *(?:%s)\s*:.*?\]\]', lambda site: '|'.join(site.namespaces[14])),
+    'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])),
+    'interwiki': (r'\[\[:?(%s)\s?:[^\]]*\]\]\s*', lambda site: '|'.join(ignore_case(i) for i in site.validLanguageLinks() + list(site.family.obsolete.keys()))),
+    'invoke': (r'\{\{\s*\#(?:%s):[\s\S]*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('invoke'))),
+    'property': (r'\{\{\s*\#(?:%s):\s*[Pp]\d+.*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('property'))),
+}
+
+
+@lru_cache(maxsize=128)
 def _tag_regex(tag_name: str):
     """Return a compiled tag regex for the given tag name."""
     return re.compile(_tag_pattern(tag_name))
 
 
+@lru_cache(maxsize=128)
+def _compile_site_regex(exc: str, site_repr: str, site):
+    """Compile a site-specific regex pattern."""
+    re_text, re_var = _SITE_SPECIFIC_PATTERNS[exc]
+    return re.compile(re_text % re_var(site), re.VERBOSE)
+
+
 def _get_regexes(keys, site):
     """Fetch compiled regexes."""
-    if not _regex_cache:
-        _create_default_regexes()
-
     result = []
 
     for exc in keys:
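The hunk above splits the old single _regex_cache into two tables: patterns that can be compiled once at import time, and parameterised patterns whose final text depends on the site. The sketch below shows the same idea in miniature with made-up pattern names; the real module additionally passes the site object through to the compile step:

    import re
    from functools import lru_cache

    # Static patterns: compiled once at import time.
    STATIC = {
        'comment': re.compile(r'<!--[\s\S]*?-->'),
    }

    # Parameterised patterns: the namespace alternation is only known at run time.
    TEMPLATES = {
        'category': r'\[\[\s*(?:%s)\s*:.*?\]\]',
    }

    @lru_cache(maxsize=64)
    def compiled(name: str, names: str) -> re.Pattern:
        # The (name, names) tuple is the cache key, so each distinct
        # alternation string is compiled exactly once.
        return re.compile(TEMPLATES[name] % names)

    rx = compiled('category', 'Category|Cat')
    print(bool(rx.match('[[Category:Chemistry]]')))    # True
    print(compiled('category', 'Category|Cat') is rx)  # True - cache hit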
@@ -158,25 +147,19 @@ def _get_regexes(keys, site):
             result.append(exc)
             continue
 
-        # assume the string is a reference to a standard regex above,
-        # which may not yet have a site specific re compiled.
-        if exc in _regex_cache:
-            if isinstance(_regex_cache[exc], tuple):
-                if not site and exc in ('interwiki', 'property', 'invoke', 'category', 'file'):
-                    raise ValueError(f"Site cannot be None for the '{exc}' regex")
-
-                if (exc, site) not in _regex_cache:
-                    re_text, re_var = _regex_cache[exc]
-                    _regex_cache[(exc, site)] = re.compile(re_text % re_var(site), re.VERBOSE)
-
-                result.append(_regex_cache[(exc, site)])
-            else:
-                result.append(_regex_cache[exc])
+        # Check if it's a default static regex
+        if exc in _DEFAULT_REGEXES:
+            result.append(_DEFAULT_REGEXES[exc])
+        # Check if it's a site-specific pattern
+        elif exc in _SITE_SPECIFIC_PATTERNS:
+            if not site:
+                raise ValueError(f"Site cannot be None for the '{exc}' regex")
+            # Use repr(site) as a cache key since site objects may not be hashable
+            result.append(_compile_site_regex(exc, repr(site), site))
         else:
             # nowiki, noinclude, includeonly, timeline, math and other
-            # extensions
-            _regex_cache[exc] = _tag_regex(exc)
-            result.append(_regex_cache[exc])
+            # extensions - use cached tag regex
+            result.append(_tag_regex(exc))
 
         # handle aliases
         if exc == 'source':
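A caveat about this version of _compile_site_regex: lru_cache builds its key from every positional argument, so passing repr(site) does not remove the need for the site object itself to be hashable, because the site is still an argument of the cached function. Any unhashable argument makes the call fail outright, as the sketch below shows with made-up names; the follow-up patch addresses this by dropping the site parameter entirely:

    from functools import lru_cache

    @lru_cache(maxsize=32)
    def describe(name: str, payload) -> str:
        # Every positional argument becomes part of the cache key.
        return f"{name}:{payload}"

    print(describe('interwiki', 'en|de|fr'))   # fine: both arguments are hashable

    try:
        describe('interwiki', ['en', 'de'])    # a list cannot be hashed
    except TypeError as err:
        print('rejected:', err)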
From c474095def419059abd129b4b85ded2823b47af9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Dec 2025 10:52:38 +0000
Subject: [PATCH 3/3] Fix code review issues - improve lru_cache usage

Co-authored-by: MrIbrahem <26301308+MrIbrahem@users.noreply.github.com>
---
 copy_text/text_bot.py         |  8 ++++++--
 copy_to_en/mdwikicx.py        | 10 +++++++---
 copy_to_en/medwiki.py         | 10 +++++++---
 wprefs/bots/replace_except.py | 14 ++++++++------
 4 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/copy_text/text_bot.py b/copy_text/text_bot.py
index e82fdeb3..25b5a21f 100644
--- a/copy_text/text_bot.py
+++ b/copy_text/text_bot.py
@@ -45,7 +45,11 @@ def get_text_revid(x):
 
 
 @lru_cache(maxsize=128)
-def get_un_wb_tag(alltext, x):
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
@@ -85,7 +89,7 @@ def get_text(x):
     # ---
     page_cats = get_cats(alltext)
     # ---
-    unlinkedwikibase = get_un_wb_tag(alltext, x)
+    unlinkedwikibase = get_un_wb_tag(x)
     # ---
     first = alltext.split("==")[0].strip()
     # ---
diff --git a/copy_to_en/mdwikicx.py b/copy_to_en/mdwikicx.py
index 7b4d9dd8..11a52b62 100644
--- a/copy_to_en/mdwikicx.py
+++ b/copy_to_en/mdwikicx.py
@@ -71,7 +71,11 @@ def get_text_revid(x):
 
 
 @lru_cache(maxsize=128)
-def get_un_wb_tag(alltext, x):
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
@@ -104,7 +108,7 @@ def get_text(x):
     # ---
     page_cats = get_cats(alltext)
     # ---
-    unlinkedwikibase = get_un_wb_tag(alltext, x)
+    unlinkedwikibase = get_un_wb_tag(x)
     # ---
     first = alltext.split("==")[0].strip()
     # ---
@@ -139,7 +143,7 @@ def one_page(x):
     alltext, _ = get_text_revid(x)
     # ---
     if alltext:
-        unlinked_tag = get_un_wb_tag(alltext, x)
+        unlinked_tag = get_un_wb_tag(x)
         # ---
         alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
         titles[new_title_all] = alltext
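After this change both lookups are keyed by the page title alone: get_un_wb_tag re-uses the already-cached get_text_revid instead of receiving the full article text as an argument, so large wikitext strings never become part of a cache key. A self-contained sketch of the layering; the fetch is stubbed, and cache_info confirms it runs only once:

    import re
    from functools import lru_cache

    @lru_cache(maxsize=128)
    def get_text_revid(title: str):
        # Stubbed fetch returning (wikitext, revid).
        return f"Lead text of {title}. {{{{#unlinkedwikibase:id=Q42}}}}", 99

    @lru_cache(maxsize=128)
    def get_un_wb_tag(title: str) -> str:
        # Cache key is the short title; the large wikitext stays out of the key.
        alltext, _ = get_text_revid(title)
        if not alltext:
            return ""
        match = re.search(r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}", alltext)
        return match.group(0) if match else ""

    print(get_un_wb_tag("Alanine"))            # {{#unlinkedwikibase:id=Q42}}
    print(get_text_revid.cache_info().misses)  # 1 - the fetch ran only once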
diff --git a/copy_to_en/medwiki.py b/copy_to_en/medwiki.py
index ee6843cb..fc4d13d7 100644
--- a/copy_to_en/medwiki.py
+++ b/copy_to_en/medwiki.py
@@ -120,7 +120,11 @@ def get_text_revid(x):
 
 
 @lru_cache(maxsize=128)
-def get_un_wb_tag(alltext, x):
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
@@ -156,7 +160,7 @@ def get_text(x):
     # ---
     page_cats = get_cats(alltext)
     # ---
-    unlinkedwikibase = get_un_wb_tag(alltext, x)
+    unlinkedwikibase = get_un_wb_tag(x)
     # ---
     first = alltext.split("==")[0].strip()
     # ---
@@ -191,7 +195,7 @@ def one_page(x):
     alltext, _ = get_text_revid(x)
     # ---
     if alltext:
-        unlinked_tag = get_un_wb_tag(alltext, x)
+        unlinked_tag = get_un_wb_tag(x)
         # ---
         alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
         titles[new_title_all] = alltext
diff --git a/wprefs/bots/replace_except.py b/wprefs/bots/replace_except.py
index 675453b5..0e7bd090 100644
--- a/wprefs/bots/replace_except.py
+++ b/wprefs/bots/replace_except.py
@@ -131,10 +131,10 @@ def _tag_regex(tag_name: str):
 
 
 @lru_cache(maxsize=128)
-def _compile_site_regex(exc: str, site_repr: str, site):
-    """Compile a site-specific regex pattern."""
-    re_text, re_var = _SITE_SPECIFIC_PATTERNS[exc]
-    return re.compile(re_text % re_var(site), re.VERBOSE)
+def _compile_site_regex(exc: str, site_data: str):
+    """Compile a site-specific regex pattern with the given site data."""
+    re_text, _ = _SITE_SPECIFIC_PATTERNS[exc]
+    return re.compile(re_text % site_data, re.VERBOSE)
 
 
 def _get_regexes(keys, site):
@@ -154,8 +154,10 @@ def _get_regexes(keys, site):
         elif exc in _SITE_SPECIFIC_PATTERNS:
             if not site:
                 raise ValueError(f"Site cannot be None for the '{exc}' regex")
-            # Use repr(site) as a cache key since site objects may not be hashable
-            result.append(_compile_site_regex(exc, repr(site), site))
+            # Extract site-specific data and use it as cache key
+            _, site_func = _SITE_SPECIFIC_PATTERNS[exc]
+            site_data = site_func(site)
+            result.append(_compile_site_regex(exc, site_data))
         else:
             # nowiki, noinclude, includeonly, timeline, math and other
             # extensions - use cached tag regex
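With this final change the cache key becomes the pre-computed site_data string rather than the site object, so the memoised function only ever sees hashable, stable arguments. A sketch of the same shape, with a hypothetical FakeSite class standing in for the real site object used by the bot framework:

    import re
    from functools import lru_cache

    class FakeSite:
        # Hypothetical stand-in for the site object; only a derived string
        # is ever used as a cache key.
        def __init__(self, category_names):
            self.category_names = category_names

    @lru_cache(maxsize=128)
    def compile_site_regex(pattern: str, site_data: str) -> re.Pattern:
        # Both key parts are plain strings, so the key is always hashable.
        return re.compile(pattern % site_data, re.VERBOSE)

    def category_regex(site: FakeSite) -> re.Pattern:
        site_data = '|'.join(site.category_names)  # derive the key outside the cache
        return compile_site_regex(r'\[\[\s*(?:%s)\s*:.*?\]\]', site_data)

    site = FakeSite(['Category', 'Cat'])
    print(bool(category_regex(site).match('[[Category:Chemistry]]')))             # True
    print(category_regex(site) is category_regex(FakeSite(['Category', 'Cat'])))  # True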