From b9a07428b0291949ab2a511abcf5e58cbb434f1b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Dec 2025 10:41:35 +0000
Subject: [PATCH 1/3] Initial plan


From 84257d601947478c780f5c30d4e4abae32316d20 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Dec 2025 10:49:15 +0000
Subject: [PATCH 2/3] Replace local cache dictionaries with functools lru_cache

Co-authored-by: MrIbrahem <26301308+MrIbrahem@users.noreply.github.com>
---
 copy_text/text_bot.py         |  17 +-----
 copy_to_en/mdwikicx.py        |  21 ++-----
 copy_to_en/medwiki.py         |  21 ++-----
 wprefs/bots/replace_except.py | 103 ++++++++++++++--------------------
 4 files changed, 56 insertions(+), 106 deletions(-)

diff --git a/copy_text/text_bot.py b/copy_text/text_bot.py
index 846d6146..e82fdeb3 100644
--- a/copy_text/text_bot.py
+++ b/copy_text/text_bot.py
@@ -3,16 +3,13 @@
 """
 
 import re
+from functools import lru_cache
 
 from apis import mdwiki_api
 from copy_to_en.bots import text_changes  # text = text_changes.work(text)
 from copy_to_en.bots.ref import fix_ref  # text = fix_ref(first, alltext)
 from mdapi_sql import sql_for_mdwiki
 
-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()
 
 full_translate = sql_for_mdwiki.select_md_sql("select DISTINCT tt_title from translate_type where tt_full = 1;", return_dict=True)
@@ -40,15 +37,14 @@ def get_cats(alltext):
     return cats_text
 
 
+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid
 
 
+@lru_cache(maxsize=128)
 def get_un_wb_tag(alltext, x):
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
@@ -57,13 +53,6 @@ def get_un_wb_tag(alltext, x):
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    #     unlinkedwikibase = m
-    #     break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase
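The change above is the pattern the whole series applies: a hand-maintained module-level dict cache is replaced by functools.lru_cache, which memoises on the argument values and evicts the least-recently-used entry once maxsize results are stored. A minimal, self-contained sketch of the before/after; fetch_page here is a stand-in for the real mdwiki_api.GetPageText call, not part of the patch:

    from functools import lru_cache

    def fetch_page(title: str) -> str:
        # Stand-in for the expensive call (mdwiki_api.GetPageText in the patch).
        return f"wikitext of {title}"

    # Before: a module-level dict, filled by hand and never evicted.
    _page_cache: dict = {}

    def get_page_dict(title: str) -> str:
        if title not in _page_cache:
            _page_cache[title] = fetch_page(title)
        return _page_cache[title]

    # After: lru_cache keys on the argument values and keeps at most 128 results.
    @lru_cache(maxsize=128)
    def get_page(title: str) -> str:
        return fetch_page(title)

    print(get_page("Alanine"))  # first call invokes fetch_page
    print(get_page("Alanine"))  # second call is answered from the cache

The dict version grows without bound for the lifetime of the process; the decorated version is capped at 128 entries per function.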
diff --git a/copy_to_en/mdwikicx.py b/copy_to_en/mdwikicx.py
index 856d671b..7b4d9dd8 100644
--- a/copy_to_en/mdwikicx.py
+++ b/copy_to_en/mdwikicx.py
@@ -12,6 +12,7 @@
 import json
 import sys
 import re
+from functools import lru_cache
 from pathlib import Path
 from multiprocessing import Pool
 from apis import cat_cach
@@ -27,10 +28,6 @@
 
 Dir = Path(__file__).parent
 
-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()
 # {'RTT': 1, 'RTTCovid': 0, 'RTTHearing': 0, 'RTTOSH': 0, 'World Health Organization essential medicines': 0, 'WHRTT': 0, 'RTTILAE': 0, 'RTTDZ': 0}
 # print(mdwiki_cats)
@@ -66,15 +63,14 @@ def medwiki_cat_members(cat="Category:Mdwiki Translation Dashboard articles"):
     return cat_members
 
 
+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid
 
 
+@lru_cache(maxsize=128)
 def get_un_wb_tag(alltext, x):
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
@@ -83,13 +79,6 @@ def get_un_wb_tag(alltext, x):
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    #     unlinkedwikibase = m
-    #     break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase
@@ -147,10 +136,10 @@ def one_page(x):
     if new_title.find("/") != -1:
         new_title_all = f"Md:{x}/fulltext"
     # ---
-    alltext = text_cache.get(x)
+    alltext, _ = get_text_revid(x)
     # ---
     if alltext:
-        unlinked_tag = un_wb_tag_cache.get(x, "")
+        unlinked_tag = get_un_wb_tag(alltext, x)
         # ---
         alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
         titles[new_title_all] = alltext
diff --git a/copy_to_en/medwiki.py b/copy_to_en/medwiki.py
index fd2c0f61..ee6843cb 100644
--- a/copy_to_en/medwiki.py
+++ b/copy_to_en/medwiki.py
@@ -14,6 +14,7 @@
 import json
 import sys
 import re
+from functools import lru_cache
 from pathlib import Path
 from multiprocessing import Pool
 from apis import cat_cach
@@ -29,10 +30,6 @@
 
 Dir = Path(__file__).parent
 
-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()
 # {'RTT': 1, 'RTTCovid': 0, 'RTTHearing': 0, 'RTTOSH': 0, 'World Health Organization essential medicines': 0, 'WHRTT': 0, 'RTTILAE': 0, 'RTTDZ': 0}
 # print(mdwiki_cats)
@@ -115,15 +112,14 @@ def medwiki_cat_members(cat="Category:Mdwiki Translation Dashboard articles"):
     return cat_members
 
 
+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid
 
 
+@lru_cache(maxsize=128)
 def get_un_wb_tag(alltext, x):
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
@@ -132,13 +128,6 @@ def get_un_wb_tag(alltext, x):
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    #     unlinkedwikibase = m
-    #     break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase
@@ -199,10 +188,10 @@ def one_page(x):
     if new_title.find("/") != -1:
         new_title_all = f"Md:{x}/fulltext"
     # ---
-    alltext = text_cache.get(x)
+    alltext, _ = get_text_revid(x)
     # ---
     if alltext:
-        unlinked_tag = un_wb_tag_cache.get(x, "")
+        unlinked_tag = get_un_wb_tag(alltext, x)
         # ---
         alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
         titles[new_title_all] = alltext
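One behavioural difference from the removed dicts is worth keeping in mind for long-running bots: each decorated function now carries its own bounded cache and exposes the standard cache_info() and cache_clear() helpers. A small sketch; the fetch below is stubbed out, so the numbers are illustrative only:

    from functools import lru_cache

    @lru_cache(maxsize=128)
    def get_text_revid(title: str):
        # Stubbed fetch; the real function calls mdwiki_api.GetPageText.
        return f"wikitext of {title}", 12345

    get_text_revid("Aspirin")
    get_text_revid("Aspirin")
    print(get_text_revid.cache_info())
    # CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)

    get_text_revid.cache_clear()  # e.g. before reprocessing the same titles
    print(get_text_revid.cache_info())
    # CacheInfo(hits=0, misses=0, maxsize=128, currsize=0)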
diff --git a/wprefs/bots/replace_except.py b/wprefs/bots/replace_except.py
index c33aea7f..675453b5 100644
--- a/wprefs/bots/replace_except.py
+++ b/wprefs/bots/replace_except.py
@@ -1,8 +1,7 @@
 import re
 import sys
 from contextlib import suppress
-
-_regex_cache = {}
+from functools import lru_cache
 
 NESTED_TEMPLATE_REGEX = re.compile(
     r"""
@@ -102,54 +101,44 @@ def _tag_pattern(tag_name: str) -> str:
     return (r'<{0}(?:>|\s+[^>]*(?<!/)>)'  # start tag
             r'[\s\S]*?'  # contents
             r'</{0}\s*>').format(ignore_case(tag_name))  # end tag
 
 
-def _create_default_regexes() -> None:
-    """Fill (and possibly overwrite) _regex_cache with default regexes."""
-    _regex_cache.update(
-        {
-            # categories
-            'category': (r'\[\[ *(?:%s)\s*:.*?\]\]', lambda site: '|'.join(site.namespaces[14])),
-            'comment': re.compile(r'<!--[\s\S]*?-->'),
-            # files
-            'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])),
-            # section headers
-            'header': re.compile(r'(?:(?<=\n)|\A)(?:<!--[\s\S]*?-->)*' r'=(?:[^\n]|<!--[\s\S]*?-->)+=' r' *(?:<!--[\s\S]*?--> *)*(?=\n|\Z)'),
-            # external links
-            'hyperlink': compileLinkR(),
-            # also finds links to foreign sites with preleading ":"
-            'interwiki': (r'\[\[:?(%s)\s?:[^\]]*\]\]\s*', lambda site: '|'.join(ignore_case(i) for i in site.validLanguageLinks() + list(site.family.obsolete.keys()))),
-            # Module invocations (currently only Lua)
-            'invoke': (r'\{\{\s*\#(?:%s):[\s\S]*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('invoke'))),
-            # this matches internal wikilinks, but also interwiki, categories, and
-            # images.
-            'link': re.compile(r'\[\[[^\]|]*(\|[^\]]*)?\]\]'),
-            # pagelist tag (used in Proofread extension).
-            'pagelist': re.compile(r'<{}[\s\S]*?/>'.format(ignore_case('pagelist'))),
-            # Wikibase property inclusions
-            'property': (r'\{\{\s*\#(?:%s):\s*[Pp]\d+.*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('property'))),
-            # lines that start with a colon or more will be indented
-            'startcolon': re.compile(r'(?:(?<=\n)|\A):(.*?)(?=\n|\Z)'),
-            # lines that start with a space are shown in a monospace font and
-            # have whitespace preserved.
-            'startspace': re.compile(r'(?:(?<=\n)|\A) (.*?)(?=\n|\Z)'),
-            # tables often have whitespace that is used to improve wiki
-            # source code readability.
-            # TODO: handle nested tables.
-            'table': re.compile(r'(?:(?<=\n)|\A){\|[\S\s]*?\n\|}|%s' % _tag_pattern('table')),
-            'template': NESTED_TEMPLATE_REGEX,
-        }
-    )
-
-
+# Default regex patterns - static patterns that don't require site-specific data
+_DEFAULT_REGEXES = {
+    'comment': re.compile(r'<!--[\s\S]*?-->'),
+    'header': re.compile(r'(?:(?<=\n)|\A)(?:<!--[\s\S]*?-->)*' r'=(?:[^\n]|<!--[\s\S]*?-->)+=' r' *(?:<!--[\s\S]*?--> *)*(?=\n|\Z)'),
+    'hyperlink': compileLinkR(),
+    'link': re.compile(r'\[\[[^\]|]*(\|[^\]]*)?\]\]'),
+    'pagelist': re.compile(r'<{}[\s\S]*?/>'.format(ignore_case('pagelist'))),
+    'startcolon': re.compile(r'(?:(?<=\n)|\A):(.*?)(?=\n|\Z)'),
+    'startspace': re.compile(r'(?:(?<=\n)|\A) (.*?)(?=\n|\Z)'),
+    'table': re.compile(r'(?:(?<=\n)|\A){\|[\S\s]*?\n\|}|%s' % _tag_pattern('table')),
+    'template': NESTED_TEMPLATE_REGEX,
+}
+
+# Patterns that require site-specific data (pattern, site_func)
+_SITE_SPECIFIC_PATTERNS = {
+    'category': (r'\[\[ *(?:%s)\s*:.*?\]\]', lambda site: '|'.join(site.namespaces[14])),
+    'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])),
+    'interwiki': (r'\[\[:?(%s)\s?:[^\]]*\]\]\s*', lambda site: '|'.join(ignore_case(i) for i in site.validLanguageLinks() + list(site.family.obsolete.keys()))),
+    'invoke': (r'\{\{\s*\#(?:%s):[\s\S]*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('invoke'))),
+    'property': (r'\{\{\s*\#(?:%s):\s*[Pp]\d+.*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('property'))),
+}
+
+
+@lru_cache(maxsize=128)
 def _tag_regex(tag_name: str):
     """Return a compiled tag regex for the given tag name."""
     return re.compile(_tag_pattern(tag_name))
 
 
+@lru_cache(maxsize=128)
+def _compile_site_regex(exc: str, site_repr: str, site):
+    """Compile a site-specific regex pattern."""
+    re_text, re_var = _SITE_SPECIFIC_PATTERNS[exc]
+    return re.compile(re_text % re_var(site), re.VERBOSE)
+
+
 def _get_regexes(keys, site):
     """Fetch compiled regexes."""
-    if not _regex_cache:
-        _create_default_regexes()
-
     result = []
 
     for exc in keys:
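The hunk above splits the old single _regex_cache into two tables: patterns that can be compiled once at import time, and parameterised patterns whose final text depends on the site. The sketch below shows the same idea in miniature with made-up pattern names; the real module additionally passes the site object through to the compile step:

    import re
    from functools import lru_cache

    # Static patterns: compiled once at import time.
    STATIC = {
        'comment': re.compile(r'<!--[\s\S]*?-->'),
    }

    # Parameterised patterns: the namespace alternation is only known at run time.
    TEMPLATES = {
        'category': r'\[\[\s*(?:%s)\s*:.*?\]\]',
    }

    @lru_cache(maxsize=64)
    def compiled(name: str, names: str) -> re.Pattern:
        # The (name, names) tuple is the cache key, so each distinct
        # alternation string is compiled exactly once.
        return re.compile(TEMPLATES[name] % names)

    rx = compiled('category', 'Category|Cat')
    print(bool(rx.match('[[Category:Chemistry]]')))    # True
    print(compiled('category', 'Category|Cat') is rx)  # True - cache hit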
@@ -158,25 +147,19 @@ def _get_regexes(keys, site):
             result.append(exc)
             continue
 
-        # assume the string is a reference to a standard regex above,
-        # which may not yet have a site specific re compiled.
-        if exc in _regex_cache:
-            if isinstance(_regex_cache[exc], tuple):
-                if not site and exc in ('interwiki', 'property', 'invoke', 'category', 'file'):
-                    raise ValueError(f"Site cannot be None for the '{exc}' regex")
-
-                if (exc, site) not in _regex_cache:
-                    re_text, re_var = _regex_cache[exc]
-                    _regex_cache[(exc, site)] = re.compile(re_text % re_var(site), re.VERBOSE)
-
-                result.append(_regex_cache[(exc, site)])
-            else:
-                result.append(_regex_cache[exc])
+        # Check if it's a default static regex
+        if exc in _DEFAULT_REGEXES:
+            result.append(_DEFAULT_REGEXES[exc])
+        # Check if it's a site-specific pattern
+        elif exc in _SITE_SPECIFIC_PATTERNS:
+            if not site:
+                raise ValueError(f"Site cannot be None for the '{exc}' regex")
+            # Use repr(site) as a cache key since site objects may not be hashable
+            result.append(_compile_site_regex(exc, repr(site), site))
         else:
             # nowiki, noinclude, includeonly, timeline, math and other
-            # extensions
-            _regex_cache[exc] = _tag_regex(exc)
-            result.append(_regex_cache[exc])
+            # extensions - use cached tag regex
+            result.append(_tag_regex(exc))
 
         # handle aliases
         if exc == 'source':
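A caveat about this version of _compile_site_regex: lru_cache builds its key from every positional argument, so passing repr(site) does not remove the need for the site object itself to be hashable, because the site is still an argument of the cached function. Any unhashable argument makes the call fail outright, as the sketch below shows with made-up names; the follow-up patch addresses this by dropping the site parameter entirely:

    from functools import lru_cache

    @lru_cache(maxsize=32)
    def describe(name: str, payload) -> str:
        # Every positional argument becomes part of the cache key.
        return f"{name}:{payload}"

    print(describe('interwiki', 'en|de|fr'))   # fine: both arguments are hashable

    try:
        describe('interwiki', ['en', 'de'])    # a list cannot be hashed
    except TypeError as err:
        print('rejected:', err)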
From c474095def419059abd129b4b85ded2823b47af9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Dec 2025 10:52:38 +0000
Subject: [PATCH 3/3] Fix code review issues - improve lru_cache usage

Co-authored-by: MrIbrahem <26301308+MrIbrahem@users.noreply.github.com>
---
 copy_text/text_bot.py         |  8 ++++++--
 copy_to_en/mdwikicx.py        | 10 +++++++---
 copy_to_en/medwiki.py         | 10 +++++++---
 wprefs/bots/replace_except.py | 14 ++++++++------
 4 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/copy_text/text_bot.py b/copy_text/text_bot.py
index e82fdeb3..25b5a21f 100644
--- a/copy_text/text_bot.py
+++ b/copy_text/text_bot.py
@@ -45,7 +45,11 @@ def get_text_revid(x):
 
 
 @lru_cache(maxsize=128)
-def get_un_wb_tag(alltext, x):
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
@@ -85,7 +89,7 @@ def get_text(x):
     # ---
     page_cats = get_cats(alltext)
     # ---
-    unlinkedwikibase = get_un_wb_tag(alltext, x)
+    unlinkedwikibase = get_un_wb_tag(x)
     # ---
     first = alltext.split("==")[0].strip()
     # ---
diff --git a/copy_to_en/mdwikicx.py b/copy_to_en/mdwikicx.py
index 7b4d9dd8..11a52b62 100644
--- a/copy_to_en/mdwikicx.py
+++ b/copy_to_en/mdwikicx.py
@@ -71,7 +71,11 @@ def get_text_revid(x):
 
 
 @lru_cache(maxsize=128)
-def get_un_wb_tag(alltext, x):
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
@@ -104,7 +108,7 @@ def get_text(x):
     # ---
     page_cats = get_cats(alltext)
     # ---
-    unlinkedwikibase = get_un_wb_tag(alltext, x)
+    unlinkedwikibase = get_un_wb_tag(x)
     # ---
     first = alltext.split("==")[0].strip()
     # ---
@@ -139,7 +143,7 @@ def one_page(x):
     alltext, _ = get_text_revid(x)
     # ---
     if alltext:
-        unlinked_tag = get_un_wb_tag(alltext, x)
+        unlinked_tag = get_un_wb_tag(x)
         # ---
         alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
         titles[new_title_all] = alltext
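After this change both lookups are keyed by the page title alone: get_un_wb_tag re-uses the already-cached get_text_revid instead of receiving the full article text as an argument, so large wikitext strings never become part of a cache key. A self-contained sketch of the layering; the fetch is stubbed, and cache_info confirms it runs only once:

    import re
    from functools import lru_cache

    @lru_cache(maxsize=128)
    def get_text_revid(title: str):
        # Stubbed fetch returning (wikitext, revid).
        return f"Lead text of {title}. {{{{#unlinkedwikibase:id=Q42}}}}", 99

    @lru_cache(maxsize=128)
    def get_un_wb_tag(title: str) -> str:
        # Cache key is the short title; the large wikitext stays out of the key.
        alltext, _ = get_text_revid(title)
        if not alltext:
            return ""
        match = re.search(r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}", alltext)
        return match.group(0) if match else ""

    print(get_un_wb_tag("Alanine"))            # {{#unlinkedwikibase:id=Q42}}
    print(get_text_revid.cache_info().misses)  # 1 - the fetch ran only once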
diff --git a/copy_to_en/medwiki.py b/copy_to_en/medwiki.py
index ee6843cb..fc4d13d7 100644
--- a/copy_to_en/medwiki.py
+++ b/copy_to_en/medwiki.py
@@ -120,7 +120,11 @@ def get_text_revid(x):
 
 
 @lru_cache(maxsize=128)
-def get_un_wb_tag(alltext, x):
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
@@ -156,7 +160,7 @@ def get_text(x):
     # ---
     page_cats = get_cats(alltext)
     # ---
-    unlinkedwikibase = get_un_wb_tag(alltext, x)
+    unlinkedwikibase = get_un_wb_tag(x)
     # ---
     first = alltext.split("==")[0].strip()
     # ---
@@ -191,7 +195,7 @@ def one_page(x):
     alltext, _ = get_text_revid(x)
     # ---
     if alltext:
-        unlinked_tag = get_un_wb_tag(alltext, x)
+        unlinked_tag = get_un_wb_tag(x)
         # ---
         alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
         titles[new_title_all] = alltext
diff --git a/wprefs/bots/replace_except.py b/wprefs/bots/replace_except.py
index 675453b5..0e7bd090 100644
--- a/wprefs/bots/replace_except.py
+++ b/wprefs/bots/replace_except.py
@@ -131,10 +131,10 @@ def _tag_regex(tag_name: str):
 
 
 @lru_cache(maxsize=128)
-def _compile_site_regex(exc: str, site_repr: str, site):
-    """Compile a site-specific regex pattern."""
-    re_text, re_var = _SITE_SPECIFIC_PATTERNS[exc]
-    return re.compile(re_text % re_var(site), re.VERBOSE)
+def _compile_site_regex(exc: str, site_data: str):
+    """Compile a site-specific regex pattern with the given site data."""
+    re_text, _ = _SITE_SPECIFIC_PATTERNS[exc]
+    return re.compile(re_text % site_data, re.VERBOSE)
 
 
 def _get_regexes(keys, site):
@@ -154,8 +154,10 @@ def _get_regexes(keys, site):
         elif exc in _SITE_SPECIFIC_PATTERNS:
             if not site:
                 raise ValueError(f"Site cannot be None for the '{exc}' regex")
-            # Use repr(site) as a cache key since site objects may not be hashable
-            result.append(_compile_site_regex(exc, repr(site), site))
+            # Extract site-specific data and use it as cache key
+            _, site_func = _SITE_SPECIFIC_PATTERNS[exc]
+            site_data = site_func(site)
+            result.append(_compile_site_regex(exc, site_data))
         else:
             # nowiki, noinclude, includeonly, timeline, math and other
             # extensions - use cached tag regex
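With this final change the cache key becomes the pre-computed site_data string rather than the site object, so the memoised function only ever sees hashable, stable arguments. A sketch of the same shape, with a hypothetical FakeSite class standing in for the real site object used by the bot framework:

    import re
    from functools import lru_cache

    class FakeSite:
        # Hypothetical stand-in for the site object; only a derived string
        # is ever used as a cache key.
        def __init__(self, category_names):
            self.category_names = category_names

    @lru_cache(maxsize=128)
    def compile_site_regex(pattern: str, site_data: str) -> re.Pattern:
        # Both key parts are plain strings, so the key is always hashable.
        return re.compile(pattern % site_data, re.VERBOSE)

    def category_regex(site: FakeSite) -> re.Pattern:
        site_data = '|'.join(site.category_names)  # derive the key outside the cache
        return compile_site_regex(r'\[\[\s*(?:%s)\s*:.*?\]\]', site_data)

    site = FakeSite(['Category', 'Cat'])
    print(bool(category_regex(site).match('[[Category:Chemistry]]')))             # True
    print(category_regex(site) is category_regex(FakeSite(['Category', 'Cat'])))  # True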