25 changes: 9 additions & 16 deletions copy_text/text_bot.py
@@ -3,16 +3,13 @@

 """
 import re
+from functools import lru_cache
 from apis import mdwiki_api

 from copy_to_en.bots import text_changes # text = text_changes.work(text)
 from copy_to_en.bots.ref import fix_ref # text = fix_ref(first, alltext)
 from mdapi_sql import sql_for_mdwiki

-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()

 full_translate = sql_for_mdwiki.select_md_sql("select DISTINCT tt_title from translate_type where tt_full = 1;", return_dict=True)
@@ -40,30 +37,26 @@ def get_cats(alltext):
     return cats_text


+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid


-def get_un_wb_tag(alltext, x):
+@lru_cache(maxsize=128)
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
     match = re.search(pattern, alltext)
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    # unlinkedwikibase = m
-    # break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase

@@ -96,7 +89,7 @@ def get_text(x):
     # ---
     page_cats = get_cats(alltext)
     # ---
-    unlinkedwikibase = get_un_wb_tag(alltext, x)
+    unlinkedwikibase = get_un_wb_tag(x)
     # ---
     first = alltext.split("==")[0].strip()
     # ---
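The change in this file replaces the module-level text_cache / revid_cache / un_wb_tag_cache dicts with functools.lru_cache memoization. A minimal sketch of that pattern, with an illustrative fetch_page function and an in-memory stand-in for the mdwiki API (both hypothetical, not from this repository):

from functools import lru_cache

FAKE_API = {"Aspirin": ("== Uses ==\nAspirin is ...", 12345)}  # stand-in for mdwiki_api

@lru_cache(maxsize=128)
def fetch_page(title):
    # The expensive call happens only on a cache miss; the (text, revid) tuple
    # is memoized per title, replacing the old hand-rolled dict caches.
    return FAKE_API.get(title, ("", 0))

text, revid = fetch_page("Aspirin")   # miss: performs the lookup
text, revid = fetch_page("Aspirin")   # hit: served from the cache
print(fetch_page.cache_info())        # e.g. CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)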
29 changes: 11 additions & 18 deletions copy_to_en/mdwikicx.py
@@ -12,6 +12,7 @@
 import json
 import sys
 import re
+from functools import lru_cache
 from pathlib import Path
 from multiprocessing import Pool
 from apis import cat_cach
@@ -27,10 +28,6 @@

 Dir = Path(__file__).parent

-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()
 # {'RTT': 1, 'RTTCovid': 0, 'RTTHearing': 0, 'RTTOSH': 0, 'World Health Organization essential medicines': 0, 'WHRTT': 0, 'RTTILAE': 0, 'RTTDZ': 0}
 # print(mdwiki_cats)
@@ -66,30 +63,26 @@ def medwiki_cat_members(cat="Category:Mdwiki Translation Dashboard articles"):
     return cat_members


+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid


-def get_un_wb_tag(alltext, x):
+@lru_cache(maxsize=128)
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
     match = re.search(pattern, alltext)
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    # unlinkedwikibase = m
-    # break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase

@@ -115,7 +108,7 @@ def get_text(x):
     # ---
     page_cats = get_cats(alltext)
     # ---
-    unlinkedwikibase = get_un_wb_tag(alltext, x)
+    unlinkedwikibase = get_un_wb_tag(x)
     # ---
     first = alltext.split("==")[0].strip()
     # ---
@@ -147,10 +140,10 @@ def one_page(x):
     if new_title.find("/") != -1:
         new_title_all = f"Md:{x}/fulltext"
     # ---
-    alltext = text_cache.get(x)
+    alltext, _ = get_text_revid(x)
     # ---
     if alltext:
-        unlinked_tag = un_wb_tag_cache.get(x, "")
+        unlinked_tag = get_un_wb_tag(x)
         # ---
         alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
         titles[new_title_all] = alltext
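In one_page, the lookups into text_cache / un_wb_tag_cache are replaced by direct calls to the lru_cache-wrapped helpers, so a page already processed by get_text is not fetched a second time. A hedged sketch of that behavior, with a hypothetical call counter standing in for the API request:

from functools import lru_cache

api_calls = {"count": 0}

@lru_cache(maxsize=128)
def get_text_revid(x):
    api_calls["count"] += 1          # stands in for mdwiki_api.GetPageText(x, get_revid=True)
    return f"wikitext of {x}", 42

def get_text(x):
    alltext, _ = get_text_revid(x)   # first caller: cache miss
    return alltext

def one_page(x):
    alltext, _ = get_text_revid(x)   # later caller: cache hit, no second request
    return alltext

get_text("Aspirin")
one_page("Aspirin")
print(api_calls["count"])            # 1 - the page text was fetched only once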
29 changes: 11 additions & 18 deletions copy_to_en/medwiki.py
@@ -14,6 +14,7 @@
 import json
 import sys
 import re
+from functools import lru_cache
 from pathlib import Path
 from multiprocessing import Pool
 from apis import cat_cach
@@ -29,10 +30,6 @@

 Dir = Path(__file__).parent

-text_cache = {}
-revid_cache = {}
-un_wb_tag_cache = {}
-
 mdwiki_cats = sql_for_mdwiki.get_db_categories()
 # {'RTT': 1, 'RTTCovid': 0, 'RTTHearing': 0, 'RTTOSH': 0, 'World Health Organization essential medicines': 0, 'WHRTT': 0, 'RTTILAE': 0, 'RTTDZ': 0}
 # print(mdwiki_cats)
@@ -115,30 +112,26 @@ def medwiki_cat_members(cat="Category:Mdwiki Translation Dashboard articles"):
     return cat_members


+@lru_cache(maxsize=128)
 def get_text_revid(x):
     alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
     # ---
-    text_cache[x] = alltext
-    revid_cache[x] = revid
-    # ---
     return alltext, revid


-def get_un_wb_tag(alltext, x):
+@lru_cache(maxsize=128)
+def get_un_wb_tag(x):
+    # Get alltext from the cached get_text_revid function
+    alltext, _ = get_text_revid(x)
+    if not alltext:
+        return ""
     # search for text like {{#unlinkedwikibase:id=Q423364}}
     pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
     # ---
     match = re.search(pattern, alltext)
     # ---
     unlinkedwikibase = match.group(0) if match else ""
     # ---
-    # matches = re.findall(pattern, alltext)
-    # for m in matches:
-    # unlinkedwikibase = m
-    # break
-    # ---
-    un_wb_tag_cache[x] = unlinkedwikibase
-    # ---
     return unlinkedwikibase

Expand Down Expand Up @@ -167,7 +160,7 @@ def get_text(x):
# ---
page_cats = get_cats(alltext)
# ---
unlinkedwikibase = get_un_wb_tag(alltext, x)
unlinkedwikibase = get_un_wb_tag(x)
# ---
first = alltext.split("==")[0].strip()
# ---
@@ -199,10 +192,10 @@ def one_page(x):
     if new_title.find("/") != -1:
         new_title_all = f"Md:{x}/fulltext"
     # ---
-    alltext = text_cache.get(x)
+    alltext, _ = get_text_revid(x)
     # ---
     if alltext:
-        unlinked_tag = un_wb_tag_cache.get(x, "")
+        unlinked_tag = get_un_wb_tag(x)
         # ---
         alltext = alltext_changes.do_all_text(alltext, revid, unlinked_tag)
         titles[new_title_all] = alltext
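One operational note on the new caches (an assumption about long-running bot processes, not something stated in this diff): lru_cache holds results for the life of the process, so stale page text can be dropped explicitly between batches. A minimal sketch:

from functools import lru_cache

@lru_cache(maxsize=128)
def get_text_revid(x):
    # maxsize=128 bounds memory; least recently used titles are evicted
    # automatically once more than 128 distinct pages have been fetched.
    return f"wikitext of {x}", 42

get_text_revid("Aspirin")
print(get_text_revid.cache_info())   # misses=1, currsize=1

get_text_revid.cache_clear()         # e.g. between batches, to avoid reusing stale text
print(get_text_revid.cache_info())   # hits=0, misses=0, currsize=0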
105 changes: 45 additions & 60 deletions wprefs/bots/replace_except.py
@@ -1,8 +1,7 @@
 import re
 import sys
 from contextlib import suppress
-
-_regex_cache = {}
+from functools import lru_cache

 NESTED_TEMPLATE_REGEX = re.compile(
     r"""
@@ -102,54 +101,44 @@ def _tag_pattern(tag_name: str) -> str:
     return r'<{0}(?:>|\s+[^>]*(?<!/)>)' r'[\s\S]*?' r'</{0}\s*>'.format(ignore_case(tag_name))  # start tag # contents # end tag


-def _create_default_regexes() -> None:
-    """Fill (and possibly overwrite) _regex_cache with default regexes."""
-    _regex_cache.update(
-        {
-            # categories
-            'category': (r'\[\[ *(?:%s)\s*:.*?\]\]', lambda site: '|'.join(site.namespaces[14])),
-            'comment': re.compile(r'<!--[\s\S]*?-->'),
-            # files
-            'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])),
-            # section headers
-            'header': re.compile(r'(?:(?<=\n)|\A)(?:<!--[\s\S]*?-->)*' r'=(?:[^\n]|<!--[\s\S]*?-->)+=' r' *(?:<!--[\s\S]*?--> *)*(?=\n|\Z)'),
-            # external links
-            'hyperlink': compileLinkR(),
-            # also finds links to foreign sites with preleading ":"
-            'interwiki': (r'\[\[:?(%s)\s?:[^\]]*\]\]\s*', lambda site: '|'.join(ignore_case(i) for i in site.validLanguageLinks() + list(site.family.obsolete.keys()))),
-            # Module invocations (currently only Lua)
-            'invoke': (r'\{\{\s*\#(?:%s):[\s\S]*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('invoke'))),
-            # this matches internal wikilinks, but also interwiki, categories, and
-            # images.
-            'link': re.compile(r'\[\[[^\]|]*(\|[^\]]*)?\]\]'),
-            # pagelist tag (used in Proofread extension).
-            'pagelist': re.compile(r'<{}[\s\S]*?/>'.format(ignore_case('pagelist'))),
-            # Wikibase property inclusions
-            'property': (r'\{\{\s*\#(?:%s):\s*[Pp]\d+.*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('property'))),
-            # lines that start with a colon or more will be indented
-            'startcolon': re.compile(r'(?:(?<=\n)|\A):(.*?)(?=\n|\Z)'),
-            # lines that start with a space are shown in a monospace font and
-            # have whitespace preserved.
-            'startspace': re.compile(r'(?:(?<=\n)|\A) (.*?)(?=\n|\Z)'),
-            # tables often have whitespace that is used to improve wiki
-            # source code readability.
-            # TODO: handle nested tables.
-            'table': re.compile(r'(?:(?<=\n)|\A){\|[\S\s]*?\n\|}|%s' % _tag_pattern('table')),
-            'template': NESTED_TEMPLATE_REGEX,
-        }
-    )
-
-
+# Default regex patterns - static patterns that don't require site-specific data
+_DEFAULT_REGEXES = {
+    'comment': re.compile(r'<!--[\s\S]*?-->'),
+    'header': re.compile(r'(?:(?<=\n)|\A)(?:<!--[\s\S]*?-->)*' r'=(?:[^\n]|<!--[\s\S]*?-->)+=' r' *(?:<!--[\s\S]*?--> *)*(?=\n|\Z)'),
+    'hyperlink': compileLinkR(),
+    'link': re.compile(r'\[\[[^\]|]*(\|[^\]]*)?\]\]'),
+    'pagelist': re.compile(r'<{}[\s\S]*?/>'.format(ignore_case('pagelist'))),
+    'startcolon': re.compile(r'(?:(?<=\n)|\A):(.*?)(?=\n|\Z)'),
+    'startspace': re.compile(r'(?:(?<=\n)|\A) (.*?)(?=\n|\Z)'),
+    'table': re.compile(r'(?:(?<=\n)|\A){\|[\S\s]*?\n\|}|%s' % _tag_pattern('table')),
+    'template': NESTED_TEMPLATE_REGEX,
+}
+
+# Patterns that require site-specific data (pattern, site_func)
+_SITE_SPECIFIC_PATTERNS = {
+    'category': (r'\[\[ *(?:%s)\s*:.*?\]\]', lambda site: '|'.join(site.namespaces[14])),
+    'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])),
+    'interwiki': (r'\[\[:?(%s)\s?:[^\]]*\]\]\s*', lambda site: '|'.join(ignore_case(i) for i in site.validLanguageLinks() + list(site.family.obsolete.keys()))),
+    'invoke': (r'\{\{\s*\#(?:%s):[\s\S]*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('invoke'))),
+    'property': (r'\{\{\s*\#(?:%s):\s*[Pp]\d+.*?\}\}', lambda site: '|'.join(ignore_case(mw) for mw in site.getmagicwords('property'))),
+}
+
+
+@lru_cache(maxsize=128)
 def _tag_regex(tag_name: str):
     """Return a compiled tag regex for the given tag name."""
     return re.compile(_tag_pattern(tag_name))


+@lru_cache(maxsize=128)
+def _compile_site_regex(exc: str, site_data: str):
+    """Compile a site-specific regex pattern with the given site data."""
+    re_text, _ = _SITE_SPECIFIC_PATTERNS[exc]
+    return re.compile(re_text % site_data, re.VERBOSE)
+
+
 def _get_regexes(keys, site):
     """Fetch compiled regexes."""
-    if not _regex_cache:
-        _create_default_regexes()
-
     result = []

     for exc in keys:
@@ -158,25 +147,21 @@ def _get_regexes(keys, site):
             result.append(exc)
             continue

-        # assume the string is a reference to a standard regex above,
-        # which may not yet have a site specific re compiled.
-        if exc in _regex_cache:
-            if isinstance(_regex_cache[exc], tuple):
-                if not site and exc in ('interwiki', 'property', 'invoke', 'category', 'file'):
-                    raise ValueError(f"Site cannot be None for the '{exc}' regex")
-
-                if (exc, site) not in _regex_cache:
-                    re_text, re_var = _regex_cache[exc]
-                    _regex_cache[(exc, site)] = re.compile(re_text % re_var(site), re.VERBOSE)
-
-                result.append(_regex_cache[(exc, site)])
-            else:
-                result.append(_regex_cache[exc])
+        # Check if it's a default static regex
+        if exc in _DEFAULT_REGEXES:
+            result.append(_DEFAULT_REGEXES[exc])
+        # Check if it's a site-specific pattern
+        elif exc in _SITE_SPECIFIC_PATTERNS:
+            if not site:
+                raise ValueError(f"Site cannot be None for the '{exc}' regex")
+            # Extract site-specific data and use it as cache key
+            _, site_func = _SITE_SPECIFIC_PATTERNS[exc]
+            site_data = site_func(site)
+            result.append(_compile_site_regex(exc, site_data))
         else:
             # nowiki, noinclude, includeonly, timeline, math and other
-            # extensions
-            _regex_cache[exc] = _tag_regex(exc)
-            result.append(_regex_cache[exc])
+            # extensions - use cached tag regex
+            result.append(_tag_regex(exc))

         # handle aliases
         if exc == 'source':
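The restructuring above splits the old _regex_cache dict into a static table of precompiled patterns plus two lru_cache-backed helpers; the site-specific compile is keyed on the derived alias string (site_data) rather than on the site object itself. A self-contained sketch of that idea — SiteStub and the pattern table here are illustrative stand-ins, not Pywikibot APIs:

import re
from functools import lru_cache

# Patterns that need no site data are compiled once at import time.
_DEFAULT = {'comment': re.compile(r'<!--[\s\S]*?-->')}

# Site-dependent entries keep a pattern template plus a function that
# derives the variable part (namespace aliases) from the site.
_SITE_SPECIFIC = {'category': (r'\[\[ *(?:%s)\s*:.*?\]\]', lambda site: '|'.join(site.category_namespaces))}

@lru_cache(maxsize=128)
def _compile_site_regex(key, site_data):
    # Cached per (key, site_data); site_data is a plain string, so the cache key
    # stays hashable even if the site object itself is not.
    template, _ = _SITE_SPECIFIC[key]
    return re.compile(template % site_data)

def get_regex(key, site=None):
    if key in _DEFAULT:
        return _DEFAULT[key]
    if site is None:
        raise ValueError(f"Site cannot be None for the {key!r} regex")
    _, derive = _SITE_SPECIFIC[key]
    return _compile_site_regex(key, derive(site))

class SiteStub:
    category_namespaces = ('Category', 'CAT')

rx = get_regex('category', SiteStub())
print(bool(rx.search('[[Category:Medicine]]')))  # True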