diff --git a/README.md b/README.md
index 1298885..ca1f1ab 100644
--- a/README.md
+++ b/README.md
@@ -37,13 +37,21 @@
```bash
pip install -r requirements.txt
+ pip install -e .
```
4. **Run the Application**
+ ```bash
+ flask --app ./translatable_wikitext_converter/app.py run --port 5000
+ ```
+ Alternatively:
```bash
- python app.py
+ python -m translatable_wikitext_converter.app
```
-
+5. **Run the tests**
+ ```bash
+ python ./translatable_wikitext_converter/tests.py
+ ```
The application will start on http://127.0.0.1:5000.
## Usage
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..397eacb
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,12 @@
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "translatable-wikitext-converter"
+version = "0.1.0"
+description = "Convert wikitext into translatable wikitext"
+authors = [
+ { name = "Gopa Vasanth" }
+]
+requires-python = ">=3.9"
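With the editable install from the README (`pip install -e .`), the converter can also be used as a library. A minimal sketch (the import path follows the new tests below; the output shape assumes the tagging this patch describes):

```python
from translatable_wikitext_converter.app import tag_for_translation

# Converts raw wikitext into translatable wikitext: heading spacing is
# normalised and the result is wrapped in <translate> tags.
print(tag_for_translation("==HELLO=="))
# Expected: <translate>\n== HELLO ==\n</translate>
```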
diff --git a/tests.py b/tests.py
deleted file mode 100644
index 2112349..0000000
--- a/tests.py
+++ /dev/null
@@ -1,192 +0,0 @@
-import unittest
-from app import convert_to_translatable_wikitext, process_double_brackets
-
-class TestTranslatableWikitext(unittest.TestCase):
-
- def test_section_headers(self):
- self.assertEqual(
- convert_to_translatable_wikitext("==HELLO=="),
- "
\nhappy\n\n'),
- '\n
\n
This is a quote."), - "
" - ) - - def test_poem_tag(self): - self.assertEqual( - convert_to_translatable_wikitext("This is a quote.
some code for you."),
- "some code for you.Send a POST request with JSON data to use this API.
+Example:
+
+ curl -X POST https://translatetagger.toolforge.org/api/convert \\
+ -H "Content-Type: application/json" \\
+ -d '{"wikitext": "This is a test [[link|example]]"}'
+
+ """
+ elif request.method == 'POST':
+ data = request.get_json()
+ if not data or 'wikitext' not in data:
+ return jsonify({'error': 'Missing "wikitext" in JSON payload'}), 400
+
+ wikitext = data.get('wikitext', '')
+ converted_text = convert_to_translatable_wikitext(wikitext)
+
+ return jsonify({
+ 'original': wikitext,
+ 'converted': converted_text
+ })
+
+if __name__ == '__main__':
+ app.run(debug=True, port=5000)
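A minimal client sketch for the endpoint above (assumes the third-party `requests` package and the app running locally on the default port):

```python
import requests

# POST wikitext to the conversion endpoint; the JSON response echoes the
# original text and returns the converted, translation-tagged version.
resp = requests.post(
    "http://127.0.0.1:5000/api/convert",
    json={"wikitext": "This is a test [[link|example]]"},
)
resp.raise_for_status()
print(resp.json()["converted"])
```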
diff --git a/translatable_wikitext_converter/butta.py b/translatable_wikitext_converter/butta.py
new file mode 100644
index 0000000..6cd3995
--- /dev/null
+++ b/translatable_wikitext_converter/butta.py
@@ -0,0 +1,113 @@
+import re
+
+def fix_section_title_spacing_internal(title: str) -> str:
+ """
+ Detects a section title and ensures there is exactly one space
+ between the '=' characters and the title text.
+ """
+ # Pattern: (={2,}) [optional space] (.+?) [optional space] \1
+ pattern = re.compile(r'(={2,})\s*(.+?)\s*\1', re.DOTALL)
+
+ # Replacement: \1 [space] \2 [space] \1
+ return pattern.sub(r'\1 \2 \1', title)
+
+
+
+# --- Main Function to Fix Wiki Page Spacing ---
+
+def fix_wiki_page_spacing(wiki_text: str) -> str:
+ """
+ Applies the section title spacing fix and enforces consistent newlines
+ before (one blank line: \n\n) and after (one blank line: \n\n)
+ every section heading (Level 2 or higher).
+
+ This method guarantees the output format:
+ ...[Content]\n\n== Title ==\n\n[Next content]...
+
+ :param wiki_text: The full text of the wiki page.
+ :return: The corrected wiki page text.
+ """
+
+ # Pattern to match and replace a heading and its surrounding whitespace:
+ # 1. (.*?) : Group 1: Non-greedy capture of all content before the heading.
+    # 2. [\r\n\s]*        : Character-class run consuming all existing whitespace/newlines before the heading.
+    # 3. (^={2,}.*?={2,}$) : Group 2: The actual heading line, anchored to the start of a line (re.M).
+    # 4. [\r\n\s]*        : Character-class run consuming all existing whitespace/newlines after the heading.
+
+ # We use re.M (multiline) and re.DOTALL (dot matches newline)
+ heading_and_surroundings_pattern = re.compile(
+ r'(.*?)[\r\n\s]*(^={2,}.*?={2,}$)[\r\n\s]*', re.M | re.DOTALL
+ )
+
+ def heading_replacer_full_format(match):
+ """
+ Callback function for re.sub that fixes spacing and enforces \n\n separation.
+ """
+ # Group 1: Content preceding the heading
+ content_before = match.group(1).rstrip()
+ # Group 2: The raw heading line
+ raw_heading = match.group(2)
+
+ # 1. Fix the internal spacing of the heading
+ corrected_heading = fix_section_title_spacing_internal(raw_heading)
+
+ # 2. Determine the prefix separator: \n\n
+ # If the heading is the first thing on the page (i.e., content_before is empty),
+ # we don't want to prepend \n\n. Otherwise, we do.
+ if content_before:
+ prefix = '\n\n'
+ else:
+ prefix = ''
+
+ # 3. The replacement structure:
+ # {Content Before}{Prefix}\n{Corrected Heading}\n\n
+ # The content that follows this match will immediately follow the final \n\n.
+ return f'{content_before}{prefix}{corrected_heading}\n\n'
+
+ # Apply the fix globally
+ corrected_text = heading_and_surroundings_pattern.sub(
+ heading_replacer_full_format,
+ wiki_text
+ )
+
+ # Clean up any residual excess newlines at the very beginning of the page
+ return corrected_text.lstrip('\r\n')
+
+
+def main():
+ """Hard-coded wiki page text for testing and debugging."""
+
+ # Text demonstrates various input issues:
+ # 1. Title 1: No internal space, no newline after content. (Needs \n\n before and after)
+ # 2. Title 2: Too much internal space, one newline after content.
+ # 3. Title 3: Correct internal space, three newlines after content.
+ # 4. Title 4: Starts immediately after content (missing newline before).
+
+ raw_wiki_page_text = (
+ "This is the header text.\n"
+ "This is the last line of the header.\n" # Content before first heading
+ "==Topic1==\n\n\n" # Missing \n before, too many \n after
+ "Content for topic 1.\n"
+ "Content continues...\n"
+ "=== Topic2 ===\n" # Missing \n before, one \n after
+ "Content for topic 2.\n"
+ "== Topic3 ==\n\n\n"
+ "Content for topic 3. Correct space, too many \n after.\n"
+ "Some more content.\n"
+ "====Topic4====\n" # Missing \n before, missing \n after
+ "Final content."
+ )
+
+ print("--- Original Wiki Page Text ---\n")
+ print(raw_wiki_page_text)
+ print("\n" + "="*60 + "\n")
+
+ corrected_text = fix_wiki_page_spacing(raw_wiki_page_text)
+
+ print("--- Corrected Wiki Page Text (Enforcing: \n\n== Title ==\n\n) ---\n")
+ print(corrected_text)
+ print("\n" + "="*60 + "\n")
+
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
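For reference, a sketch of the normalisation contract `fix_wiki_page_spacing` enforces (exactly one blank line around each heading, one space inside the `=` fences):

```python
from translatable_wikitext_converter.butta import fix_wiki_page_spacing

# 'intro\n==Topic1==\ncontent' gains the blank lines and the internal spaces.
print(repr(fix_wiki_page_spacing("intro\n==Topic1==\ncontent")))
# Expected: 'intro\n\n== Topic1 ==\n\ncontent'
```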
diff --git a/templates/home.html b/translatable_wikitext_converter/templates/home.html
similarity index 100%
rename from templates/home.html
rename to translatable_wikitext_converter/templates/home.html
diff --git a/translatable_wikitext_converter/tests.py b/translatable_wikitext_converter/tests.py
new file mode 100644
index 0000000..00226cb
--- /dev/null
+++ b/translatable_wikitext_converter/tests.py
@@ -0,0 +1,238 @@
+import unittest
+
+from translatable_wikitext_converter.app import tag_for_translation
+
+class TestTranslatableWikitext(unittest.TestCase):
+
+ def test_section_headers(self):
+ self.assertEqual(
+ tag_for_translation("==HELLO=="),
+ """This is a quote."), + "
" + ) + + def test_poem_tag(self): + self.assertEqual( + tag_for_translation("This is a quote.
some code for you."),
+ """some code for you. tags in the wikitext.
It wraps the content in tags.
@@ -153,7 +81,7 @@ def process_code_tag(text, tvar_code_id=0):
if not content:
return text
     # Wrap the content in <tvar> tags
-    wrapped_content = f'<tvar name="code{tvar_code_id}"><code>{content}</code></tvar>'
+    wrapped_content = f'<tvar><code>{content}</code></tvar>'
return f"{prefix}{wrapped_content}{suffix}"
def process_div(text):
@@ -280,7 +208,7 @@ def process_item(text):
item_content = text[offset:].strip()
if not item_content:
return text
- return text[:offset] + ' ' + _wrap_in_translate(item_content) + '\n'
+ return text[:offset] + ' ' + convert_to_translatable_wikitext(item_content) + '\n'
class double_brackets_types(Enum):
wikilink = 1
@@ -289,8 +217,8 @@ class double_brackets_types(Enum):
not_inline_icon_file = 4
special = 5
invalid_file = 6
-
-def _process_file(s, tvar_inline_icon_id=0):
+
+def _process_file(s):
# Define keywords that should NOT be translated when found as parameters
NON_TRANSLATABLE_KEYWORDS = {
'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none',
@@ -351,7 +279,7 @@ def _process_file(s, tvar_inline_icon_id=0):
if is_inline_icon:
         # return something like: <tvar>[[File:smiley.png|alt=🙂]]</tvar>
-        returnline = f'<tvar name="icon{tvar_inline_icon_id}">[[' + '|'.join(tokens) + ']]</tvar>'
+        returnline = f'<tvar>[[' + '|'.join(tokens) + ']]</tvar>'
return returnline, double_brackets_types.inline_icon
############################
@@ -387,7 +315,7 @@ def _process_file(s, tvar_inline_icon_id=0):
returnline = '[[' + '|'.join(output_parts) + ']]'
return returnline, double_brackets_types.not_inline_icon_file
-def process_double_brackets(text, tvar_id=0):
+def process_double_brackets(text):
"""
Processes internal links in the wikitext.
    It wraps the content in <tvar> tags.
@@ -404,7 +332,9 @@ def process_double_brackets(text, tvar_id=0):
category_aliases = ['Category:', 'category:', 'Cat:', 'cat:']
file_aliases = ['File:', 'file:', 'Image:', 'image:']
- parts[0] = parts[0].strip() # Clean up the first part
+ # strip all parts
+ parts = [part.strip() for part in parts]
+
# Check if the first part is a category or file alias
if parts[0].startswith(tuple(category_aliases)):
# Handle category links
@@ -417,14 +347,174 @@ def process_double_brackets(text, tvar_id=0):
# Handle special pages
return f'[[{parts[0]}]]', double_brackets_types.special
- # Assuming it's a regular internal link
+ #############################
+ # Managing wikilinks
+ #############################
+
+ # List of recognised prefixes for Wikimedia projects (e.g., wikipedia, commons)
+ # and local/national chapters (e.g., wmde, wmit).
+ interwiki_prefixes = [
+ # Main Projects
+ "wikipedia", "w",
+ "wiktionary", "wikt",
+ "wikinews", "n",
+ "wikibooks", "b",
+ "wikiquote", "q",
+ "wikisource", "s",
+ "oldwikisource", "s:mul",
+ "wikispecies", "species",
+ "wikiversity", "v",
+ "wikivoyage", "voy",
+ "wikimedia", "foundation", "wmf",
+ "commons", "c",
+ "metawiki", "metawikimedia", "metawikipedia", "meta", "m",
+ "incubator",
+ "strategy",
+ "mediawikiwiki", "mw",
+ "mediazilla", "bugzilla",
+ "phabricator", "phab",
+ "testwiki",
+ "wikidata", "d",
+ "wikifunctions", "f",
+ "wikitech",
+ "toolforge",
+
+ # National Chapters
+ "wmar", "wmau", "wmbd", "wmbe", "wmbr", "wmca", "wmcz", "wmdk",
+ "wmde", "wmfi", "wmhk", "wmhu", "wmin", "wmid", "wmil", "wmit",
+ "wmnl", "wmmk", "wmno", "wmpl", "wmru", "wmrs", "wmes", "wmse",
+ "wmch", "wmtw", "wmua", "wmuk",
+
+ # Other Wikimedia Prefixes
+ "betawikiversity", "v:mul",
+ "download", "dbdump", "gerrit", "mail", "mailarchive",
+ "outreach", "otrs", "OTRSwiki", "quality", "spcom",
+ "ticket", "tools", "tswiki", "svn", "sulutil",
+ "rev", "wmania", "wm2016", "wm2017"
+ ]
+ # Convert the list to a set for efficient lookup/checking.
+ interwiki_prefixes_set = set(interwiki_prefixes)
+ # Regex to identify if the link starts with a language code (e.g., 'it:', 'bn:').
+ LANGUAGE_CODE_PATTERN = re.compile(r'^[a-z]{2,3}:')
+
+ # Determine the link target (before the pipe) and the display text (after the pipe).
+ link_title = parts[0]
+ # If a pipe is present, use the part after it; otherwise, use the link target itself.
+ display_text = parts[1] if len(parts) > 1 else parts[0]
+
+ # --- 1. Checking for Project/Chapter/Interwiki Prefixes ---
+
+ # We try to extract the prefix (e.g. ":bn:" from ":bn:Page")
+ first_part_lower = link_title.lower()
+
+ has_known_prefix = False
+
+ # A. Check 1: Simple Language Code Match (e.g., ":it:", ":bn:")
+    # This covers the explicit requirement: "if it starts with a language code followed by a colon..."
+ if LANGUAGE_CODE_PATTERN.match(first_part_lower):
+ has_known_prefix = True
+
+ # B. Check 2: Complex Prefix Parsing (Covers "w:", "commons:", "wmde:", or combined forms)
+ elif ':' in first_part_lower:
+ # Split the link by colon, excluding the last part which is the page title.
+ # Example: ":bn:s:Page" -> segments: ['','bn','s']
+ # Example: ":w:de:Page" -> segments: ['', 'w','de']
+ # Example: ":commons:File" -> segments: ['', 'commons']
+
+ segments = first_part_lower.split(':')
+
+ # We look at all segments except the last one (which is the actual page title).
+ # We stop the search if the last segment (the title) is empty, which happens for links ending in a colon.
+ # e.g., 'w:' splits to ['w', ''] -> we check 'w'.
+ limit = len(segments) - 1
+ if segments[-1] == '':
+ limit = len(segments) - 2
+
+ # Iterate through all prefix segments
+ for segment in segments[:limit]:
+ # The empty string segment resulting from a leading colon (e.g., ':w:de:Page' -> first segment is '') is ignored.
+ if segment:
+ # Check if the segment is a known project/chapter prefix.
+ if segment in interwiki_prefixes_set:
+ has_known_prefix = True
+ break # Stop checking once any known prefix is found
+
+ # Check if the segment is a language code (e.g., 'de' in 'w:de:Page').
+ # We can't use the regex pattern here as it checks for start-of-string.
+ # A quick check for typical language code length (2 or 3 chars) is used as a proxy,
+ # although a full language code check would be more robust.
+ if 2 <= len(segment) <= 3:
+ # Assuming a 2/3 letter segment that isn't a known prefix is treated as a language code
+ # for the purpose of avoiding Special:MyLanguage.
+ has_known_prefix = True
+ break
+
+ # If the link is complex (multiple colons) or contains a known prefix,
+ # then it is an interwiki link and should not be routed through Special:MyLanguage.
+ # The check below remains the same, but 'has_known_prefix' is now robustly set.
+
+ if has_known_prefix or ':' in link_title:
+ # If it has a prefix (linguistic or project/chapter), DO NOT use Special:MyLanguage.
+
+ # --- 2. Special handling for the ":en:" prefix ---
+ if first_part_lower.startswith(':en:'):
+ # For links starting with ':en:', rewrite using the {{lwp|...}} template.
+
+ # The suffix is the page title *without* the ":en:" prefix.
+ en_suffix = link_title[4:] # Removes ":en:"
+ capitalised_en_suffix = capitalise_first_letter(en_suffix)
+ # Case 1: No pipe (e.g., "[[en:About]]")
+ if len(parts) == 1:
+ # Target: {{lwp|About}}. Display text: About (en_suffix).
+                return f'[[<tvar>{{{{lwp|{capitalised_en_suffix}}}}}</tvar>|{en_suffix}]]', double_brackets_types.wikilink
+
+ # Case 2: With pipe (e.g., "[[en:About|Read More]]")
+ if len(parts) == 2:
+ # Target: {{lwp|About}}. Display text: Read More (display_text).
+                return f'[[<tvar>{{{{lwp|{capitalised_en_suffix}}}}}</tvar>|{display_text}]]', double_brackets_types.wikilink
+
+ # --- 3. Handling all other interwiki/prefixed links (e.g., ":it:", "w:", "wmde:") ---
+
+ # Find the index of the *last* colon to correctly separate the page title
+ # from the potentially complex prefix (e.g., extract 'Page' from 'bn:Page').
+ if link_title.rfind(':') != -1:
+ # Extract the page title by finding the content after the final colon.
+ title_without_prefix = link_title[link_title.rfind(':') + 1:]
+ else:
+ # Should not happen for prefixed links, but handles the fallback gracefully.
+ title_without_prefix = link_title
+
+ # Case 1: No pipe (e.g., "[[bn:Page]]" or "[[w:Page]]")
+ if len(parts) == 1:
+ # Link target remains link_title (e.g., bn:Page).
+ # Display text is the title *without* the prefix (e.g., Page).
+            return f'[[<tvar>{link_title}</tvar>|{title_without_prefix}]]', double_brackets_types.wikilink
+
+ # Case 2: With pipe (e.g., "[[bn:Page|Text]]")
+ if len(parts) == 2:
+ # Link target remains link_title (e.g., bn:Page).
+ # Display text is the text after the pipe (e.g., Text).
+            return f'[[<tvar>{link_title}</tvar>|{display_text}]]', double_brackets_types.wikilink
+
+ # --- 4. Standard internal links (No special prefix found) ---
+
+ # For standard internal links, the target must be prefixed with Special:MyLanguage
+ # to enable automatic localisation. 'capitalise_first_letter' is required here.
+
+ # Case 1: No pipe (e.g., [[Page]])
if len(parts) == 1:
-        return f'[[<tvar name="{tvar_id}">Special:MyLanguage/{capitalise_first_letter(parts[0])}</tvar>|{parts[0]}]]', double_brackets_types.wikilink
-    if len(parts) == 2 :
-        return f'[[<tvar name="{tvar_id}">Special:MyLanguage/{capitalise_first_letter(parts[0])}</tvar>|{parts[1]}]]', double_brackets_types.wikilink
+        # Target: Special:MyLanguage/Page. Display text: Page (link_title).
+        return f'[[<tvar>Special:MyLanguage/{capitalise_first_letter(link_title)}</tvar>|{link_title}]]', double_brackets_types.wikilink
+
+    # Case 2: With pipe (e.g., [[Page|Text]])
+    if len(parts) == 2:
+        # Target: Special:MyLanguage/Page. Display text: Text (display_text).
+        return f'[[<tvar>Special:MyLanguage/{capitalise_first_letter(link_title)}</tvar>|{display_text}]]', double_brackets_types.wikilink
+
+ # Fallback for unexpected link format (e.g., more than one pipe).
return text
-def process_external_link(text, tvar_url_id=0):
+def process_external_link(text):
"""
Processes external links in the format [http://example.com Description] and ensures
    that only the description part is wrapped in <translate> tags, leaving the URL untouched.
@@ -435,7 +525,7 @@ def process_external_link(text, tvar_url_id=0):
url_part = match.group(1)
description_part = match.group(2)
     # Wrap only the description part in <translate> tags, leave the URL untouched inside a <tvar>
-    return f'[<tvar name="url{tvar_url_id}">{url_part}</tvar> {description_part}]'
+    return f'[<tvar>{url_part}</tvar> {description_part}]'
return text
def process_template(text):
@@ -466,6 +556,9 @@ def process_raw_url(text):
return text
return text.strip()
+def tag_for_translation(text):
+ converted_text = convert_to_translatable_wikitext(text)
+ return set_tvar_names(converted_text)
# --- Main Tokenisation Logic ---
@@ -479,6 +572,9 @@ def convert_to_translatable_wikitext(wikitext):
if not wikitext:
return ""
+ wikitext = wikitext.replace('\r\n', '\n').replace('\r', '\n')
+ wikitext = fix_wiki_page_spacing(wikitext)
+
# add an extra newline at the beginning, useful to process items at the beginning of the text
wikitext = '\n' + wikitext
@@ -496,7 +592,7 @@ def convert_to_translatable_wikitext(wikitext):
if last < curr:
parts.append((wikitext[last:curr], _wrap_in_translate))
parts.append((wikitext[curr:end_pattern], process_syntax_highlight))
- curr = end_pos
+ curr = end_pattern
last = curr
continue
# Table block
@@ -731,37 +827,33 @@ def convert_to_translatable_wikitext(wikitext):
"""
# Process links
- tvar_id = 0
- tvar_url_id = 0
- tvar_code_id = 0
- tvar_inline_icon_id = 0
for i, (part, handler) in enumerate(parts):
# Handlers for links require a tvar_id
if handler == process_double_brackets:
- new_part, double_brackets_type = handler(part, tvar_id)
+ new_part, double_brackets_type = handler(part)
if double_brackets_type in [double_brackets_types.wikilink, double_brackets_types.special, double_brackets_types.inline_icon]:
new_handler = _wrap_in_translate # Change handler to _wrap_in_translate
else :
new_handler = lambda x: x # No further processing for categories and files
parts[i] = (new_part, new_handler)
- tvar_id += 1
elif handler == process_external_link:
- new_part = handler(part, tvar_url_id)
+ new_part = handler(part)
new_handler = _wrap_in_translate # Change handler to _wrap_in_translate
parts[i] = (new_part, new_handler)
- tvar_url_id += 1
elif handler == process_code_tag:
- new_part = handler(part, tvar_code_id)
+ new_part = handler(part)
new_handler = _wrap_in_translate # Change handler to _wrap_in_translate
parts[i] = (new_part, new_handler)
- tvar_code_id += 1
elif handler == process_double_brackets :
- new_part, double_brackets_type = handler(part, tvar_inline_icon_id)
+ new_part, double_brackets_type = handler(part)
if double_brackets_type == double_brackets_types.inline_icon:
new_handler = _wrap_in_translate # Change handler to _wrap_in_translate
- tvar_inline_icon_id += 1
else:
new_handler = lambda x: x
+ elif handler == process_syntax_highlight :
+ new_part = handler(part)
+ new_handler = _wrap_in_translate # Change handler to _wrap_in_translate
+ parts[i] = (new_part, new_handler)
# Scan again the parts: merge consecutive parts handled by _wrap_in_translate
_parts = []
@@ -785,54 +877,111 @@ def convert_to_translatable_wikitext(wikitext):
print("Processed parts:")
for i, (ppart, (part, handler)) in enumerate(zip(processed_parts, _parts)):
print(f"--- Start element {i} with handler {handler.__name__} ---")
- print(part)
+ print(f"@{part}@")
print(f"---\n")
- print(ppart)
+ print(f'@{ppart}@')
print(f"---\n")
"""
# Join the processed parts into a single string
- return ''.join(processed_parts)[1:] # Remove the leading newline added at the beginning
-
-@app.route('/')
-def index():
- return render_template('home.html')
-
-@app.route('/convert', methods=['GET'])
-def redirect_to_home():
- return render_template('home.html')
-
-@app.route('/convert', methods=['POST'])
-def convert():
- wikitext = request.form.get('wikitext', '')
- converted_text = convert_to_translatable_wikitext(wikitext)
- return render_template('home.html', original=wikitext, converted=converted_text)
-
-@app.route('/api/convert', methods=['GET', 'POST'])
-def api_convert():
- if request.method == 'GET':
- return """
- Translate Tagger API
- Send a POST request with JSON data to use this API.
- Example:
-
- curl -X POST https://translatetagger.toolforge.org/api/convert \\
- -H "Content-Type: application/json" \\
- -d '{"wikitext": "This is a test [[link|example]]"}'
-
+ out_wikitext = ''.join(processed_parts)
+
+ # Keep removing all trailing and leading newlines and spaces
+ while out_wikitext.startswith('\n') or out_wikitext.startswith(' ') or out_wikitext.endswith('\n') or out_wikitext.endswith(' '):
+ out_wikitext = out_wikitext.strip('\n')
+ out_wikitext = out_wikitext.strip(' ')
+
+ return out_wikitext
+
+def set_tvar_names(input_text: str) -> str:
+ """
+    Sets the 'name' attribute of every <tvar> tag inside a <translate> block,
+ using an increasing counter (starting from 1) for each block.
+
+    This version assumes tags are initially simple, e.g., <tvar> or <tvar/>.
+
+ Args:
+        input_text: The input string containing <translate> and <tvar> tags.
+
+ Returns:
+ The modified string with the 'name' attributes set.
+ """
+
+    # 1. Regular expression to find all <translate>...</translate> blocks, including content.
+    #    We use re.DOTALL to ensure the match spans multiple lines.
+    translate_pattern = re.compile(r'(<translate>.*?<\/translate>)', re.DOTALL)
+
+ def process_translate_block(full_block_match):
"""
- elif request.method == 'POST':
- data = request.get_json()
- if not data or 'wikitext' not in data:
- return jsonify({'error': 'Missing "wikitext" in JSON payload'}), 400
+        Callback function for re.sub that processes one <translate> block.
+        It finds all simple <tvar> tags inside and gives them an incremental 'name' attribute.
+ """
+ # The entire matched block
+ full_block = full_block_match.group(0)
+
+ # Initialise the counter for the current block
+ count = 1
+
+ def substitute_simple_tvar(tvar_match):
+ """
+ Inner callback function to substitute a simple and increment the counter.
+ """
+ nonlocal count
+
+ # The match group 1 captures the opening tag parts: ' becomes
+ # or becomes
+
+ # This expression handles both and by replacing the final '>' or '/>'
+ # with the insertion plus the captured closing part (group 2).
+ name_attribute = f' name="{count}"'
+
+ # Group 2 captures the closing element (either '>' or '/>')
+ closing_part = tvar_match.group(2)
+
+ new_tag = f'{opening_part}{name_attribute}{closing_part}'
+
+        # Increment the counter for the next <tvar>
+ count += 1
+
+ return new_tag
+
+    # Internal pattern: finds <tvar> or <tvar/> where 'name' is not present.
+ # This is a robust pattern for HTML/XML tags where an attribute is to be inserted
+ # right before the closing bracket.
- wikitext = data.get('wikitext', '')
- converted_text = convert_to_translatable_wikitext(wikitext)
+    # Group 1: (<tvar) - the tag opening; Group 2: (/?>) - the closing angle bracket (possibly with / for self-closing)
+ # We need to ensure we don't accidentally match existing 'name' attributes.
- return jsonify({
- 'original': wikitext,
- 'converted': converted_text
- })
+    # Simpler pattern for *all* <tvar> tags, assuming no existing name:
+    tvar_pattern_inner = re.compile(r'(<tvar)(\s*\/?>)', re.DOTALL)
-if __name__ == '__main__':
- app.run(debug=True)
+ # To strictly avoid tags that *already* contain 'name':
+    # We use a negative lookahead to ensure "name=" is not present inside <tvar ...>
+ # This pattern is more complex but safer:
+    tvar_pattern_safer = re.compile(r'(<tvar(?![^>]*name=)[^>]*)(\/?>)', re.IGNORECASE | re.DOTALL)
+
+ # We will utilise the simpler pattern, assuming the context is pre-processing before translation:
+    tvar_pattern_to_use = re.compile(r'(<tvar)(\s*\/?>)', re.DOTALL)
+
+ # Apply the substitution to all tags within the current block
+ modified_block = re.sub(
+ tvar_pattern_to_use,
+ substitute_simple_tvar,
+ full_block
+ )
+
+ return modified_block
+
+ # 2. Apply the block processor function to all blocks.
+ final_result = re.sub(
+ translate_pattern,
+ process_translate_block,
+ input_text
+ )
+
+ return final_result
\ No newline at end of file
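To make the two-stage pipeline concrete, a sketch of `tag_for_translation` on a plain wikilink (the `<tvar>` placement follows the tag reconstruction above, so treat the exact output as indicative):

```python
from translatable_wikitext_converter.app import tag_for_translation

# The link target is routed through Special:MyLanguage and hidden from
# translators inside a <tvar>; set_tvar_names then numbers it per block.
print(tag_for_translation("See the [[Help|docs]] page."))
# Indicative output:
# <translate>See the [[<tvar name="1">Special:MyLanguage/Help</tvar>|docs]] page.</translate>
```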
diff --git a/translatable_wikitext_converter/wikitranslator_utils.py b/translatable_wikitext_converter/wikitranslator_utils.py
new file mode 100644
index 0000000..37860d8
--- /dev/null
+++ b/translatable_wikitext_converter/wikitranslator_utils.py
@@ -0,0 +1,221 @@
+# --- Utility Functions for Wikitext Conversion ---
+# This module contains helper functions that are used across the
+# wikitext conversion process. These functions handle tasks such as
+# capitalising text, checking for emojis, and wrapping text in
+# translation tags.
+
+import re, sys
+
+# Pattern to identify section headers (Level 2 or higher)
+SECTION_HEADER_PATTERN = re.compile(r'(={2,})\s*(.+?)\s*\1', re.DOTALL)
+
+# --- Helper Functions for Processing Different Wikitext Elements ---
+# These functions are designed to handle specific wikitext structures.
+# Some will recursively call the main `convert_to_translatable_wikitext`
+# function to process their internal content, ensuring nested elements
+# are also handled correctly.
+
+def capitalise_first_letter(text):
+ """
+ Capitalises the first letter of the given text.
+ If the text is empty or consists only of whitespace, it returns the text unchanged.
+ """
+ if not text or not text.strip():
+ return text
+ return text[0].upper() + text[1:]
+
+def is_emoji_unicode(char):
+ # This is a very simplified set of common emoji ranges.
+ # A comprehensive list would be much longer and more complex.
+ # See https://www.unicode.org/Public/emoji/ for full details.
+ if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons
+ return True
+ if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs
+ return True
+ if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols
+ return True
+ if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols
+ return True
+ if 0x2700 <= ord(char) <= 0x27BF: # Dingbats
+ return True
+ # Add more ranges as needed for full coverage
+ return False
+
+def _wrap_in_translate(text):
+ """
+    Wraps the text with <translate> tags.
+ If the content starts or ends with a section header, it includes the preceding
+ or succeeding newline in the translation block.
+ """
+ if not text or not text.strip():
+ return text
+
+ # 1. Find the indices of the non-whitespace content
+ first_char_index = -1
+ last_char_index = -1
+
+ # We loop to find the first/last character that is NOT whitespace
+ for i, char in enumerate(text):
+ if char not in (' ', '\n', '\t', '\r', '\f', '\v'):
+ if first_char_index == -1:
+ first_char_index = i
+ last_char_index = i
+
+ if first_char_index == -1:
+ # If no non-whitespace characters are found, return the original text
+ return text
+
+ # Initial split
+ leading_whitespace = text[:first_char_index]
+ content = text[first_char_index : last_char_index + 1]
+ trailing_whitespace = text[last_char_index + 1 :]
+
+ # 2. Initial adjustment (To include the newline above the header)
+
+ # We check if the content starts with a section header
+ # (We use .match() on content to see if the header is at the very beginning)
+ match_start = SECTION_HEADER_PATTERN.match(content)
+
+ if match_start and leading_whitespace.endswith('\n'):
+ # If there is a header and the line above is a '\n', we move the '\n' from leading to content
+
+ # We subtract the '\n' from leading_whitespace
+ leading_whitespace = leading_whitespace[:-1]
+
+ # We recalculate content to include the preceding '\n'
+ content = text[first_char_index - 1 : last_char_index + 1]
+
+ # We update first_char_index for subsequent calculations (even if not used here)
+ first_char_index -= 1
+
+
+ # 3. Final adjustment (To include the newline below the header)
+
+ # We find the last match (to see if the header finishes the content block)
+ last_match = None
+ for m in SECTION_HEADER_PATTERN.finditer(content):
+ last_match = m
+
+ if last_match and last_match.end() == len(content) and trailing_whitespace.startswith('\n'):
+ # If the header is the last thing and the subsequent block starts with '\n', we include it
+
+ # We remove the '\n' from trailing_whitespace
+ trailing_whitespace = trailing_whitespace[1:]
+
+ # We extend content to include the subsequent '\n'
+        content = text[first_char_index : last_char_index + 2]  # +2 to also include the trailing '\n'
+
+ # 4. Returning the result
+ return f"{leading_whitespace}{content} {trailing_whitespace}"
+
+############################################
+# Functions for Fixing Wiki Page Spacing #
+############################################
+
+def fix_section_title_spacing_internal(title: str) -> str:
+ """
+ Detects a section title and ensures there is exactly one space
+ between the '=' characters and the title text.
+ """
+ # Pattern: (={2,}) [optional space] (.+?) [optional space] \1
+ pattern = SECTION_HEADER_PATTERN
+
+ # Replacement: \1 [space] \2 [space] \1
+ return pattern.sub(r'\1 \2 \1', title)
+
+# --- Main Function to Fix Wiki Page Spacing ---
+
+def fix_wiki_page_spacing(wiki_text: str) -> str:
+ """
+ Applies the section title spacing fix and enforces consistent newlines
+ before (one blank line: \n\n) and after (one blank line: \n\n)
+ every section heading (Level 2 or higher).
+
+ This method guarantees the output format:
+ ...[Content]\n\n== Title ==\n\n[Next content]...
+
+ :param wiki_text: The full text of the wiki page.
+ :return: The corrected wiki page text.
+ """
+
+ # Pattern to match and replace a heading and its surrounding whitespace:
+ # 1. (.*?) : Group 1: Non-greedy capture of all content before the heading.
+    # 2. [\r\n\s]*        : Character-class run consuming all existing whitespace/newlines before the heading.
+    # 3. (^={2,}.*?={2,}$) : Group 2: The actual heading line, anchored to the start of a line (re.M).
+    # 4. [\r\n\s]*        : Character-class run consuming all existing whitespace/newlines after the heading.
+
+ # We use re.M (multiline) and re.DOTALL (dot matches newline)
+ heading_and_surroundings_pattern = re.compile(
+ r'(.*?)[\r\n\s]*(^={2,}.*?={2,}$)[\r\n\s]*', re.M | re.DOTALL
+ )
+
+ def heading_replacer_full_format(match):
+ """
+ Callback function for re.sub that fixes spacing and enforces \n\n separation.
+ """
+ # Group 1: Content preceding the heading
+ content_before = match.group(1).rstrip()
+ # Group 2: The raw heading line
+ raw_heading = match.group(2)
+
+ # 1. Fix the internal spacing of the heading
+ corrected_heading = fix_section_title_spacing_internal(raw_heading)
+
+ # 2. Determine the prefix separator: \n\n
+ # If the heading is the first thing on the page (i.e., content_before is empty),
+ # we don't want to prepend \n\n. Otherwise, we do.
+ if content_before:
+ prefix = '\n\n'
+ else:
+ prefix = ''
+
+ # 3. The replacement structure:
+ # {Content Before}{Prefix}\n{Corrected Heading}\n\n
+ # The content that follows this match will immediately follow the final \n\n.
+ return f'{content_before}{prefix}{corrected_heading}\n\n'
+
+ # Apply the fix globally
+ corrected_text = heading_and_surroundings_pattern.sub(
+ heading_replacer_full_format,
+ wiki_text
+ )
+
+ # Clean up any residual excess newlines at the very beginning of the page
+ return corrected_text.lstrip('\n')
+
+# Added so the module can be run directly for a quick manual check
+if __name__ == '__main__':
+
+    # --- Test Data ---
+    # Covers several incorrect section-spacing cases:
+    # 1. Wrong internal spacing (both too much and missing).
+    # 2. Wrong external spacing (too many newlines or none at all).
+    # 3. A heading at the very start of the page (must not get \n\n before it).
+    # 4. Content in between.
+
+ test_wikitext = """
+
+== Ciao ==
+
+ciao
+== Ciao ==
+ciao
+== Ciao ==
+
+ciao
+"""
+
+ print("--- Test della funzione fix_wiki_page_spacing ---")
+ print("Testo Wiki Originale:\n" + "-"*30)
+ print(test_wikitext)
+ print("-" * 30)
+
+    # Run the function
+ corrected_wikitext = fix_wiki_page_spacing(test_wikitext)
+
+ print("\nTesto Wiki Corretto:\n" + "="*30)
+
+    # Use repr() to clearly show all the newlines (\n) and spaces
+    print(repr(corrected_wikitext))
+ print("=" * 30)
+
\ No newline at end of file
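Finally, a sketch of the `_wrap_in_translate` contract from this module: per the docstring, a heading pulls its surrounding newlines inside the block:

```python
from translatable_wikitext_converter.wikitranslator_utils import _wrap_in_translate

# The newlines flanking a section header are kept inside the tags.
print(repr(_wrap_in_translate("\n== Title ==\n")))
# Expected: '<translate>\n== Title ==\n</translate>'
```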