From 3f65d2398b6b3c506af4a0d425c49c6890af9af7 Mon Sep 17 00:00:00 2001 From: super-nabla Date: Sun, 14 Dec 2025 17:46:52 +0100 Subject: [PATCH 1/8] debug position --- app.py | 4 ++-- tests.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/app.py b/app.py index 0925546..79627c1 100644 --- a/app.py +++ b/app.py @@ -419,9 +419,9 @@ def process_double_brackets(text, tvar_id=0): # Assuming it's a regular internal link if len(parts) == 1: - return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[0]}]]', double_brackets_types.wikilink + return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[0]}]]', double_brackets_types.wikilink if len(parts) == 2 : - return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[1]}]]', double_brackets_types.wikilink + return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[1]}]]', double_brackets_types.wikilink return text def process_external_link(text, tvar_url_id=0): diff --git a/tests.py b/tests.py index 2112349..0bb4460 100644 --- a/tests.py +++ b/tests.py @@ -22,7 +22,7 @@ def test_internal_and_external_links(self): convert_to_translatable_wikitext( 'This is a text with an [[internal link]] and an [https://openstreetmap.org external link].' ), - 'This is a text with an [[Special:MyLanguage/Internal link|internal link]] and an [https://openstreetmap.org external link].' + 'This is a text with an [[Special:MyLanguage/Internal link|internal link]] and an [https://openstreetmap.org external link].' ) def test_category_with_translation(self): @@ -40,7 +40,7 @@ def test_notoc_preserved(self): def test_simple_internal_link(self): self.assertEqual( convert_to_translatable_wikitext('[[link]]'), - '[[Special:MyLanguage/Link|link]]' + '[[Special:MyLanguage/Link|link]]' ) def test_multiline_text(self): @@ -62,7 +62,7 @@ def test_double_namespace_without_list_case_1(self): convert_to_translatable_wikitext( '[[Help]]ing' ), - '[[Special:MyLanguage/Help|Help]]ing' + '[[Special:MyLanguage/Help|Help]]ing' ) def test_double_namespace_without_list_case_2(self): @@ -70,7 +70,7 @@ def test_double_namespace_without_list_case_2(self): convert_to_translatable_wikitext( '[[Help]] ing' ), - '[[Special:MyLanguage/Help|Help]] ing' + '[[Special:MyLanguage/Help|Help]] ing' ) def test_template_simple(self): @@ -155,7 +155,7 @@ def test_image_with_upright(self): def test_multiple_elements_in_one_line(self): self.assertEqual( convert_to_translatable_wikitext("Hello world! [[Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]"), - 'Hello world! [[Special:MyLanguage/Link|Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]' + 'Hello world! 
[[Special:MyLanguage/Link|Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]' ) def test_text_around_br_tag(self): From 78a2bbd261b00b097d57ef2341832b698b60709b Mon Sep 17 00:00:00 2001 From: super-nabla Date: Sun, 14 Dec 2025 18:34:25 +0100 Subject: [PATCH 2/8] make py scripts more modular --- app.py | 792 +--------------------------------------------- tests.py | 2 +- wikitranslator.py | 790 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 793 insertions(+), 791 deletions(-) create mode 100644 wikitranslator.py diff --git a/app.py b/app.py index 79627c1..af70954 100644 --- a/app.py +++ b/app.py @@ -1,799 +1,11 @@ from flask import Flask, request, render_template, jsonify from flask_cors import CORS # Import flask-cors -import re -from enum import Enum -import sys + +from wikitranslator import convert_to_translatable_wikitext app = Flask(__name__) CORS(app) # Enable CORS for all routes -behaviour_switches = ['__NOTOC__', '__FORCETOC__', '__TOC__', '__NOEDITSECTION__', '__NEWSECTIONLINK__', '__NONEWSECTIONLINK__', '__NOGALLERY__', '__HIDDENCAT__', '__EXPECTUNUSEDCATEGORY__', '__NOCONTENTCONVERT__', '__NOCC__', '__NOTITLECONVERT__', '__NOTC__', '__START__', '__END__', '__INDEX__', '__NOINDEX__', '__STATICREDIRECT__', '__EXPECTUNUSEDTEMPLATE__', '__NOGLOBAL__', '__DISAMBIG__', '__EXPECTED_UNCONNECTED_PAGE__', '__ARCHIVEDTALK__', '__NOTALK__', '__EXPECTWITHOUTSCANS__'] - -# --- Helper Functions for Processing Different Wikitext Elements --- -# These functions are designed to handle specific wikitext structures. -# Some will recursively call the main `convert_to_translatable_wikitext` -# function to process their internal content, ensuring nested elements -# are also handled correctly. - -def capitalise_first_letter(text): - """ - Capitalises the first letter of the given text. - If the text is empty or consists only of whitespace, it returns the text unchanged. - """ - if not text or not text.strip(): - return text - return text[0].upper() + text[1:] - -def is_emoji_unicode(char): - # This is a very simplified set of common emoji ranges. - # A comprehensive list would be much longer and more complex. - # See https://www.unicode.org/Public/emoji/ for full details. - if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons - return True - if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs - return True - if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols - return True - if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols - return True - if 0x2700 <= ord(char) <= 0x27BF: # Dingbats - return True - # Add more ranges as needed for full coverage - return False - -def _wrap_in_translate(text): - """ - Wraps the given text with tags. - It ensures that empty or whitespace-only strings are not wrapped. - The tags are added around the non-whitespace content, - preserving leading and trailing whitespace. 
- """ - if not text or not text.strip(): - return text - - # Find the first and last non-whitespace characters - first_char_index = -1 - last_char_index = -1 - for i, char in enumerate(text): - if char not in (' ', '\n', '\t', '\r', '\f', '\v'): # Check for common whitespace characters - if first_char_index == -1: - first_char_index = i - last_char_index = i - - # If no non-whitespace characters are found (should be caught by text.strip() check, but for robustness) - if first_char_index == -1: - return text - - leading_whitespace = text[:first_char_index] - content = text[first_char_index : last_char_index + 1] - trailing_whitespace = text[last_char_index + 1 :] - - return f"{leading_whitespace}{content}{trailing_whitespace}" - -def process_syntax_highlight(text): - """ - Processes tags in the wikitext. - It wraps the content in tags. - """ - assert(text.startswith('')), "Invalid syntax highlight tag" - # Get inside the tag - start_tag_end = text.find('>') + 1 - end_tag_start = text.rfind('<') - if start_tag_end >= end_tag_start: - return text - prefix = text[:start_tag_end] - content = text[start_tag_end:end_tag_start].strip() - suffix = text[end_tag_start:] - if not content: - return text - # Wrap the content in tags - wrapped_content = _wrap_in_translate(content) - return f"{prefix}{wrapped_content}{suffix}" - -def process_table(text): - """ - Processes table blocks in the wikitext. - It wraps the content in tags. - """ - assert(text.startswith('{|') and text.endswith('|}')), "Invalid table tag" - return text - -def process_blockquote(text): - """ - Processes blockquote tags in the wikitext. - It wraps the content in tags. - """ - assert(text.startswith('
<blockquote>') and text.endswith('</blockquote>
')), "Invalid blockquote tag" - start_tag_end = text.find('>') + 1 - end_tag_start = text.rfind('<') - if start_tag_end >= end_tag_start: - return text - prefix = text[:start_tag_end] - content = text[start_tag_end:end_tag_start].strip() - suffix = text[end_tag_start:] - if not content: - return text - # Wrap the content in tags - wrapped_content = _wrap_in_translate(content) - return f"{prefix}{wrapped_content}{suffix}" - -def process_poem_tag(text): - """ - Processes tags in the wikitext. - It wraps the content in tags. - """ - assert(text.startswith('')), "Invalid poem tag" - start_tag_end = text.find('>') + 1 - end_tag_start = text.rfind('<') - if start_tag_end >= end_tag_start: - return text - prefix = text[:start_tag_end] - content = text[start_tag_end:end_tag_start].strip() - suffix = text[end_tag_start:] - if not content: - return text - # Wrap the content in tags - wrapped_content = _wrap_in_translate(content) - return f"{prefix}{wrapped_content}{suffix}" - -def process_code_tag(text, tvar_code_id=0): - """ - Processes tags in the wikitext. - It wraps the content in tags. - """ - assert(text.startswith('')), "Invalid code tag" - # Get inside the tag - start_tag_end = text.find('>') + 1 - end_tag_start = text.rfind('<') - if start_tag_end >= end_tag_start: - return text - prefix = text[:start_tag_end] - content = text[start_tag_end:end_tag_start].strip() - suffix = text[end_tag_start:] - if not content: - return text - # Wrap the content in tags - wrapped_content = f'{content}' - return f"{prefix}{wrapped_content}{suffix}" - -def process_div(text): - """ - Processes
tags in the wikitext. - It wraps the content in tags. - """ - assert(text.startswith('')), "Invalid div tag" - start_tag_end = text.find('>') + 1 - end_tag_start = text.rfind('<') - if start_tag_end >= end_tag_start: - return text - prefix = text[:start_tag_end] - content = text[start_tag_end:end_tag_start].strip() - suffix = text[end_tag_start:] - if not content: - return text - # Wrap the content in tags - wrapped_content = _wrap_in_translate(content) - return f"{prefix}{wrapped_content}{suffix}" - -def process_hiero(text): - """ - Processes tags in the wikitext. - It wraps the content in tags. - """ - assert(text.startswith('') and text.endswith('')), "Invalid hiero tag" - start_tag_end = text.find('>') + 1 - end_tag_start = text.rfind('<') - if start_tag_end >= end_tag_start: - return text - prefix = text[:start_tag_end] - content = text[start_tag_end:end_tag_start].strip() - suffix = text[end_tag_start:] - if not content: - return text - # Wrap the content in tags - wrapped_content = _wrap_in_translate(content) - return f"{prefix}{wrapped_content}{suffix}" - -def process_sub_sup(text): - """ - Processes and tags in the wikitext. - It wraps the content in tags. - """ - assert((text.startswith('') and text.endswith('')) or - (text.startswith('') and text.endswith(''))), "Invalid sub/sup tag" - start_tag_end = text.find('>') + 1 - end_tag_start = text.rfind('<') - if start_tag_end >= end_tag_start: - return text - prefix = text[:start_tag_end] - content = text[start_tag_end:end_tag_start].strip() - suffix = text[end_tag_start:] - if not content: - return text - # Wrap the content in tags - wrapped_content = _wrap_in_translate(content) - return f"{prefix}{wrapped_content}{suffix}" - -def process_math(text): - """ - Processes tags in the wikitext. - It wraps the content in tags. - """ - assert(text.startswith('') and text.endswith('')), "Invalid math tag" - return text - -def process_small_tag(text): - """ - Processes tags in the wikitext. - It wraps the content in tags. - """ - assert(text.startswith('') and text.endswith('')), "Invalid small tag" - start_tag_end = text.find('>') + 1 - end_tag_start = text.rfind('<') - if start_tag_end >= end_tag_start: - return text - prefix = text[:start_tag_end] - content = text[start_tag_end:end_tag_start].strip() - suffix = text[end_tag_start:] - if not content: - return text - # Wrap the content in tags - wrapped_content = _wrap_in_translate(content) - return f"{prefix}{wrapped_content}{suffix}" - -def process_nowiki(text): - """ - Processes tags in the wikitext. - It wraps the content in tags. - """ - assert(text.startswith('') and text.endswith('')), "Invalid nowiki tag" - start_tag_end = text.find('>') + 1 - end_tag_start = text.rfind('<') - if start_tag_end >= end_tag_start: - return text - prefix = text[:start_tag_end] - content = text[start_tag_end:end_tag_start].strip() - suffix = text[end_tag_start:] - if not content: - return text - # Wrap the content in tags - wrapped_content = _wrap_in_translate(content) - return f"{prefix}{wrapped_content}{suffix}" - -def process_item(text): - """ - Processes list items in the wikitext. - It wraps the content in tags. 
- """ - offset = 0 - if text.startswith(';'): - offset = 1 - elif text.startswith(':'): - offset = 1 - elif text.startswith('#'): - while text[offset] == '#': - offset += 1 - elif text.startswith('*'): - while text[offset] == '*': - offset += 1 - # Add translate tags around the item content - item_content = text[offset:].strip() - if not item_content: - return text - return text[:offset] + ' ' + _wrap_in_translate(item_content) + '\n' - -class double_brackets_types(Enum): - wikilink = 1 - category = 2 - inline_icon = 3 - not_inline_icon_file = 4 - special = 5 - invalid_file = 6 - -def _process_file(s, tvar_inline_icon_id=0): - # Define keywords that should NOT be translated when found as parameters - NON_TRANSLATABLE_KEYWORDS = { - 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', - 'upright', 'baseline', 'middle', 'sub', 'super', 'text-top', 'text-bottom', '{{dirstart}}', '{{dirend}}' - } - NON_TRANSLATABLE_KEYWORDS_PREFIXES = { - 'link=', 'upright=', 'alt=' - } - NOT_INLINE_KEYWORDS = { - 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', '{{dirstart}}', '{{dirend}}' - } - file_aliases = ['File:', 'file:', 'Image:', 'image:'] - - tokens = [] - - inner_content = s[2:-2] # Remove the leading [[ and trailing ]] - tokens = inner_content.split('|') - tokens = [token.strip() for token in tokens] # Clean up whitespace around tokens - - # The first token shall start with a file alias - # e.g., "File:Example.jpg" or "Image:Example.png" - if not tokens or not tokens[0].startswith(tuple(file_aliases)): - return line, double_brackets_types.invalid_file - - # The first token is a file link - filename = tokens[0].split(':', 1)[1] if ':' in tokens[0] else tokens[0] - tokens[0] = f'File:{filename}' - - # Substitute 'left' with {{dirstart}} - while 'left' in tokens: - tokens[tokens.index('left')] = '{{dirstart}}' - # Substitute 'right' with {{dirend}} - while 'right' in tokens: - tokens[tokens.index('right')] = '{{dirend}}' - - ############################ - # Managing inline icons - ############################# - is_inline_icon = True - for token in tokens: - if token in NOT_INLINE_KEYWORDS: - is_inline_icon = False - break - if is_inline_icon : - # Check if it contains 'alt=' followed by an emoji - for token in tokens[1:]: - if token.startswith('alt='): - alt_text = token[len('alt='):].strip() - if not any(is_emoji_unicode(char) for char in alt_text): - is_inline_icon = False - break - elif token not in NON_TRANSLATABLE_KEYWORDS: - is_inline_icon = False - break - elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES): - is_inline_icon = False - break - - if is_inline_icon: - # return something like: [[File:smiley.png|alt=🙂]] - returnline = f'[[' + '|'.join(tokens) + ']]' - return returnline, double_brackets_types.inline_icon - - ############################ - # Managing general files - ############################# - - output_parts = [] - - # The first token is the file name (e.g., "File:Example.jpg") - # We substitute any occurrences of "Image:" with "File:" - output_parts.append(tokens[0]) - - pixel_regex = re.compile(r'\d+(?:x\d+)?px') # Matches pixel values like "100px" or "100x50px)" - for token in tokens[1:]: - # Check for 'alt=' - if token.startswith('alt='): - alt_text = token[len('alt='):].strip() - output_parts.append('alt='+_wrap_in_translate(alt_text)) - # Check if the token is a known non-translatable keyword - elif token in NON_TRANSLATABLE_KEYWORDS: - output_parts.append(token) - # If the token starts 
with a known non-translatable prefix, keep it as is - elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES): - output_parts.append(token) - # If the token is a pixel value, keep it as is - elif pixel_regex.match(token): - output_parts.append(token) - # Otherwise, assume it's a caption or other translatable text - else: - output_parts.append(f"{token}") - - # Reconstruct the line with the transformed parts - returnline = '[[' + '|'.join(output_parts) + ']]' - return returnline, double_brackets_types.not_inline_icon_file - -def process_double_brackets(text, tvar_id=0): - """ - Processes internal links in the wikitext. - It wraps the content in tags. - """ - if not (text.startswith("[[") and text.endswith("]]")) : - print(f"Input >{text}< must be wrapped in double brackets [[ ]]") - sys.exit(1) - # Split the link into parts, handling both internal links and links with display text - - inner_wl = text[2:-2] # Remove the leading [[ and trailing ]] - parts = inner_wl.split('|') - - # part 0 - category_aliases = ['Category:', 'category:', 'Cat:', 'cat:'] - file_aliases = ['File:', 'file:', 'Image:', 'image:'] - - parts[0] = parts[0].strip() # Clean up the first part - # Check if the first part is a category or file alias - if parts[0].startswith(tuple(category_aliases)): - # Handle category links - cat_name = parts[0].split(':', 1)[1] if ':' in parts[0] else parts[0] - return f'[[Category:{cat_name}{{{{#translation:}}}}]]', double_brackets_types.category - elif parts[0].startswith(tuple(file_aliases)): - # Handle file links - return _process_file(text) - elif parts[0].startswith('Special:'): - # Handle special pages - return f'[[{parts[0]}]]', double_brackets_types.special - - # Assuming it's a regular internal link - if len(parts) == 1: - return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[0]}]]', double_brackets_types.wikilink - if len(parts) == 2 : - return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[1]}]]', double_brackets_types.wikilink - return text - -def process_external_link(text, tvar_url_id=0): - """ - Processes external links in the format [http://example.com Description] and ensures - that only the description part is wrapped in tags, leaving the URL untouched. - """ - match = re.match(r'\[(https?://[^\s]+)\s+([^\]]+)\]', text) - - if match: - url_part = match.group(1) - description_part = match.group(2) - # Wrap only the description part in tags, leave the URL untouched - return f'[{url_part} {description_part}]' - return text - -def process_template(text): - """ - Processes the text to ensure that only the content outside of double curly braces {{ ... }} is wrapped in tags, - while preserving the template content inside the braces without translating it. - """ - assert(text.startswith('{{') and text.endswith('}}')), "Invalid template tag" - # Split the template content from the rest of the text - inner_content = text[2:-2].strip() # Remove the leading {{ and trailing }} - inner_content = capitalise_first_letter(inner_content) # Capitalise the first letter of the inner content - - # If the inner content is empty, return an empty string - if not inner_content : - return text - - # Wrap the inner content in tags - return '{{' + inner_content + '}}' - -def process_raw_url(text): - """ - Processes raw URLs in the wikitext. - It wraps the URL in tags. - """ - # This function assumes the text is a raw URL, e.g., "http://example.com" - # and wraps it in tags. 
- if not text.strip(): - return text - return text.strip() - - -# --- Main Tokenisation Logic --- - -def convert_to_translatable_wikitext(wikitext): - """ - Converts standard wikitext to translatable wikitext by wrapping - translatable text with tags, while preserving and - correctly handling special wikitext elements. - This function tokenizes the entire text, not line by line. - """ - if not wikitext: - return "" - - # add an extra newline at the beginning, useful to process items at the beginning of the text - wikitext = '\n' + wikitext - - parts = [] - last = 0 - curr = 0 - text_length = len(wikitext) - - while curr < text_length : - found = None - # Syntax highlight block - pattern = '', curr) + len('') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_syntax_highlight)) - curr = end_pos - last = curr - continue - # Table block - pattern = '{|' - if wikitext.startswith(pattern, curr): - end_pattern = wikitext.find('|}', curr) + len('|}') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_table)) - curr = end_pattern - last = curr - continue - # Blockquote - pattern = '
<blockquote>' - if wikitext.startswith(pattern, curr): - end_pattern = wikitext.find('</blockquote>
', curr) + len('') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_blockquote)) - curr = end_pattern - last = curr - continue - # Poem tag - pattern = '', curr) + len('') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_poem_tag)) - curr = end_pattern - last = curr - continue - # Code tag - pattern = '', curr) + len('
') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_code_tag)) - curr = end_pattern - last = curr - continue - # Div tag - pattern = '', curr) + len('
') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_div)) - curr = end_pattern - last = curr - continue - # Hiero tag - pattern = '' - if wikitext.startswith(pattern, curr): - end_pattern = wikitext.find('', curr) + len('') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_hiero)) - curr = end_pattern - last = curr - continue - # Sub tag - pattern = '' - if wikitext.startswith(pattern, curr): - end_pattern = wikitext.find('', curr) + len('') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_sub_sup)) - curr = end_pattern - last = curr - continue - # Sup tag - pattern = '' - if wikitext.startswith(pattern, curr): - end_pattern = wikitext.find('', curr) + len('') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_sub_sup)) - curr = end_pattern - last = curr - continue - # Math tag - pattern = '' - if wikitext.startswith(pattern, curr): - end_pattern = wikitext.find('', curr) + len('') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_math)) - curr = end_pattern - last = curr - continue - # Small tag - pattern = '' - if wikitext.startswith(pattern, curr): - end_pattern = wikitext.find('', curr) + len('') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_small_tag)) - curr = end_pattern - last = curr - continue - # Nowiki tag - pattern = '' - if wikitext.startswith(pattern, curr): - end_pattern = wikitext.find('', curr) + len('') - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], process_nowiki)) - curr = end_pattern - last = curr - continue - # br tag - patterns = ['
<br>', '
<br/>', '<br />
'] - for p in patterns: - if wikitext.startswith(p, curr): - end_pattern = curr + len(p) - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pattern], lambda x: x)) - curr = end_pattern - last = curr - found = True - break - if found: - continue - # Lists - patterns_newline = ['\n*', '\n#', '\n:', '\n;'] - if any(wikitext.startswith(p, curr) for p in patterns_newline) : - curr += 1 # Discard the newline character - parts.append((wikitext[last:curr], _wrap_in_translate)) - # Iterate through the list items - patterns = ['*', '#', ':', ';'] - while any(wikitext.startswith(p, curr) for p in patterns) : - end_pattern = wikitext.find('\n', curr) - if end_pattern == -1: - end_pattern = text_length - else : - end_pattern += 1 # Include the newline in the part - parts.append((wikitext[curr:end_pattern], process_item)) - curr = end_pattern - last = curr - continue - # Internal links - pattern = '[[' - if wikitext.startswith(pattern, curr): - # Count the number of opening double brackets '[[' and closing ']]' to find the end - end_pos = curr + 2 - bracket_count = 1 - while end_pos < text_length and bracket_count > 0: - if wikitext.startswith('[[', end_pos): - bracket_count += 1 - end_pos += 2 - elif wikitext.startswith(']]', end_pos): - bracket_count -= 1 - end_pos += 2 - else: - end_pos += 1 - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - if end_pos > curr + 2: # Ensure we have a valid link - parts.append((wikitext[curr:end_pos], process_double_brackets)) - curr = end_pos - last = curr - continue - # External links - pattern = '[http' - if wikitext.startswith(pattern, curr): - # Find the end of the external link - end_pos = wikitext.find(']', curr) - if end_pos == -1: - end_pos = text_length - else : - end_pos += 1 # Include the closing ']' in the part - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pos + 1], process_external_link)) - curr = end_pos - last = curr - continue - # Templates - pattern = '{{' - if wikitext.startswith(pattern, curr): - # Find the end of the template - end_pos = wikitext.find('}}', curr) + 2 - if end_pos == 1: - end_pos = text_length - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pos], process_template)) - curr = end_pos - last = curr - continue - # Raw URLs - pattern = 'http' - if wikitext.startswith(pattern, curr): - # Find the end of the URL (space or end of string) - end_pos = wikitext.find(' ', curr) - if end_pos == -1: - end_pos = text_length - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pos], process_raw_url)) - curr = end_pos - last = curr - continue - # Behaviour switches - for switch in behaviour_switches: - if wikitext.startswith(switch, curr): - end_pos = curr + len(switch) - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((wikitext[curr:end_pos], lambda x: x)) - curr = end_pos - last = curr - - - curr += 1 # Move to the next character if no pattern matched - - # Add any remaining text after the last processed part - if last < text_length: - parts.append((wikitext[last:], _wrap_in_translate)) - - """ - print ('*' * 20) - for i, (part, handler) in enumerate(parts): - print(f"--- Start element {i} with handler {handler.__name__} ---") - print(part) - print(f"---\n") - - print ('*' * 20) - """ - - # Process links - tvar_id = 0 - tvar_url_id = 0 - 
tvar_code_id = 0 - tvar_inline_icon_id = 0 - for i, (part, handler) in enumerate(parts): - # Handlers for links require a tvar_id - if handler == process_double_brackets: - new_part, double_brackets_type = handler(part, tvar_id) - if double_brackets_type in [double_brackets_types.wikilink, double_brackets_types.special, double_brackets_types.inline_icon]: - new_handler = _wrap_in_translate # Change handler to _wrap_in_translate - else : - new_handler = lambda x: x # No further processing for categories and files - parts[i] = (new_part, new_handler) - tvar_id += 1 - elif handler == process_external_link: - new_part = handler(part, tvar_url_id) - new_handler = _wrap_in_translate # Change handler to _wrap_in_translate - parts[i] = (new_part, new_handler) - tvar_url_id += 1 - elif handler == process_code_tag: - new_part = handler(part, tvar_code_id) - new_handler = _wrap_in_translate # Change handler to _wrap_in_translate - parts[i] = (new_part, new_handler) - tvar_code_id += 1 - elif handler == process_double_brackets : - new_part, double_brackets_type = handler(part, tvar_inline_icon_id) - if double_brackets_type == double_brackets_types.inline_icon: - new_handler = _wrap_in_translate # Change handler to _wrap_in_translate - tvar_inline_icon_id += 1 - else: - new_handler = lambda x: x - - # Scan again the parts: merge consecutive parts handled by _wrap_in_translate - _parts = [] - if parts: - current_part, current_handler = parts[0] - for part, handler in parts[1:]: - if handler == _wrap_in_translate and current_handler == _wrap_in_translate: - # Merge the parts - current_part += part - else: - _parts.append((current_part, current_handler)) - current_part, current_handler = part, handler - # Add the last accumulated part - _parts.append((current_part, current_handler)) - - # Process the parts with their respective handlers - processed_parts = [handler(part) for part, handler in _parts] - - # Debug output - """ - print("Processed parts:") - for i, (ppart, (part, handler)) in enumerate(zip(processed_parts, _parts)): - print(f"--- Start element {i} with handler {handler.__name__} ---") - print(part) - print(f"---\n") - print(ppart) - print(f"---\n") - """ - - # Join the processed parts into a single string - return ''.join(processed_parts)[1:] # Remove the leading newline added at the beginning - @app.route('/') def index(): return render_template('home.html') diff --git a/tests.py b/tests.py index 0bb4460..29d0e68 100644 --- a/tests.py +++ b/tests.py @@ -1,5 +1,5 @@ import unittest -from app import convert_to_translatable_wikitext, process_double_brackets +from app import convert_to_translatable_wikitext class TestTranslatableWikitext(unittest.TestCase): diff --git a/wikitranslator.py b/wikitranslator.py new file mode 100644 index 0000000..3ff4997 --- /dev/null +++ b/wikitranslator.py @@ -0,0 +1,790 @@ +import re +from enum import Enum +import sys + +behaviour_switches = ['__NOTOC__', '__FORCETOC__', '__TOC__', '__NOEDITSECTION__', '__NEWSECTIONLINK__', '__NONEWSECTIONLINK__', '__NOGALLERY__', '__HIDDENCAT__', '__EXPECTUNUSEDCATEGORY__', '__NOCONTENTCONVERT__', '__NOCC__', '__NOTITLECONVERT__', '__NOTC__', '__START__', '__END__', '__INDEX__', '__NOINDEX__', '__STATICREDIRECT__', '__EXPECTUNUSEDTEMPLATE__', '__NOGLOBAL__', '__DISAMBIG__', '__EXPECTED_UNCONNECTED_PAGE__', '__ARCHIVEDTALK__', '__NOTALK__', '__EXPECTWITHOUTSCANS__'] + +# --- Helper Functions for Processing Different Wikitext Elements --- +# These functions are designed to handle specific wikitext structures. 
+# Some will recursively call the main `convert_to_translatable_wikitext` +# function to process their internal content, ensuring nested elements +# are also handled correctly. + +def capitalise_first_letter(text): + """ + Capitalises the first letter of the given text. + If the text is empty or consists only of whitespace, it returns the text unchanged. + """ + if not text or not text.strip(): + return text + return text[0].upper() + text[1:] + +def is_emoji_unicode(char): + # This is a very simplified set of common emoji ranges. + # A comprehensive list would be much longer and more complex. + # See https://www.unicode.org/Public/emoji/ for full details. + if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons + return True + if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs + return True + if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols + return True + if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols + return True + if 0x2700 <= ord(char) <= 0x27BF: # Dingbats + return True + # Add more ranges as needed for full coverage + return False + +def _wrap_in_translate(text): + """ + Wraps the given text with tags. + It ensures that empty or whitespace-only strings are not wrapped. + The tags are added around the non-whitespace content, + preserving leading and trailing whitespace. + """ + if not text or not text.strip(): + return text + + # Find the first and last non-whitespace characters + first_char_index = -1 + last_char_index = -1 + for i, char in enumerate(text): + if char not in (' ', '\n', '\t', '\r', '\f', '\v'): # Check for common whitespace characters + if first_char_index == -1: + first_char_index = i + last_char_index = i + + # If no non-whitespace characters are found (should be caught by text.strip() check, but for robustness) + if first_char_index == -1: + return text + + leading_whitespace = text[:first_char_index] + content = text[first_char_index : last_char_index + 1] + trailing_whitespace = text[last_char_index + 1 :] + + return f"{leading_whitespace}{content}{trailing_whitespace}" + +def process_syntax_highlight(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('')), "Invalid syntax highlight tag" + # Get inside the tag + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" + +def process_table(text): + """ + Processes table blocks in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('{|') and text.endswith('|}')), "Invalid table tag" + return text + +def process_blockquote(text): + """ + Processes blockquote tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('
<blockquote>') and text.endswith('</blockquote>
')), "Invalid blockquote tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" + +def process_poem_tag(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('')), "Invalid poem tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" + +def process_code_tag(text, tvar_code_id=0): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('')), "Invalid code tag" + # Get inside the tag + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = f'{content}' + return f"{prefix}{wrapped_content}{suffix}" + +def process_div(text): + """ + Processes
tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('')), "Invalid div tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" + +def process_hiero(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid hiero tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" + +def process_sub_sup(text): + """ + Processes and tags in the wikitext. + It wraps the content in tags. + """ + assert((text.startswith('') and text.endswith('')) or + (text.startswith('') and text.endswith(''))), "Invalid sub/sup tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" + +def process_math(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid math tag" + return text + +def process_small_tag(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid small tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" + +def process_nowiki(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid nowiki tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" + +def process_item(text): + """ + Processes list items in the wikitext. + It wraps the content in tags. 
+ """ + offset = 0 + if text.startswith(';'): + offset = 1 + elif text.startswith(':'): + offset = 1 + elif text.startswith('#'): + while text[offset] == '#': + offset += 1 + elif text.startswith('*'): + while text[offset] == '*': + offset += 1 + # Add translate tags around the item content + item_content = text[offset:].strip() + if not item_content: + return text + return text[:offset] + ' ' + _wrap_in_translate(item_content) + '\n' + +class double_brackets_types(Enum): + wikilink = 1 + category = 2 + inline_icon = 3 + not_inline_icon_file = 4 + special = 5 + invalid_file = 6 + +def _process_file(s, tvar_inline_icon_id=0): + # Define keywords that should NOT be translated when found as parameters + NON_TRANSLATABLE_KEYWORDS = { + 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', + 'upright', 'baseline', 'middle', 'sub', 'super', 'text-top', 'text-bottom', '{{dirstart}}', '{{dirend}}' + } + NON_TRANSLATABLE_KEYWORDS_PREFIXES = { + 'link=', 'upright=', 'alt=' + } + NOT_INLINE_KEYWORDS = { + 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', '{{dirstart}}', '{{dirend}}' + } + file_aliases = ['File:', 'file:', 'Image:', 'image:'] + + tokens = [] + + inner_content = s[2:-2] # Remove the leading [[ and trailing ]] + tokens = inner_content.split('|') + tokens = [token.strip() for token in tokens] # Clean up whitespace around tokens + + # The first token shall start with a file alias + # e.g., "File:Example.jpg" or "Image:Example.png" + if not tokens or not tokens[0].startswith(tuple(file_aliases)): + return line, double_brackets_types.invalid_file + + # The first token is a file link + filename = tokens[0].split(':', 1)[1] if ':' in tokens[0] else tokens[0] + tokens[0] = f'File:{filename}' + + # Substitute 'left' with {{dirstart}} + while 'left' in tokens: + tokens[tokens.index('left')] = '{{dirstart}}' + # Substitute 'right' with {{dirend}} + while 'right' in tokens: + tokens[tokens.index('right')] = '{{dirend}}' + + ############################ + # Managing inline icons + ############################# + is_inline_icon = True + for token in tokens: + if token in NOT_INLINE_KEYWORDS: + is_inline_icon = False + break + if is_inline_icon : + # Check if it contains 'alt=' followed by an emoji + for token in tokens[1:]: + if token.startswith('alt='): + alt_text = token[len('alt='):].strip() + if not any(is_emoji_unicode(char) for char in alt_text): + is_inline_icon = False + break + elif token not in NON_TRANSLATABLE_KEYWORDS: + is_inline_icon = False + break + elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES): + is_inline_icon = False + break + + if is_inline_icon: + # return something like: [[File:smiley.png|alt=🙂]] + returnline = f'[[' + '|'.join(tokens) + ']]' + return returnline, double_brackets_types.inline_icon + + ############################ + # Managing general files + ############################# + + output_parts = [] + + # The first token is the file name (e.g., "File:Example.jpg") + # We substitute any occurrences of "Image:" with "File:" + output_parts.append(tokens[0]) + + pixel_regex = re.compile(r'\d+(?:x\d+)?px') # Matches pixel values like "100px" or "100x50px)" + for token in tokens[1:]: + # Check for 'alt=' + if token.startswith('alt='): + alt_text = token[len('alt='):].strip() + output_parts.append('alt='+_wrap_in_translate(alt_text)) + # Check if the token is a known non-translatable keyword + elif token in NON_TRANSLATABLE_KEYWORDS: + output_parts.append(token) + # If the token starts 
with a known non-translatable prefix, keep it as is + elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES): + output_parts.append(token) + # If the token is a pixel value, keep it as is + elif pixel_regex.match(token): + output_parts.append(token) + # Otherwise, assume it's a caption or other translatable text + else: + output_parts.append(f"{token}") + + # Reconstruct the line with the transformed parts + returnline = '[[' + '|'.join(output_parts) + ']]' + return returnline, double_brackets_types.not_inline_icon_file + +def process_double_brackets(text, tvar_id=0): + """ + Processes internal links in the wikitext. + It wraps the content in tags. + """ + if not (text.startswith("[[") and text.endswith("]]")) : + print(f"Input >{text}< must be wrapped in double brackets [[ ]]") + sys.exit(1) + # Split the link into parts, handling both internal links and links with display text + + inner_wl = text[2:-2] # Remove the leading [[ and trailing ]] + parts = inner_wl.split('|') + + # part 0 + category_aliases = ['Category:', 'category:', 'Cat:', 'cat:'] + file_aliases = ['File:', 'file:', 'Image:', 'image:'] + + parts[0] = parts[0].strip() # Clean up the first part + # Check if the first part is a category or file alias + if parts[0].startswith(tuple(category_aliases)): + # Handle category links + cat_name = parts[0].split(':', 1)[1] if ':' in parts[0] else parts[0] + return f'[[Category:{cat_name}{{{{#translation:}}}}]]', double_brackets_types.category + elif parts[0].startswith(tuple(file_aliases)): + # Handle file links + return _process_file(text) + elif parts[0].startswith('Special:'): + # Handle special pages + return f'[[{parts[0]}]]', double_brackets_types.special + + # Assuming it's a regular internal link + if len(parts) == 1: + return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[0]}]]', double_brackets_types.wikilink + if len(parts) == 2 : + return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[1]}]]', double_brackets_types.wikilink + return text + +def process_external_link(text, tvar_url_id=0): + """ + Processes external links in the format [http://example.com Description] and ensures + that only the description part is wrapped in tags, leaving the URL untouched. + """ + match = re.match(r'\[(https?://[^\s]+)\s+([^\]]+)\]', text) + + if match: + url_part = match.group(1) + description_part = match.group(2) + # Wrap only the description part in tags, leave the URL untouched + return f'[{url_part} {description_part}]' + return text + +def process_template(text): + """ + Processes the text to ensure that only the content outside of double curly braces {{ ... }} is wrapped in tags, + while preserving the template content inside the braces without translating it. + """ + assert(text.startswith('{{') and text.endswith('}}')), "Invalid template tag" + # Split the template content from the rest of the text + inner_content = text[2:-2].strip() # Remove the leading {{ and trailing }} + inner_content = capitalise_first_letter(inner_content) # Capitalise the first letter of the inner content + + # If the inner content is empty, return an empty string + if not inner_content : + return text + + # Wrap the inner content in tags + return '{{' + inner_content + '}}' + +def process_raw_url(text): + """ + Processes raw URLs in the wikitext. + It wraps the URL in tags. + """ + # This function assumes the text is a raw URL, e.g., "http://example.com" + # and wraps it in tags. 
+ if not text.strip(): + return text + return text.strip() + + +# --- Main Tokenisation Logic --- + +def convert_to_translatable_wikitext(wikitext): + """ + Converts standard wikitext to translatable wikitext by wrapping + translatable text with tags, while preserving and + correctly handling special wikitext elements. + This function tokenizes the entire text, not line by line. + """ + if not wikitext: + return "" + + # add an extra newline at the beginning, useful to process items at the beginning of the text + wikitext = '\n' + wikitext + + parts = [] + last = 0 + curr = 0 + text_length = len(wikitext) + + while curr < text_length : + found = None + # Syntax highlight block + pattern = '', curr) + len('') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_syntax_highlight)) + curr = end_pos + last = curr + continue + # Table block + pattern = '{|' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('|}', curr) + len('|}') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_table)) + curr = end_pattern + last = curr + continue + # Blockquote + pattern = '
<blockquote>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</blockquote>
', curr) + len('') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_blockquote)) + curr = end_pattern + last = curr + continue + # Poem tag + pattern = '', curr) + len('') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_poem_tag)) + curr = end_pattern + last = curr + continue + # Code tag + pattern = '', curr) + len('
') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_code_tag)) + curr = end_pattern + last = curr + continue + # Div tag + pattern = '', curr) + len('
') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_div)) + curr = end_pattern + last = curr + continue + # Hiero tag + pattern = '' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('', curr) + len('') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_hiero)) + curr = end_pattern + last = curr + continue + # Sub tag + pattern = '' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('', curr) + len('') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_sub_sup)) + curr = end_pattern + last = curr + continue + # Sup tag + pattern = '' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('', curr) + len('') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_sub_sup)) + curr = end_pattern + last = curr + continue + # Math tag + pattern = '' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('', curr) + len('') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_math)) + curr = end_pattern + last = curr + continue + # Small tag + pattern = '' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('', curr) + len('') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_small_tag)) + curr = end_pattern + last = curr + continue + # Nowiki tag + pattern = '' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('', curr) + len('') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_nowiki)) + curr = end_pattern + last = curr + continue + # br tag + patterns = ['
<br>', '
<br/>', '<br />
'] + for p in patterns: + if wikitext.startswith(p, curr): + end_pattern = curr + len(p) + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], lambda x: x)) + curr = end_pattern + last = curr + found = True + break + if found: + continue + # Lists + patterns_newline = ['\n*', '\n#', '\n:', '\n;'] + if any(wikitext.startswith(p, curr) for p in patterns_newline) : + curr += 1 # Discard the newline character + parts.append((wikitext[last:curr], _wrap_in_translate)) + # Iterate through the list items + patterns = ['*', '#', ':', ';'] + while any(wikitext.startswith(p, curr) for p in patterns) : + end_pattern = wikitext.find('\n', curr) + if end_pattern == -1: + end_pattern = text_length + else : + end_pattern += 1 # Include the newline in the part + parts.append((wikitext[curr:end_pattern], process_item)) + curr = end_pattern + last = curr + continue + # Internal links + pattern = '[[' + if wikitext.startswith(pattern, curr): + # Count the number of opening double brackets '[[' and closing ']]' to find the end + end_pos = curr + 2 + bracket_count = 1 + while end_pos < text_length and bracket_count > 0: + if wikitext.startswith('[[', end_pos): + bracket_count += 1 + end_pos += 2 + elif wikitext.startswith(']]', end_pos): + bracket_count -= 1 + end_pos += 2 + else: + end_pos += 1 + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + if end_pos > curr + 2: # Ensure we have a valid link + parts.append((wikitext[curr:end_pos], process_double_brackets)) + curr = end_pos + last = curr + continue + # External links + pattern = '[http' + if wikitext.startswith(pattern, curr): + # Find the end of the external link + end_pos = wikitext.find(']', curr) + if end_pos == -1: + end_pos = text_length + else : + end_pos += 1 # Include the closing ']' in the part + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos + 1], process_external_link)) + curr = end_pos + last = curr + continue + # Templates + pattern = '{{' + if wikitext.startswith(pattern, curr): + # Find the end of the template + end_pos = wikitext.find('}}', curr) + 2 + if end_pos == 1: + end_pos = text_length + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos], process_template)) + curr = end_pos + last = curr + continue + # Raw URLs + pattern = 'http' + if wikitext.startswith(pattern, curr): + # Find the end of the URL (space or end of string) + end_pos = wikitext.find(' ', curr) + if end_pos == -1: + end_pos = text_length + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos], process_raw_url)) + curr = end_pos + last = curr + continue + # Behaviour switches + for switch in behaviour_switches: + if wikitext.startswith(switch, curr): + end_pos = curr + len(switch) + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos], lambda x: x)) + curr = end_pos + last = curr + + + curr += 1 # Move to the next character if no pattern matched + + # Add any remaining text after the last processed part + if last < text_length: + parts.append((wikitext[last:], _wrap_in_translate)) + + """ + print ('*' * 20) + for i, (part, handler) in enumerate(parts): + print(f"--- Start element {i} with handler {handler.__name__} ---") + print(part) + print(f"---\n") + + print ('*' * 20) + """ + + # Process links + tvar_id = 0 + tvar_url_id = 0 + 
tvar_code_id = 0 + tvar_inline_icon_id = 0 + for i, (part, handler) in enumerate(parts): + # Handlers for links require a tvar_id + if handler == process_double_brackets: + new_part, double_brackets_type = handler(part, tvar_id) + if double_brackets_type in [double_brackets_types.wikilink, double_brackets_types.special, double_brackets_types.inline_icon]: + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + else : + new_handler = lambda x: x # No further processing for categories and files + parts[i] = (new_part, new_handler) + tvar_id += 1 + elif handler == process_external_link: + new_part = handler(part, tvar_url_id) + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + parts[i] = (new_part, new_handler) + tvar_url_id += 1 + elif handler == process_code_tag: + new_part = handler(part, tvar_code_id) + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + parts[i] = (new_part, new_handler) + tvar_code_id += 1 + elif handler == process_double_brackets : + new_part, double_brackets_type = handler(part, tvar_inline_icon_id) + if double_brackets_type == double_brackets_types.inline_icon: + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + tvar_inline_icon_id += 1 + else: + new_handler = lambda x: x + + # Scan again the parts: merge consecutive parts handled by _wrap_in_translate + _parts = [] + if parts: + current_part, current_handler = parts[0] + for part, handler in parts[1:]: + if handler == _wrap_in_translate and current_handler == _wrap_in_translate: + # Merge the parts + current_part += part + else: + _parts.append((current_part, current_handler)) + current_part, current_handler = part, handler + # Add the last accumulated part + _parts.append((current_part, current_handler)) + + # Process the parts with their respective handlers + processed_parts = [handler(part) for part, handler in _parts] + + # Debug output + """ + print("Processed parts:") + for i, (ppart, (part, handler)) in enumerate(zip(processed_parts, _parts)): + print(f"--- Start element {i} with handler {handler.__name__} ---") + print(part) + print(f"---\n") + print(ppart) + print(f"---\n") + """ + + # Join the processed parts into a single string + return ''.join(processed_parts)[1:] # Remove the leading newline added at the beginning \ No newline at end of file From 37195aa7c5a72dedc30be004a2062c939c2f5bcf Mon Sep 17 00:00:00 2001 From: super-nabla Date: Sun, 14 Dec 2025 22:31:05 +0100 Subject: [PATCH 3/8] change project structure --- pyproject.toml | 12 ++ translatable_wikitext_converter/__init__.py | 0 .../app.py | 2 +- translatable_wikitext_converter/butta.py | 113 +++++++++++ .../templates}/home.html | 0 .../tests.py | 7 +- .../wikitranslator.py | 74 ++------ .../wikitranslator_utils.py | 177 ++++++++++++++++++ 8 files changed, 318 insertions(+), 67 deletions(-) create mode 100644 pyproject.toml create mode 100644 translatable_wikitext_converter/__init__.py rename app.py => translatable_wikitext_converter/app.py (96%) create mode 100644 translatable_wikitext_converter/butta.py rename {templates => translatable_wikitext_converter/templates}/home.html (100%) rename tests.py => translatable_wikitext_converter/tests.py (96%) rename wikitranslator.py => translatable_wikitext_converter/wikitranslator.py (92%) create mode 100644 translatable_wikitext_converter/wikitranslator_utils.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..397eacb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,12 @@ 
+[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + +[project] +name = "translatable-wikitext-converter" +version = "0.1.0" +description = "Convert wikitext into translatable wikitext" +authors = [ + { name = "Gopa Vasanth" } +] +requires-python = ">=3.9" diff --git a/translatable_wikitext_converter/__init__.py b/translatable_wikitext_converter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app.py b/translatable_wikitext_converter/app.py similarity index 96% rename from app.py rename to translatable_wikitext_converter/app.py index af70954..357aee5 100644 --- a/app.py +++ b/translatable_wikitext_converter/app.py @@ -1,7 +1,7 @@ from flask import Flask, request, render_template, jsonify from flask_cors import CORS # Import flask-cors -from wikitranslator import convert_to_translatable_wikitext +from .wikitranslator import convert_to_translatable_wikitext app = Flask(__name__) CORS(app) # Enable CORS for all routes diff --git a/translatable_wikitext_converter/butta.py b/translatable_wikitext_converter/butta.py new file mode 100644 index 0000000..6cd3995 --- /dev/null +++ b/translatable_wikitext_converter/butta.py @@ -0,0 +1,113 @@ +import re + +def fix_section_title_spacing_internal(title: str) -> str: + """ + Detects a section title and ensures there is exactly one space + between the '=' characters and the title text. + """ + # Pattern: (={2,}) [optional space] (.+?) [optional space] \1 + pattern = re.compile(r'(={2,})\s*(.+?)\s*\1', re.DOTALL) + + # Replacement: \1 [space] \2 [space] \1 + return pattern.sub(r'\1 \2 \1', title) + + + +# --- Main Function to Fix Wiki Page Spacing --- + +def fix_wiki_page_spacing(wiki_text: str) -> str: + """ + Applies the section title spacing fix and enforces consistent newlines + before (one blank line: \n\n) and after (one blank line: \n\n) + every section heading (Level 2 or higher). + + This method guarantees the output format: + ...[Content]\n\n== Title ==\n\n[Next content]... + + :param wiki_text: The full text of the wiki page. + :return: The corrected wiki page text. + """ + + # Pattern to match and replace a heading and its surrounding whitespace: + # 1. (.*?) : Group 1: Non-greedy capture of all content before the heading. + # 2. [\r\n\s]* : Non-capturing group for all existing whitespace/newlines before the heading. + # 3. (^={2,}.*?={2,}$) : Group 2: The actual heading line, anchored to the start of a line (re.M). + # 4. [\r\n\s]* : Non-capturing group for all existing whitespace/newlines after the heading. + + # We use re.M (multiline) and re.DOTALL (dot matches newline) + heading_and_surroundings_pattern = re.compile( + r'(.*?)[\r\n\s]*(^={2,}.*?={2,}$)[\r\n\s]*', re.M | re.DOTALL + ) + + def heading_replacer_full_format(match): + """ + Callback function for re.sub that fixes spacing and enforces \n\n separation. + """ + # Group 1: Content preceding the heading + content_before = match.group(1).rstrip() + # Group 2: The raw heading line + raw_heading = match.group(2) + + # 1. Fix the internal spacing of the heading + corrected_heading = fix_section_title_spacing_internal(raw_heading) + + # 2. Determine the prefix separator: \n\n + # If the heading is the first thing on the page (i.e., content_before is empty), + # we don't want to prepend \n\n. Otherwise, we do. + if content_before: + prefix = '\n\n' + else: + prefix = '' + + # 3. 
The replacement structure: + # {Content Before}{Prefix}\n{Corrected Heading}\n\n + # The content that follows this match will immediately follow the final \n\n. + return f'{content_before}{prefix}{corrected_heading}\n\n' + + # Apply the fix globally + corrected_text = heading_and_surroundings_pattern.sub( + heading_replacer_full_format, + wiki_text + ) + + # Clean up any residual excess newlines at the very beginning of the page + return corrected_text.lstrip('\r\n') + + +def main(): + """Hard-coded wiki page text for testing and debugging.""" + + # Text demonstrates various input issues: + # 1. Title 1: No internal space, no newline after content. (Needs \n\n before and after) + # 2. Title 2: Too much internal space, one newline after content. + # 3. Title 3: Correct internal space, three newlines after content. + # 4. Title 4: Starts immediately after content (missing newline before). + + raw_wiki_page_text = ( + "This is the header text.\n" + "This is the last line of the header.\n" # Content before first heading + "==Topic1==\n\n\n" # Missing \n before, too many \n after + "Content for topic 1.\n" + "Content continues...\n" + "=== Topic2 ===\n" # Missing \n before, one \n after + "Content for topic 2.\n" + "== Topic3 ==\n\n\n" + "Content for topic 3. Correct space, too many \n after.\n" + "Some more content.\n" + "====Topic4====\n" # Missing \n before, missing \n after + "Final content." + ) + + print("--- Original Wiki Page Text ---\n") + print(raw_wiki_page_text) + print("\n" + "="*60 + "\n") + + corrected_text = fix_wiki_page_spacing(raw_wiki_page_text) + + print("--- Corrected Wiki Page Text (Enforcing: \n\n== Title ==\n\n) ---\n") + print(corrected_text) + print("\n" + "="*60 + "\n") + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/templates/home.html b/translatable_wikitext_converter/templates/home.html similarity index 100% rename from templates/home.html rename to translatable_wikitext_converter/templates/home.html diff --git a/tests.py b/translatable_wikitext_converter/tests.py similarity index 96% rename from tests.py rename to translatable_wikitext_converter/tests.py index 29d0e68..b7d079a 100644 --- a/tests.py +++ b/translatable_wikitext_converter/tests.py @@ -1,12 +1,13 @@ import unittest -from app import convert_to_translatable_wikitext + +from translatable_wikitext_converter.app import convert_to_translatable_wikitext class TestTranslatableWikitext(unittest.TestCase): def test_section_headers(self): self.assertEqual( convert_to_translatable_wikitext("==HELLO=="), - "==HELLO==" # Removed the \n\n that was expected + "== HELLO ==" # Removed the \n\n that was expected ) def test_file_tag_translations(self): @@ -46,7 +47,7 @@ def test_simple_internal_link(self): def test_multiline_text(self): self.assertEqual( convert_to_translatable_wikitext('\nhi iam charan\n
\nhappy\n\n'), - '\nhi iam charan\n
\nhappy\n\n' + 'hi iam charan\n
\nhappy\n\n' ) def test_double_namespace_processing(self): diff --git a/wikitranslator.py b/translatable_wikitext_converter/wikitranslator.py similarity index 92% rename from wikitranslator.py rename to translatable_wikitext_converter/wikitranslator.py index 3ff4997..80e57e7 100644 --- a/wikitranslator.py +++ b/translatable_wikitext_converter/wikitranslator.py @@ -1,69 +1,14 @@ -import re from enum import Enum -import sys +import re, sys -behaviour_switches = ['__NOTOC__', '__FORCETOC__', '__TOC__', '__NOEDITSECTION__', '__NEWSECTIONLINK__', '__NONEWSECTIONLINK__', '__NOGALLERY__', '__HIDDENCAT__', '__EXPECTUNUSEDCATEGORY__', '__NOCONTENTCONVERT__', '__NOCC__', '__NOTITLECONVERT__', '__NOTC__', '__START__', '__END__', '__INDEX__', '__NOINDEX__', '__STATICREDIRECT__', '__EXPECTUNUSEDTEMPLATE__', '__NOGLOBAL__', '__DISAMBIG__', '__EXPECTED_UNCONNECTED_PAGE__', '__ARCHIVEDTALK__', '__NOTALK__', '__EXPECTWITHOUTSCANS__'] - -# --- Helper Functions for Processing Different Wikitext Elements --- -# These functions are designed to handle specific wikitext structures. -# Some will recursively call the main `convert_to_translatable_wikitext` -# function to process their internal content, ensuring nested elements -# are also handled correctly. +from .wikitranslator_utils import ( + capitalise_first_letter, + is_emoji_unicode, + fix_wiki_page_spacing, + _wrap_in_translate +) -def capitalise_first_letter(text): - """ - Capitalises the first letter of the given text. - If the text is empty or consists only of whitespace, it returns the text unchanged. - """ - if not text or not text.strip(): - return text - return text[0].upper() + text[1:] - -def is_emoji_unicode(char): - # This is a very simplified set of common emoji ranges. - # A comprehensive list would be much longer and more complex. - # See https://www.unicode.org/Public/emoji/ for full details. - if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons - return True - if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs - return True - if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols - return True - if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols - return True - if 0x2700 <= ord(char) <= 0x27BF: # Dingbats - return True - # Add more ranges as needed for full coverage - return False - -def _wrap_in_translate(text): - """ - Wraps the given text with tags. - It ensures that empty or whitespace-only strings are not wrapped. - The tags are added around the non-whitespace content, - preserving leading and trailing whitespace. 
- """ - if not text or not text.strip(): - return text - - # Find the first and last non-whitespace characters - first_char_index = -1 - last_char_index = -1 - for i, char in enumerate(text): - if char not in (' ', '\n', '\t', '\r', '\f', '\v'): # Check for common whitespace characters - if first_char_index == -1: - first_char_index = i - last_char_index = i - - # If no non-whitespace characters are found (should be caught by text.strip() check, but for robustness) - if first_char_index == -1: - return text - - leading_whitespace = text[:first_char_index] - content = text[first_char_index : last_char_index + 1] - trailing_whitespace = text[last_char_index + 1 :] - - return f"{leading_whitespace}{content}{trailing_whitespace}" +behaviour_switches = ['__NOTOC__', '__FORCETOC__', '__TOC__', '__NOEDITSECTION__', '__NEWSECTIONLINK__', '__NONEWSECTIONLINK__', '__NOGALLERY__', '__HIDDENCAT__', '__EXPECTUNUSEDCATEGORY__', '__NOCONTENTCONVERT__', '__NOCC__', '__NOTITLECONVERT__', '__NOTC__', '__START__', '__END__', '__INDEX__', '__NOINDEX__', '__STATICREDIRECT__', '__EXPECTUNUSEDTEMPLATE__', '__NOGLOBAL__', '__DISAMBIG__', '__EXPECTED_UNCONNECTED_PAGE__', '__ARCHIVEDTALK__', '__NOTALK__', '__EXPECTWITHOUTSCANS__'] def process_syntax_highlight(text): """ @@ -474,6 +419,9 @@ def convert_to_translatable_wikitext(wikitext): if not wikitext: return "" + wikitext = wikitext.replace('\r\n', '\n').replace('\r', '\n') + wikitext = fix_wiki_page_spacing(wikitext) + # add an extra newline at the beginning, useful to process items at the beginning of the text wikitext = '\n' + wikitext diff --git a/translatable_wikitext_converter/wikitranslator_utils.py b/translatable_wikitext_converter/wikitranslator_utils.py new file mode 100644 index 0000000..fc70c96 --- /dev/null +++ b/translatable_wikitext_converter/wikitranslator_utils.py @@ -0,0 +1,177 @@ +# --- Utility Functions for Wikitext Conversion --- +# This module contains helper functions that are used across the +# wikitext conversion process. These functions handle tasks such as +# capitalising text, checking for emojis, and wrapping text in +# translation tags. + +import re, sys + +# --- Helper Functions for Processing Different Wikitext Elements --- +# These functions are designed to handle specific wikitext structures. +# Some will recursively call the main `convert_to_translatable_wikitext` +# function to process their internal content, ensuring nested elements +# are also handled correctly. + +def capitalise_first_letter(text): + """ + Capitalises the first letter of the given text. + If the text is empty or consists only of whitespace, it returns the text unchanged. + """ + if not text or not text.strip(): + return text + return text[0].upper() + text[1:] + +def is_emoji_unicode(char): + # This is a very simplified set of common emoji ranges. + # A comprehensive list would be much longer and more complex. + # See https://www.unicode.org/Public/emoji/ for full details. + if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons + return True + if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs + return True + if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols + return True + if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols + return True + if 0x2700 <= ord(char) <= 0x27BF: # Dingbats + return True + # Add more ranges as needed for full coverage + return False + +def _wrap_in_translate(text): + """ + Wraps the given text with tags, preserving leading/trailing whitespace. 
+ """ + if not text or not text.strip(): + return text + + # Logic for finding non-whitespace content (as defined in your current code) + first_char_index = -1 + last_char_index = -1 + for i, char in enumerate(text): + if char not in (' ', '\n', '\t', '\r', '\f', '\v'): + if first_char_index == -1: + first_char_index = i + last_char_index = i + + if first_char_index == -1: + return text + + leading_whitespace = text[:first_char_index] + content = text[first_char_index : last_char_index + 1] + trailing_whitespace = text[last_char_index + 1 :] + + return f"{leading_whitespace}{content}{trailing_whitespace}" + + +############################################ +# Functions for Fixing Wiki Page Spacing # +############################################ + +def fix_section_title_spacing_internal(title: str) -> str: + """ + Detects a section title and ensures there is exactly one space + between the '=' characters and the title text. + """ + # Pattern: (={2,}) [optional space] (.+?) [optional space] \1 + pattern = re.compile(r'(={2,})\s*(.+?)\s*\1', re.DOTALL) + + # Replacement: \1 [space] \2 [space] \1 + return pattern.sub(r'\1 \2 \1', title) + +# --- Main Function to Fix Wiki Page Spacing --- + +def fix_wiki_page_spacing(wiki_text: str) -> str: + """ + Applies the section title spacing fix and enforces consistent newlines + before (one blank line: \n\n) and after (one blank line: \n\n) + every section heading (Level 2 or higher). + + This method guarantees the output format: + ...[Content]\n\n== Title ==\n\n[Next content]... + + :param wiki_text: The full text of the wiki page. + :return: The corrected wiki page text. + """ + + # Pattern to match and replace a heading and its surrounding whitespace: + # 1. (.*?) : Group 1: Non-greedy capture of all content before the heading. + # 2. [\r\n\s]* : Non-capturing group for all existing whitespace/newlines before the heading. + # 3. (^={2,}.*?={2,}$) : Group 2: The actual heading line, anchored to the start of a line (re.M). + # 4. [\n\s]* : Non-capturing group for all existing whitespace/newlines after the heading. + + # We use re.M (multiline) and re.DOTALL (dot matches newline) + heading_and_surroundings_pattern = re.compile( + r'(.*?)[\r\n\s]*(^={2,}.*?={2,}$)[\r\n\s]*', re.M | re.DOTALL + ) + + def heading_replacer_full_format(match): + """ + Callback function for re.sub that fixes spacing and enforces \n\n separation. + """ + # Group 1: Content preceding the heading + content_before = match.group(1).rstrip() + # Group 2: The raw heading line + raw_heading = match.group(2) + + # 1. Fix the internal spacing of the heading + corrected_heading = fix_section_title_spacing_internal(raw_heading) + + # 2. Determine the prefix separator: \n\n + # If the heading is the first thing on the page (i.e., content_before is empty), + # we don't want to prepend \n\n. Otherwise, we do. + if content_before: + prefix = '\n\n' + else: + prefix = '' + + # 3. The replacement structure: + # {Content Before}{Prefix}\n{Corrected Heading}\n\n + # The content that follows this match will immediately follow the final \n\n. 
+        return f'{content_before}{prefix}{corrected_heading}\n\n'
+
+    # Apply the fix globally
+    corrected_text = heading_and_surroundings_pattern.sub(
+        heading_replacer_full_format,
+        wiki_text
+    )
+
+    # Clean up any residual excess newlines at the very beginning of the page
+    return corrected_text.lstrip('\n')
+
+# Added to allow running this module as a script
+if __name__ == '__main__':
+
+    # --- Test Data ---
+    # Contains several cases of incorrect section spacing:
+    # 1. Wrong internal spacing (both too much and missing).
+    # 2. Wrong external spacing (too many newlines or none).
+    # 3. Heading at the start of the page (must not be preceded by \n\n).
+    # 4. Content in between.
+
+    test_wikitext = """
+
+== Ciao ==
+
+ciao
+== Ciao ==
+ciao
+== Ciao ==
+
+ciao
+"""
+
+    print("--- Test of the fix_wiki_page_spacing function ---")
+    print("Original Wiki Text:\n" + "-"*30)
+    print(test_wikitext)
+    print("-" * 30)
+
+    # Run the function
+    corrected_wikitext = fix_wiki_page_spacing(test_wikitext)
+
+    print("\nCorrected Wiki Text:\n" + "="*30)
+
+    # We use repr() to clearly show all the newlines (\n) and spaces
+    print(corrected_wikitext)
+    print("=" * 30)
+    
\ No newline at end of file

From 9b45ccbbb6111ba7124d834cc306f4353576191f Mon Sep 17 00:00:00 2001
From: super-nabla
Date: Sun, 14 Dec 2025 22:48:08 +0100
Subject: [PATCH 4/8] fix spacing

---
 .../wikitranslator.py       | 12 ++---
 .../wikitranslator_utils.py | 52 +++++++++++++++++--
 2 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/translatable_wikitext_converter/wikitranslator.py b/translatable_wikitext_converter/wikitranslator.py
index 80e57e7..e75533d 100644
--- a/translatable_wikitext_converter/wikitranslator.py
+++ b/translatable_wikitext_converter/wikitranslator.py
@@ -724,15 +724,15 @@ def convert_to_translatable_wikitext(wikitext):
     processed_parts = [handler(part) for part, handler in _parts]
 
     # Debug output
-    """
+    #"""
     print("Processed parts:")
     for i, (ppart, (part, handler)) in enumerate(zip(processed_parts, _parts)):
         print(f"--- Start element {i} with handler {handler.__name__} ---")
-        print(part)
+        print(f"@{part}@")
         print(f"---\n")
-        print(ppart)
+        print(f'@{ppart}@')
         print(f"---\n")
-    """
+    #"""
 
-    # Join the processed parts into a single string
-    return ''.join(processed_parts)[1:] # Remove the leading newline added at the beginning
\ No newline at end of file
+    # Join the processed parts into a single string and remove extra leading newline
+    return ''.join(processed_parts).lstrip('\n')
\ No newline at end of file
diff --git a/translatable_wikitext_converter/wikitranslator_utils.py b/translatable_wikitext_converter/wikitranslator_utils.py
index fc70c96..37860d8 100644
--- a/translatable_wikitext_converter/wikitranslator_utils.py
+++ b/translatable_wikitext_converter/wikitranslator_utils.py
@@ -6,6 +6,9 @@
 
 import re, sys
 
+# Pattern to identify section headers (Level 2 or higher)
+SECTION_HEADER_PATTERN = re.compile(r'(={2,})\s*(.+?)\s*\1', re.DOTALL)
+
 # --- Helper Functions for Processing Different Wikitext Elements ---
 # These functions are designed to handle specific wikitext structures.
 # Some will recursively call the main `convert_to_translatable_wikitext`
 # function to process their internal content, ensuring nested elements
 # are also handled correctly.
@@ -40,14 +43,18 @@ def is_emoji_unicode(char):
 
 def _wrap_in_translate(text):
     """
-    Wraps the given text with tags, preserving leading/trailing whitespace.
+    Wraps the text with tags.
+    If the content starts or ends with a section header, it includes the preceding
+    or succeeding newline in the translation block.
""" if not text or not text.strip(): return text - # Logic for finding non-whitespace content (as defined in your current code) + # 1. Find the indices of the non-whitespace content first_char_index = -1 last_char_index = -1 + + # We loop to find the first/last character that is NOT whitespace for i, char in enumerate(text): if char not in (' ', '\n', '\t', '\r', '\f', '\v'): if first_char_index == -1: @@ -55,14 +62,51 @@ def _wrap_in_translate(text): last_char_index = i if first_char_index == -1: + # If no non-whitespace characters are found, return the original text return text + # Initial split leading_whitespace = text[:first_char_index] content = text[first_char_index : last_char_index + 1] trailing_whitespace = text[last_char_index + 1 :] + + # 2. Initial adjustment (To include the newline above the header) + + # We check if the content starts with a section header + # (We use .match() on content to see if the header is at the very beginning) + match_start = SECTION_HEADER_PATTERN.match(content) + + if match_start and leading_whitespace.endswith('\n'): + # If there is a header and the line above is a '\n', we move the '\n' from leading to content + + # We subtract the '\n' from leading_whitespace + leading_whitespace = leading_whitespace[:-1] + + # We recalculate content to include the preceding '\n' + content = text[first_char_index - 1 : last_char_index + 1] + + # We update first_char_index for subsequent calculations (even if not used here) + first_char_index -= 1 - return f"{leading_whitespace}{content}{trailing_whitespace}" + # 3. Final adjustment (To include the newline below the header) + + # We find the last match (to see if the header finishes the content block) + last_match = None + for m in SECTION_HEADER_PATTERN.finditer(content): + last_match = m + + if last_match and last_match.end() == len(content) and trailing_whitespace.startswith('\n'): + # If the header is the last thing and the subsequent block starts with '\n', we include it + + # We remove the '\n' from trailing_whitespace + trailing_whitespace = trailing_whitespace[1:] + + # We extend content to include the subsequent '\n' + content = text[first_char_index : last_char_index + 2] # +2 because index is 0-based + + # 4. Returning the result + return f"{leading_whitespace}{content}{trailing_whitespace}" ############################################ # Functions for Fixing Wiki Page Spacing # @@ -74,7 +118,7 @@ def fix_section_title_spacing_internal(title: str) -> str: between the '=' characters and the title text. """ # Pattern: (={2,}) [optional space] (.+?) 
[optional space] \1 - pattern = re.compile(r'(={2,})\s*(.+?)\s*\1', re.DOTALL) + pattern = SECTION_HEADER_PATTERN # Replacement: \1 [space] \2 [space] \1 return pattern.sub(r'\1 \2 \1', title) From 1ff39b3435f76c50b6590ba3ccd7e85461016b5c Mon Sep 17 00:00:00 2001 From: super-nabla Date: Sun, 14 Dec 2025 23:14:18 +0100 Subject: [PATCH 5/8] fix tests --- translatable_wikitext_converter/tests.py | 10 +++++----- translatable_wikitext_converter/wikitranslator.py | 11 +++++++++-- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/translatable_wikitext_converter/tests.py b/translatable_wikitext_converter/tests.py index b7d079a..32b3fcb 100644 --- a/translatable_wikitext_converter/tests.py +++ b/translatable_wikitext_converter/tests.py @@ -7,7 +7,7 @@ class TestTranslatableWikitext(unittest.TestCase): def test_section_headers(self): self.assertEqual( convert_to_translatable_wikitext("==HELLO=="), - "== HELLO ==" # Removed the \n\n that was expected + """\n== HELLO ==\n""" ) def test_file_tag_translations(self): @@ -47,7 +47,7 @@ def test_simple_internal_link(self): def test_multiline_text(self): self.assertEqual( convert_to_translatable_wikitext('\nhi iam charan\n
\nhappy\n\n'), - 'hi iam charan\n
\nhappy\n\n' + 'hi iam charan\n
\nhappy' ) def test_double_namespace_processing(self): @@ -174,19 +174,19 @@ def test_empty_string_input(self): def test_whitespace_only_input(self): self.assertEqual( convert_to_translatable_wikitext(" \n\t "), - " \n\t " + "\t" ) def test_list_items(self): self.assertEqual( convert_to_translatable_wikitext("* Item 1\n** Sub-item 1.1\n* Item 2"), - "* Item 1\n** Sub-item 1.1\n* Item 2\n" + "* Item 1\n** Sub-item 1.1\n* Item 2" ) def test_definition_list(self): self.assertEqual( convert_to_translatable_wikitext(";Term\n:Definition\n:Description"), - "; Term\n: Definition\n: Description\n" + "; Term\n: Definition\n: Description" ) if __name__ == '__main__': diff --git a/translatable_wikitext_converter/wikitranslator.py b/translatable_wikitext_converter/wikitranslator.py index e75533d..002c316 100644 --- a/translatable_wikitext_converter/wikitranslator.py +++ b/translatable_wikitext_converter/wikitranslator.py @@ -734,5 +734,12 @@ def convert_to_translatable_wikitext(wikitext): print(f"---\n") #""" - # Join the processed parts into a single string and remove extra leading newline - return ''.join(processed_parts).lstrip('\n') \ No newline at end of file + # Join the processed parts into a single string + out_wikitext = ''.join(processed_parts) + + # Keep removing all trailing and leading newlines and spaces + while out_wikitext.startswith('\n') or out_wikitext.startswith(' ') or out_wikitext.endswith('\n') or out_wikitext.endswith(' '): + out_wikitext = out_wikitext.strip('\n') + out_wikitext = out_wikitext.strip(' ') + + return out_wikitext \ No newline at end of file From 304f6a893e607a18c3be5121dddb4b1485d9d7f9 Mon Sep 17 00:00:00 2001 From: super-nabla Date: Mon, 15 Dec 2025 01:32:28 +0100 Subject: [PATCH 6/8] update wikilink logic --- translatable_wikitext_converter/app.py | 8 +- translatable_wikitext_converter/tests.py | 119 +++++-- .../wikitranslator.py | 328 +++++++++++++++--- 3 files changed, 371 insertions(+), 84 deletions(-) diff --git a/translatable_wikitext_converter/app.py b/translatable_wikitext_converter/app.py index 357aee5..5ed33e1 100644 --- a/translatable_wikitext_converter/app.py +++ b/translatable_wikitext_converter/app.py @@ -1,7 +1,7 @@ from flask import Flask, request, render_template, jsonify from flask_cors import CORS # Import flask-cors -from .wikitranslator import convert_to_translatable_wikitext +from .wikitranslator import tag_for_translation app = Flask(__name__) CORS(app) # Enable CORS for all routes @@ -17,8 +17,8 @@ def redirect_to_home(): @app.route('/convert', methods=['POST']) def convert(): wikitext = request.form.get('wikitext', '') - converted_text = convert_to_translatable_wikitext(wikitext) - return render_template('home.html', original=wikitext, converted=converted_text) + tagged = tag_for_translation(wikitext) + return render_template('home.html', original=wikitext, converted=tagged) @app.route('/api/convert', methods=['GET', 'POST']) def api_convert(): @@ -47,4 +47,4 @@ def api_convert(): }) if __name__ == '__main__': - app.run(debug=True) + app.run(debug=True, port=5001) diff --git a/translatable_wikitext_converter/tests.py b/translatable_wikitext_converter/tests.py index 32b3fcb..00226cb 100644 --- a/translatable_wikitext_converter/tests.py +++ b/translatable_wikitext_converter/tests.py @@ -1,18 +1,18 @@ import unittest -from translatable_wikitext_converter.app import convert_to_translatable_wikitext +from translatable_wikitext_converter.app import tag_for_translation class TestTranslatableWikitext(unittest.TestCase): def 
test_section_headers(self): self.assertEqual( - convert_to_translatable_wikitext("==HELLO=="), + tag_for_translation("==HELLO=="), """\n== HELLO ==\n""" ) def test_file_tag_translations(self): self.assertEqual( - convert_to_translatable_wikitext( + tag_for_translation( '[[File:landscape.jpg |thumb |left | alt=sunset |Photo of a beautiful landscape]]' ), '[[File:landscape.jpg|thumb|{{dirstart}}|alt=sunset|Photo of a beautiful landscape]]' @@ -20,174 +20,219 @@ def test_file_tag_translations(self): def test_internal_and_external_links(self): self.assertEqual( - convert_to_translatable_wikitext( + tag_for_translation( 'This is a text with an [[internal link]] and an [https://openstreetmap.org external link].' ), - 'This is a text with an [[Special:MyLanguage/Internal link|internal link]] and an [https://openstreetmap.org external link].' + 'This is a text with an [[Special:MyLanguage/Internal link|internal link]] and an [https://openstreetmap.org external link].' ) def test_category_with_translation(self): self.assertEqual( - convert_to_translatable_wikitext("[[Category:Wikipedia]]"), + tag_for_translation("[[Category:Wikipedia]]"), "[[Category:Wikipedia{{#translation:}}]]" ) def test_notoc_preserved(self): self.assertEqual( - convert_to_translatable_wikitext("__NOTOC__"), + tag_for_translation("__NOTOC__"), "__NOTOC__" ) def test_simple_internal_link(self): self.assertEqual( - convert_to_translatable_wikitext('[[link]]'), - '[[Special:MyLanguage/Link|link]]' + tag_for_translation('[[link]]'), + '[[Special:MyLanguage/Link|link]]' ) def test_multiline_text(self): self.assertEqual( - convert_to_translatable_wikitext('\nhi iam charan\n
\nhappy\n\n'), + tag_for_translation('\nhi iam charan\n
\nhappy\n\n'), 'hi iam charan\n
\nhappy' ) def test_double_namespace_processing(self): self.assertEqual( - convert_to_translatable_wikitext( + tag_for_translation( '[[File:pretty hello word.png | alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?' ), - '[[File:pretty hello word.png|alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?' + '[[File:pretty hello word.png|alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?' ) def test_double_namespace_without_list_case_1(self): self.assertEqual( - convert_to_translatable_wikitext( + tag_for_translation( '[[Help]]ing' ), - '[[Special:MyLanguage/Help|Help]]ing' + '[[Special:MyLanguage/Help|Help]]ing' ) def test_double_namespace_without_list_case_2(self): self.assertEqual( - convert_to_translatable_wikitext( + tag_for_translation( '[[Help]] ing' ), - '[[Special:MyLanguage/Help|Help]] ing' + '[[Special:MyLanguage/Help|Help]] ing' ) def test_template_simple(self): self.assertEqual( - convert_to_translatable_wikitext("{{Template Name}}"), + tag_for_translation("{{Template Name}}"), "{{Template Name}}" ) def test_template_with_parameters(self): self.assertEqual( - convert_to_translatable_wikitext("{{Template|param1=Value 1|Value 2}}"), + tag_for_translation("{{Template|param1=Value 1|Value 2}}"), "{{Template|param1=Value 1|Value 2}}" ) def test_template_nested_in_text(self): self.assertEqual( - convert_to_translatable_wikitext('Some text with {{a template here}} and more text.'), + tag_for_translation('Some text with {{a template here}} and more text.'), 'Some text with {{A template here}} and more text.' ) def test_nowiki_tag(self): self.assertEqual( - convert_to_translatable_wikitext("Some text with [[Raw link]] content."), + tag_for_translation("Some text with [[Raw link]] content."), "Some text with [[Raw link]] content." ) def test_blockquote_tag(self): self.assertEqual( - convert_to_translatable_wikitext("
This is a quote.
"), + tag_for_translation("
This is a quote.
"), "
This is a quote.
" ) def test_poem_tag(self): self.assertEqual( - convert_to_translatable_wikitext("Line 1\nLine 2"), + tag_for_translation("Line 1\nLine 2"), "Line 1\nLine 2" ) def test_code_tag_with_tvar(self): # Assuming process_code_tag assigns tvar names sequentially starting from 0 self.assertEqual( - convert_to_translatable_wikitext("Here is some code for you."), - "Here is some code for you." + tag_for_translation("Here is some code for you."), + """Here is some code for you.""" ) def test_div_tag(self): self.assertEqual( - convert_to_translatable_wikitext("
Div content here.
"), + tag_for_translation("
Div content here.
"), "
Div content here.
" ) def test_hiero_tag(self): self.assertEqual( - convert_to_translatable_wikitext("hieroglyphics"), + tag_for_translation("hieroglyphics"), "hieroglyphics" ) def test_sub_sup_tags(self): self.assertEqual( - convert_to_translatable_wikitext("H2O and E=mc2"), + tag_for_translation("H2O and E=mc2"), "H2O and E=mc2" ) def test_math_tag(self): self.assertEqual( - convert_to_translatable_wikitext("x^2 + y^2 = z^2"), + tag_for_translation("x^2 + y^2 = z^2"), "x^2 + y^2 = z^2" ) def test_small_tag(self): self.assertEqual( - convert_to_translatable_wikitext("Small text"), + tag_for_translation("Small text"), "Small text" ) def test_image_with_upright(self): self.assertEqual( - convert_to_translatable_wikitext("[[File:Example.jpg|upright=1.5|A larger image]]"), + tag_for_translation("[[File:Example.jpg|upright=1.5|A larger image]]"), "[[File:Example.jpg|upright=1.5|A larger image]]" ) def test_multiple_elements_in_one_line(self): self.assertEqual( - convert_to_translatable_wikitext("Hello world! [[Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]"), - 'Hello world! [[Special:MyLanguage/Link|Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]' + tag_for_translation("Hello world! [[Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]"), + 'Hello world! [[Special:MyLanguage/Link|Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]' ) def test_text_around_br_tag(self): self.assertEqual( - convert_to_translatable_wikitext("First line.
Second line."), + tag_for_translation("First line.
Second line."), "First line.
Second line." ) def test_empty_string_input(self): self.assertEqual( - convert_to_translatable_wikitext(""), + tag_for_translation(""), "" ) def test_whitespace_only_input(self): self.assertEqual( - convert_to_translatable_wikitext(" \n\t "), + tag_for_translation(" \n\t "), "\t" ) def test_list_items(self): self.assertEqual( - convert_to_translatable_wikitext("* Item 1\n** Sub-item 1.1\n* Item 2"), + tag_for_translation("* Item 1\n** Sub-item 1.1\n* Item 2"), "* Item 1\n** Sub-item 1.1\n* Item 2" ) def test_definition_list(self): self.assertEqual( - convert_to_translatable_wikitext(";Term\n:Definition\n:Description"), + tag_for_translation(";Term\n:Definition\n:Description"), "; Term\n: Definition\n: Description" ) + + def test_standard_internal_link(self): + # Standard link without prefix or pipe. Should use Special:MyLanguage. + # Assumes tag_for_translation calls the logic that produces ... + self.assertEqual( + tag_for_translation("[[Some Page]]"), + """[[Special:MyLanguage/Some Page|Some Page]]""" + ) + + def test_internal_link_with_display_text(self): + # Standard link with display text. Should use Special:MyLanguage. + self.assertEqual( + tag_for_translation("[[About|Read more here]]"), + """[[Special:MyLanguage/About|Read more here]]""" + ) + + def test_simple_language_prefix_no_pipe(self): + # Link starting with a simple language code (e.g., 'bn:'). Should NOT use Special:MyLanguage. + # Should auto-generate the display text without the prefix. + self.assertEqual( + tag_for_translation("[[:it:mozzarella]]"), + """[[:it:mozzarella|mozzarella]]""" + ) + + def test_complex_interwiki_prefix(self): + # Link using a complex interwiki prefix (e.g., :bn:s: for Bengali Wikisource). + # This tests the segment parsing fix implemented. Should NOT use Special:MyLanguage. + self.assertEqual( + tag_for_translation("[[:bn:s:article Title]]"), + """[[:bn:s:article Title|article Title]]""" + ) + + def test_simple_english_special_handling(self): + # Link with the 'en:' prefix, which has special handling using the {{lwp|...}} template. + self.assertEqual( + tag_for_translation("[[:en:kerala]]"), + """[[{{lwp|Kerala}}|kerala]]""" + ) + + def test_complex_english_special_handling(self): + # Link with the 'en:' prefix, which has special handling using the {{lwp|...}} template. + self.assertEqual( + tag_for_translation("[[:en:kerala|text]]"), + """[[{{lwp|Kerala}}|text]]""" + ) if __name__ == '__main__': unittest.main(exit=False, failfast=True) diff --git a/translatable_wikitext_converter/wikitranslator.py b/translatable_wikitext_converter/wikitranslator.py index 002c316..cc614b7 100644 --- a/translatable_wikitext_converter/wikitranslator.py +++ b/translatable_wikitext_converter/wikitranslator.py @@ -16,19 +16,7 @@ def process_syntax_highlight(text): It wraps the content in tags. 
""" assert(text.startswith('')), "Invalid syntax highlight tag" - # Get inside the tag - start_tag_end = text.find('>') + 1 - end_tag_start = text.rfind('<') - if start_tag_end >= end_tag_start: - return text - prefix = text[:start_tag_end] - content = text[start_tag_end:end_tag_start].strip() - suffix = text[end_tag_start:] - if not content: - return text - # Wrap the content in tags - wrapped_content = _wrap_in_translate(content) - return f"{prefix}{wrapped_content}{suffix}" + return "" + text + "" def process_table(text): """ @@ -76,7 +64,7 @@ def process_poem_tag(text): wrapped_content = _wrap_in_translate(content) return f"{prefix}{wrapped_content}{suffix}" -def process_code_tag(text, tvar_code_id=0): +def process_code_tag(text): """ Processes tags in the wikitext. It wraps the content in tags. @@ -93,7 +81,7 @@ def process_code_tag(text, tvar_code_id=0): if not content: return text # Wrap the content in tags - wrapped_content = f'{content}' + wrapped_content = f'{content}' return f"{prefix}{wrapped_content}{suffix}" def process_div(text): @@ -220,7 +208,7 @@ def process_item(text): item_content = text[offset:].strip() if not item_content: return text - return text[:offset] + ' ' + _wrap_in_translate(item_content) + '\n' + return text[:offset] + ' ' + convert_to_translatable_wikitext(item_content) + '\n' class double_brackets_types(Enum): wikilink = 1 @@ -229,8 +217,8 @@ class double_brackets_types(Enum): not_inline_icon_file = 4 special = 5 invalid_file = 6 - -def _process_file(s, tvar_inline_icon_id=0): + +def _process_file(s): # Define keywords that should NOT be translated when found as parameters NON_TRANSLATABLE_KEYWORDS = { 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', @@ -291,7 +279,7 @@ def _process_file(s, tvar_inline_icon_id=0): if is_inline_icon: # return something like: [[File:smiley.png|alt=🙂]] - returnline = f'[[' + '|'.join(tokens) + ']]' + returnline = f'[[' + '|'.join(tokens) + ']]' return returnline, double_brackets_types.inline_icon ############################ @@ -327,7 +315,7 @@ def _process_file(s, tvar_inline_icon_id=0): returnline = '[[' + '|'.join(output_parts) + ']]' return returnline, double_brackets_types.not_inline_icon_file -def process_double_brackets(text, tvar_id=0): +def process_double_brackets(text): """ Processes internal links in the wikitext. It wraps the content in tags. @@ -344,7 +332,9 @@ def process_double_brackets(text, tvar_id=0): category_aliases = ['Category:', 'category:', 'Cat:', 'cat:'] file_aliases = ['File:', 'file:', 'Image:', 'image:'] - parts[0] = parts[0].strip() # Clean up the first part + # strip all parts + parts = [part.strip() for part in parts] + # Check if the first part is a category or file alias if parts[0].startswith(tuple(category_aliases)): # Handle category links @@ -357,14 +347,174 @@ def process_double_brackets(text, tvar_id=0): # Handle special pages return f'[[{parts[0]}]]', double_brackets_types.special - # Assuming it's a regular internal link + ############################# + # Managing wikilinks + ############################# + + # List of recognised prefixes for Wikimedia projects (e.g., wikipedia, commons) + # and local/national chapters (e.g., wmde, wmit). 
+ interwiki_prefixes = [ + # Main Projects + "wikipedia", "w", + "wiktionary", "wikt", + "wikinews", "n", + "wikibooks", "b", + "wikiquote", "q", + "wikisource", "s", + "oldwikisource", "s:mul", + "wikispecies", "species", + "wikiversity", "v", + "wikivoyage", "voy", + "wikimedia", "foundation", "wmf", + "commons", "c", + "metawiki", "metawikimedia", "metawikipedia", "meta", "m", + "incubator", + "strategy", + "mediawikiwiki", "mw", + "mediazilla", "bugzilla", + "phabricator", "phab", + "testwiki", + "wikidata", "d", + "wikifunctions", "f", + "wikitech", + "toolforge", + + # National Chapters + "wmar", "wmau", "wmbd", "wmbe", "wmbr", "wmca", "wmcz", "wmdk", + "wmde", "wmfi", "wmhk", "wmhu", "wmin", "wmid", "wmil", "wmit", + "wmnl", "wmmk", "wmno", "wmpl", "wmru", "wmrs", "wmes", "wmse", + "wmch", "wmtw", "wmua", "wmuk", + + # Other Wikimedia Prefixes + "betawikiversity", "v:mul", + "download", "dbdump", "gerrit", "mail", "mailarchive", + "outreach", "otrs", "OTRSwiki", "quality", "spcom", + "ticket", "tools", "tswiki", "svn", "sulutil", + "rev", "wmania", "wm2016", "wm2017" + ] + # Convert the list to a set for efficient lookup/checking. + interwiki_prefixes_set = set(interwiki_prefixes) + # Regex to identify if the link starts with a language code (e.g., 'it:', 'bn:'). + LANGUAGE_CODE_PATTERN = re.compile(r'^[a-z]{2,3}:') + + # Determine the link target (before the pipe) and the display text (after the pipe). + link_title = parts[0] + # If a pipe is present, use the part after it; otherwise, use the link target itself. + display_text = parts[1] if len(parts) > 1 else parts[0] + + # --- 1. Checking for Project/Chapter/Interwiki Prefixes --- + + # We try to extract the prefix (e.g. ":bn:" from ":bn:Page") + first_part_lower = link_title.lower() + + has_known_prefix = False + + # A. Check 1: Simple Language Code Match (e.g., ":it:", ":bn:") + # This covers the explicit requirement: "se inizia con un codice linguistico e i due punti..." + if LANGUAGE_CODE_PATTERN.match(first_part_lower): + has_known_prefix = True + + # B. Check 2: Complex Prefix Parsing (Covers "w:", "commons:", "wmde:", or combined forms) + elif ':' in first_part_lower: + # Split the link by colon, excluding the last part which is the page title. + # Example: ":bn:s:Page" -> segments: ['','bn','s'] + # Example: ":w:de:Page" -> segments: ['', 'w','de'] + # Example: ":commons:File" -> segments: ['', 'commons'] + + segments = first_part_lower.split(':') + + # We look at all segments except the last one (which is the actual page title). + # We stop the search if the last segment (the title) is empty, which happens for links ending in a colon. + # e.g., 'w:' splits to ['w', ''] -> we check 'w'. + limit = len(segments) - 1 + if segments[-1] == '': + limit = len(segments) - 2 + + # Iterate through all prefix segments + for segment in segments[:limit]: + # The empty string segment resulting from a leading colon (e.g., ':w:de:Page' -> first segment is '') is ignored. + if segment: + # Check if the segment is a known project/chapter prefix. + if segment in interwiki_prefixes_set: + has_known_prefix = True + break # Stop checking once any known prefix is found + + # Check if the segment is a language code (e.g., 'de' in 'w:de:Page'). + # We can't use the regex pattern here as it checks for start-of-string. + # A quick check for typical language code length (2 or 3 chars) is used as a proxy, + # although a full language code check would be more robust. 
+ if 2 <= len(segment) <= 3: + # Assuming a 2/3 letter segment that isn't a known prefix is treated as a language code + # for the purpose of avoiding Special:MyLanguage. + has_known_prefix = True + break + + # If the link is complex (multiple colons) or contains a known prefix, + # then it is an interwiki link and should not be routed through Special:MyLanguage. + # The check below remains the same, but 'has_known_prefix' is now robustly set. + + if has_known_prefix or ':' in link_title: + # If it has a prefix (linguistic or project/chapter), DO NOT use Special:MyLanguage. + + # --- 2. Special handling for the ":en:" prefix --- + if first_part_lower.startswith(':en:'): + # For links starting with ':en:', rewrite using the {{lwp|...}} template. + + # The suffix is the page title *without* the ":en:" prefix. + en_suffix = link_title[4:] # Removes ":en:" + capitalised_en_suffix = capitalise_first_letter(en_suffix) + # Case 1: No pipe (e.g., "[[en:About]]") + if len(parts) == 1: + # Target: {{lwp|About}}. Display text: About (en_suffix). + return f'[[{{{{lwp|{capitalised_en_suffix}}}}}|{en_suffix}]]', double_brackets_types.wikilink + + # Case 2: With pipe (e.g., "[[en:About|Read More]]") + if len(parts) == 2: + # Target: {{lwp|About}}. Display text: Read More (display_text). + return f'[[{{{{lwp|{capitalised_en_suffix}}}}}|{display_text}]]', double_brackets_types.wikilink + + # --- 3. Handling all other interwiki/prefixed links (e.g., ":it:", "w:", "wmde:") --- + + # Find the index of the *last* colon to correctly separate the page title + # from the potentially complex prefix (e.g., extract 'Page' from 'bn:Page'). + if link_title.rfind(':') != -1: + # Extract the page title by finding the content after the final colon. + title_without_prefix = link_title[link_title.rfind(':') + 1:] + else: + # Should not happen for prefixed links, but handles the fallback gracefully. + title_without_prefix = link_title + + # Case 1: No pipe (e.g., "[[bn:Page]]" or "[[w:Page]]") + if len(parts) == 1: + # Link target remains link_title (e.g., bn:Page). + # Display text is the title *without* the prefix (e.g., Page). + return f'[[{link_title}|{title_without_prefix}]]', double_brackets_types.wikilink + + # Case 2: With pipe (e.g., "[[bn:Page|Text]]") + if len(parts) == 2: + # Link target remains link_title (e.g., bn:Page). + # Display text is the text after the pipe (e.g., Text). + return f'[[{link_title}|{display_text}]]', double_brackets_types.wikilink + + # --- 4. Standard internal links (No special prefix found) --- + + # For standard internal links, the target must be prefixed with Special:MyLanguage + # to enable automatic localisation. 'capitalise_first_letter' is required here. + + # Case 1: No pipe (e.g., [[Page]]) if len(parts) == 1: - return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[0]}]]', double_brackets_types.wikilink - if len(parts) == 2 : - return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[1]}]]', double_brackets_types.wikilink + # Target: Special:MyLanguage/Page. Display text: Page (link_title). + return f'[[Special:MyLanguage/{capitalise_first_letter(link_title)}|{link_title}]]', double_brackets_types.wikilink + + # Case 2: With pipe (e.g., [[Page|Text]]) + if len(parts) == 2: + # Target: Special:MyLanguage/Page. Display text: Text (display_text). + return f'[[Special:MyLanguage/{capitalise_first_letter(link_title)}|{display_text}]]', double_brackets_types.wikilink + + # Fallback for unexpected link format (e.g., more than one pipe). 
return text -def process_external_link(text, tvar_url_id=0): +def process_external_link(text): """ Processes external links in the format [http://example.com Description] and ensures that only the description part is wrapped in tags, leaving the URL untouched. @@ -375,7 +525,7 @@ def process_external_link(text, tvar_url_id=0): url_part = match.group(1) description_part = match.group(2) # Wrap only the description part in tags, leave the URL untouched - return f'[{url_part} {description_part}]' + return f'[{url_part} {description_part}]' return text def process_template(text): @@ -406,6 +556,9 @@ def process_raw_url(text): return text return text.strip() +def tag_for_translation(text): + converted_text = convert_to_translatable_wikitext(text) + return set_tvar_names(converted_text) # --- Main Tokenisation Logic --- @@ -439,7 +592,7 @@ def convert_to_translatable_wikitext(wikitext): if last < curr: parts.append((wikitext[last:curr], _wrap_in_translate)) parts.append((wikitext[curr:end_pattern], process_syntax_highlight)) - curr = end_pos + curr = end_pattern last = curr continue # Table block @@ -674,37 +827,33 @@ def convert_to_translatable_wikitext(wikitext): """ # Process links - tvar_id = 0 - tvar_url_id = 0 - tvar_code_id = 0 - tvar_inline_icon_id = 0 for i, (part, handler) in enumerate(parts): # Handlers for links require a tvar_id if handler == process_double_brackets: - new_part, double_brackets_type = handler(part, tvar_id) + new_part, double_brackets_type = handler(part) if double_brackets_type in [double_brackets_types.wikilink, double_brackets_types.special, double_brackets_types.inline_icon]: new_handler = _wrap_in_translate # Change handler to _wrap_in_translate else : new_handler = lambda x: x # No further processing for categories and files parts[i] = (new_part, new_handler) - tvar_id += 1 elif handler == process_external_link: - new_part = handler(part, tvar_url_id) + new_part = handler(part) new_handler = _wrap_in_translate # Change handler to _wrap_in_translate parts[i] = (new_part, new_handler) - tvar_url_id += 1 elif handler == process_code_tag: - new_part = handler(part, tvar_code_id) + new_part = handler(part) new_handler = _wrap_in_translate # Change handler to _wrap_in_translate parts[i] = (new_part, new_handler) - tvar_code_id += 1 elif handler == process_double_brackets : - new_part, double_brackets_type = handler(part, tvar_inline_icon_id) + new_part, double_brackets_type = handler(part) if double_brackets_type == double_brackets_types.inline_icon: new_handler = _wrap_in_translate # Change handler to _wrap_in_translate - tvar_inline_icon_id += 1 else: new_handler = lambda x: x + elif handler == process_syntax_highlight : + new_part = handler(part) + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + parts[i] = (new_part, new_handler) # Scan again the parts: merge consecutive parts handled by _wrap_in_translate _parts = [] @@ -724,7 +873,7 @@ def convert_to_translatable_wikitext(wikitext): processed_parts = [handler(part) for part, handler in _parts] # Debug output - #""" + """ print("Processed parts:") for i, (ppart, (part, handler)) in enumerate(zip(processed_parts, _parts)): print(f"--- Start element {i} with handler {handler.__name__} ---") @@ -732,7 +881,7 @@ def convert_to_translatable_wikitext(wikitext): print(f"---\n") print(f'@{ppart}@') print(f"---\n") - #""" + """ # Join the processed parts into a single string out_wikitext = ''.join(processed_parts) @@ -742,4 +891,97 @@ def convert_to_translatable_wikitext(wikitext): out_wikitext 
= out_wikitext.strip('\n') out_wikitext = out_wikitext.strip(' ') - return out_wikitext \ No newline at end of file + return out_wikitext + +def set_tvar_names(input_text: str) -> str: + """ + Sets the 'name' attribute of every tag inside a block, + using an increasing counter (starting from 1) for each block. + + This version assumes tags are initially simple, e.g., or . + + Args: + input_text: The input string containing and tags. + + Returns: + The modified string with the 'name' attributes set. + """ + + # 1. Regular expression to find all blocks, including content. + # We use re.DOTALL to ensure the match spans multiple lines. + translate_pattern = re.compile(r'(.*?<\/translate>)', re.DOTALL) + + def process_translate_block(full_block_match): + """ + Callback function for re.sub that processes one block. + It finds all simple tags inside and gives them an incremental 'name' attribute. + """ + # The entire matched block + full_block = full_block_match.group(0) + + # Initialise the counter for the current block + count = 1 + + def substitute_simple_tvar(tvar_match): + """ + Inner callback function to substitute a simple and increment the counter. + """ + nonlocal count + + # The match group 1 captures the opening tag parts: ' becomes + # or becomes + + # This expression handles both and by replacing the final '>' or '/>' + # with the insertion plus the captured closing part (group 2). + name_attribute = f' name="{count}"' + + # Group 2 captures the closing element (either '>' or '/>') + closing_part = tvar_match.group(2) + + new_tag = f'{opening_part}{name_attribute}{closing_part}' + + # Increment the counter for the next + count += 1 + + return new_tag + + # Internal pattern: finds or where 'name' is not present. + # This is a robust pattern for HTML/XML tags where an attribute is to be inserted + # right before the closing bracket. + + # Group 1: () - The closing angle bracket (possibly with / for self-closing) + # We need to ensure we don't accidentally match existing 'name' attributes. + + # Simpler pattern for *all* tags, assuming no existing name: + tvar_pattern_inner = re.compile(r'()', re.DOTALL) + + # To strictly avoid tags that *already* contain 'name': + # We use a negative lookahead to ensure "name=" is not present inside + # This pattern is more complex but safer: + tvar_pattern_safer = re.compile(r'(]*name=)[^>]*)(>)', re.IGNORECASE | re.DOTALL) + + # We will utilise the simpler pattern, assuming the context is pre-processing before translation: + tvar_pattern_to_use = re.compile(r'()', re.DOTALL) + + # Apply the substitution to all tags within the current block + modified_block = re.sub( + tvar_pattern_to_use, + substitute_simple_tvar, + full_block + ) + + return modified_block + + # 2. Apply the block processor function to all blocks. 
+    final_result = re.sub(
+        translate_pattern,
+        process_translate_block,
+        input_text
+    )
+
+    return final_result
\ No newline at end of file

From cf1dc5f8b8639501125b7e0c76fb13a8aeef22df Mon Sep 17 00:00:00 2001
From: super-nabla
Date: Mon, 15 Dec 2025 01:38:42 +0100
Subject: [PATCH 7/8] change port

---
 translatable_wikitext_converter/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/translatable_wikitext_converter/app.py b/translatable_wikitext_converter/app.py
index 5ed33e1..9b7805b 100644
--- a/translatable_wikitext_converter/app.py
+++ b/translatable_wikitext_converter/app.py
@@ -47,4 +47,4 @@ def api_convert():
     })
 
 if __name__ == '__main__':
-    app.run(debug=True, port=5001)
+    app.run(debug=True, port=5000)

From 52c115cffe173f5c140ce8a502a34870a5c64ebc Mon Sep 17 00:00:00 2001
From: Super nabla
Date: Mon, 15 Dec 2025 01:43:44 +0100
Subject: [PATCH 8/8] Update README with application run instructions

Added instructions for running the application and tests.
---
 README.md | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1298885..ca1f1ab 100644
--- a/README.md
+++ b/README.md
@@ -37,13 +37,21 @@
 
    ```bash
    pip install -r requirements.txt
+   pip install -e .
    ```
 
 4. **Run the Application**
+   ```bash
+   flask --app ./translatable_wikitext_converter/app.py run --port 5000
+   ```
+   As an alternative:
    ```bash
-   python app.py
+   python -m translatable_wikitext_converter.app
    ```
-
+5. **Run the tests**
+   ```bash
+   python ./translatable_wikitext_converter/tests.py
+   ```
 The application will start on http://127.0.0.1:5000.
 
 ## Usage