diff --git a/README.md b/README.md index 1298885..ca1f1ab 100644 --- a/README.md +++ b/README.md @@ -37,13 +37,21 @@ ```bash pip install -r requirements.txt + pip install -e . ``` 4. **Run the Application** + ```bash + flask --app ./translatable_wikitext_converter/app.py run --port 5000 + ``` + As an alternative: ```bash - python app.py + python -m translatable_wikitext_converter.app ``` - +5. **Run the tests** + ```bash + python ./translatable_wikitext_converter/tests.py + ``` The application will start on http://127.0.0.1:5000. ## Usage diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..397eacb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,12 @@ +[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + +[project] +name = "translatable-wikitext-converter" +version = "0.1.0" +description = "Convert wikitext into translatable wikitext" +authors = [ + { name = "Gopa Vasanth" } +] +requires-python = ">=3.9" diff --git a/tests.py b/tests.py deleted file mode 100644 index 2112349..0000000 --- a/tests.py +++ /dev/null @@ -1,192 +0,0 @@ -import unittest -from app import convert_to_translatable_wikitext, process_double_brackets - -class TestTranslatableWikitext(unittest.TestCase): - - def test_section_headers(self): - self.assertEqual( - convert_to_translatable_wikitext("==HELLO=="), - "==HELLO==" # Removed the \n\n that was expected - ) - - def test_file_tag_translations(self): - self.assertEqual( - convert_to_translatable_wikitext( - '[[File:landscape.jpg |thumb |left | alt=sunset |Photo of a beautiful landscape]]' - ), - '[[File:landscape.jpg|thumb|{{dirstart}}|alt=sunset|Photo of a beautiful landscape]]' - ) - - def test_internal_and_external_links(self): - self.assertEqual( - convert_to_translatable_wikitext( - 'This is a text with an [[internal link]] and an [https://openstreetmap.org external link].' - ), - 'This is a text with an [[Special:MyLanguage/Internal link|internal link]] and an [https://openstreetmap.org external link].' - ) - - def test_category_with_translation(self): - self.assertEqual( - convert_to_translatable_wikitext("[[Category:Wikipedia]]"), - "[[Category:Wikipedia{{#translation:}}]]" - ) - - def test_notoc_preserved(self): - self.assertEqual( - convert_to_translatable_wikitext("__NOTOC__"), - "__NOTOC__" - ) - - def test_simple_internal_link(self): - self.assertEqual( - convert_to_translatable_wikitext('[[link]]'), - '[[Special:MyLanguage/Link|link]]' - ) - - def test_multiline_text(self): - self.assertEqual( - convert_to_translatable_wikitext('\nhi iam charan\n
<br>\nhappy\n\n'),
-            '\nhi iam charan\n<br>
\nhappy\n\n' - ) - - def test_double_namespace_processing(self): - self.assertEqual( - convert_to_translatable_wikitext( - '[[File:pretty hello word.png | alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?' - ), - '[[File:pretty hello word.png|alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?' - ) - - def test_double_namespace_without_list_case_1(self): - self.assertEqual( - convert_to_translatable_wikitext( - '[[Help]]ing' - ), - '[[Special:MyLanguage/Help|Help]]ing' - ) - - def test_double_namespace_without_list_case_2(self): - self.assertEqual( - convert_to_translatable_wikitext( - '[[Help]] ing' - ), - '[[Special:MyLanguage/Help|Help]] ing' - ) - - def test_template_simple(self): - self.assertEqual( - convert_to_translatable_wikitext("{{Template Name}}"), - "{{Template Name}}" - ) - - def test_template_with_parameters(self): - self.assertEqual( - convert_to_translatable_wikitext("{{Template|param1=Value 1|Value 2}}"), - "{{Template|param1=Value 1|Value 2}}" - ) - - def test_template_nested_in_text(self): - self.assertEqual( - convert_to_translatable_wikitext('Some text with {{a template here}} and more text.'), - 'Some text with {{A template here}} and more text.' - ) - - def test_nowiki_tag(self): - self.assertEqual( - convert_to_translatable_wikitext("Some text with [[Raw link]] content."), - "Some text with [[Raw link]] content." - ) - - def test_blockquote_tag(self): - self.assertEqual( - convert_to_translatable_wikitext("
<blockquote>This is a quote.</blockquote>"),
-            "<blockquote>This is a quote.</blockquote>
" - ) - - def test_poem_tag(self): - self.assertEqual( - convert_to_translatable_wikitext("Line 1\nLine 2"), - "Line 1\nLine 2" - ) - - def test_code_tag_with_tvar(self): - # Assuming process_code_tag assigns tvar names sequentially starting from 0 - self.assertEqual( - convert_to_translatable_wikitext("Here is some code for you."), - "Here is some code for you." - ) - - def test_div_tag(self): - self.assertEqual( - convert_to_translatable_wikitext("
<div>Div content here.</div>"),
-            "<div>Div content here.</div>
" - ) - - def test_hiero_tag(self): - self.assertEqual( - convert_to_translatable_wikitext("hieroglyphics"), - "hieroglyphics" - ) - - def test_sub_sup_tags(self): - self.assertEqual( - convert_to_translatable_wikitext("H2O and E=mc2"), - "H2O and E=mc2" - ) - - def test_math_tag(self): - self.assertEqual( - convert_to_translatable_wikitext("x^2 + y^2 = z^2"), - "x^2 + y^2 = z^2" - ) - - def test_small_tag(self): - self.assertEqual( - convert_to_translatable_wikitext("Small text"), - "Small text" - ) - - def test_image_with_upright(self): - self.assertEqual( - convert_to_translatable_wikitext("[[File:Example.jpg|upright=1.5|A larger image]]"), - "[[File:Example.jpg|upright=1.5|A larger image]]" - ) - - def test_multiple_elements_in_one_line(self): - self.assertEqual( - convert_to_translatable_wikitext("Hello world! [[Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]"), - 'Hello world! [[Special:MyLanguage/Link|Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]' - ) - - def test_text_around_br_tag(self): - self.assertEqual( - convert_to_translatable_wikitext("First line.
<br>Second line."),
-            "First line.<br>
Second line." - ) - - def test_empty_string_input(self): - self.assertEqual( - convert_to_translatable_wikitext(""), - "" - ) - - def test_whitespace_only_input(self): - self.assertEqual( - convert_to_translatable_wikitext(" \n\t "), - " \n\t " - ) - - def test_list_items(self): - self.assertEqual( - convert_to_translatable_wikitext("* Item 1\n** Sub-item 1.1\n* Item 2"), - "* Item 1\n** Sub-item 1.1\n* Item 2\n" - ) - - def test_definition_list(self): - self.assertEqual( - convert_to_translatable_wikitext(";Term\n:Definition\n:Description"), - "; Term\n: Definition\n: Description\n" - ) - -if __name__ == '__main__': - unittest.main(exit=False, failfast=True) diff --git a/translatable_wikitext_converter/__init__.py b/translatable_wikitext_converter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/translatable_wikitext_converter/app.py b/translatable_wikitext_converter/app.py new file mode 100644 index 0000000..9b7805b --- /dev/null +++ b/translatable_wikitext_converter/app.py @@ -0,0 +1,50 @@ +from flask import Flask, request, render_template, jsonify +from flask_cors import CORS # Import flask-cors + +from .wikitranslator import tag_for_translation + +app = Flask(__name__) +CORS(app) # Enable CORS for all routes + +@app.route('/') +def index(): + return render_template('home.html') + +@app.route('/convert', methods=['GET']) +def redirect_to_home(): + return render_template('home.html') + +@app.route('/convert', methods=['POST']) +def convert(): + wikitext = request.form.get('wikitext', '') + tagged = tag_for_translation(wikitext) + return render_template('home.html', original=wikitext, converted=tagged) + +@app.route('/api/convert', methods=['GET', 'POST']) +def api_convert(): + if request.method == 'GET': + return """ +
        <h3>Translate Tagger API</h3>
+        <p>Send a POST request with JSON data to use this API.</p>
+        <p>Example:</p>
+        <pre>
+        curl -X POST https://translatetagger.toolforge.org/api/convert \\
+        -H "Content-Type: application/json" \\
+        -d '{"wikitext": "This is a test [[link|example]]"}'
+        </pre>
+ """ + elif request.method == 'POST': + data = request.get_json() + if not data or 'wikitext' not in data: + return jsonify({'error': 'Missing "wikitext" in JSON payload'}), 400 + + wikitext = data.get('wikitext', '') + converted_text = convert_to_translatable_wikitext(wikitext) + + return jsonify({ + 'original': wikitext, + 'converted': converted_text + }) + +if __name__ == '__main__': + app.run(debug=True, port=5000) diff --git a/translatable_wikitext_converter/butta.py b/translatable_wikitext_converter/butta.py new file mode 100644 index 0000000..6cd3995 --- /dev/null +++ b/translatable_wikitext_converter/butta.py @@ -0,0 +1,113 @@ +import re + +def fix_section_title_spacing_internal(title: str) -> str: + """ + Detects a section title and ensures there is exactly one space + between the '=' characters and the title text. + """ + # Pattern: (={2,}) [optional space] (.+?) [optional space] \1 + pattern = re.compile(r'(={2,})\s*(.+?)\s*\1', re.DOTALL) + + # Replacement: \1 [space] \2 [space] \1 + return pattern.sub(r'\1 \2 \1', title) + + + +# --- Main Function to Fix Wiki Page Spacing --- + +def fix_wiki_page_spacing(wiki_text: str) -> str: + """ + Applies the section title spacing fix and enforces consistent newlines + before (one blank line: \n\n) and after (one blank line: \n\n) + every section heading (Level 2 or higher). + + This method guarantees the output format: + ...[Content]\n\n== Title ==\n\n[Next content]... + + :param wiki_text: The full text of the wiki page. + :return: The corrected wiki page text. + """ + + # Pattern to match and replace a heading and its surrounding whitespace: + # 1. (.*?) : Group 1: Non-greedy capture of all content before the heading. + # 2. [\r\n\s]* : Non-capturing group for all existing whitespace/newlines before the heading. + # 3. (^={2,}.*?={2,}$) : Group 2: The actual heading line, anchored to the start of a line (re.M). + # 4. [\r\n\s]* : Non-capturing group for all existing whitespace/newlines after the heading. + + # We use re.M (multiline) and re.DOTALL (dot matches newline) + heading_and_surroundings_pattern = re.compile( + r'(.*?)[\r\n\s]*(^={2,}.*?={2,}$)[\r\n\s]*', re.M | re.DOTALL + ) + + def heading_replacer_full_format(match): + """ + Callback function for re.sub that fixes spacing and enforces \n\n separation. + """ + # Group 1: Content preceding the heading + content_before = match.group(1).rstrip() + # Group 2: The raw heading line + raw_heading = match.group(2) + + # 1. Fix the internal spacing of the heading + corrected_heading = fix_section_title_spacing_internal(raw_heading) + + # 2. Determine the prefix separator: \n\n + # If the heading is the first thing on the page (i.e., content_before is empty), + # we don't want to prepend \n\n. Otherwise, we do. + if content_before: + prefix = '\n\n' + else: + prefix = '' + + # 3. The replacement structure: + # {Content Before}{Prefix}\n{Corrected Heading}\n\n + # The content that follows this match will immediately follow the final \n\n. + return f'{content_before}{prefix}{corrected_heading}\n\n' + + # Apply the fix globally + corrected_text = heading_and_surroundings_pattern.sub( + heading_replacer_full_format, + wiki_text + ) + + # Clean up any residual excess newlines at the very beginning of the page + return corrected_text.lstrip('\r\n') + + +def main(): + """Hard-coded wiki page text for testing and debugging.""" + + # Text demonstrates various input issues: + # 1. Title 1: No internal space, no newline after content. (Needs \n\n before and after) + # 2. 
Title 2: Too much internal space, one newline after content. + # 3. Title 3: Correct internal space, three newlines after content. + # 4. Title 4: Starts immediately after content (missing newline before). + + raw_wiki_page_text = ( + "This is the header text.\n" + "This is the last line of the header.\n" # Content before first heading + "==Topic1==\n\n\n" # Missing \n before, too many \n after + "Content for topic 1.\n" + "Content continues...\n" + "=== Topic2 ===\n" # Missing \n before, one \n after + "Content for topic 2.\n" + "== Topic3 ==\n\n\n" + "Content for topic 3. Correct space, too many \n after.\n" + "Some more content.\n" + "====Topic4====\n" # Missing \n before, missing \n after + "Final content." + ) + + print("--- Original Wiki Page Text ---\n") + print(raw_wiki_page_text) + print("\n" + "="*60 + "\n") + + corrected_text = fix_wiki_page_spacing(raw_wiki_page_text) + + print("--- Corrected Wiki Page Text (Enforcing: \n\n== Title ==\n\n) ---\n") + print(corrected_text) + print("\n" + "="*60 + "\n") + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/templates/home.html b/translatable_wikitext_converter/templates/home.html similarity index 100% rename from templates/home.html rename to translatable_wikitext_converter/templates/home.html diff --git a/translatable_wikitext_converter/tests.py b/translatable_wikitext_converter/tests.py new file mode 100644 index 0000000..00226cb --- /dev/null +++ b/translatable_wikitext_converter/tests.py @@ -0,0 +1,238 @@ +import unittest + +from translatable_wikitext_converter.app import tag_for_translation + +class TestTranslatableWikitext(unittest.TestCase): + + def test_section_headers(self): + self.assertEqual( + tag_for_translation("==HELLO=="), + """\n== HELLO ==\n""" + ) + + def test_file_tag_translations(self): + self.assertEqual( + tag_for_translation( + '[[File:landscape.jpg |thumb |left | alt=sunset |Photo of a beautiful landscape]]' + ), + '[[File:landscape.jpg|thumb|{{dirstart}}|alt=sunset|Photo of a beautiful landscape]]' + ) + + def test_internal_and_external_links(self): + self.assertEqual( + tag_for_translation( + 'This is a text with an [[internal link]] and an [https://openstreetmap.org external link].' + ), + 'This is a text with an [[Special:MyLanguage/Internal link|internal link]] and an [https://openstreetmap.org external link].' + ) + + def test_category_with_translation(self): + self.assertEqual( + tag_for_translation("[[Category:Wikipedia]]"), + "[[Category:Wikipedia{{#translation:}}]]" + ) + + def test_notoc_preserved(self): + self.assertEqual( + tag_for_translation("__NOTOC__"), + "__NOTOC__" + ) + + def test_simple_internal_link(self): + self.assertEqual( + tag_for_translation('[[link]]'), + '[[Special:MyLanguage/Link|link]]' + ) + + def test_multiline_text(self): + self.assertEqual( + tag_for_translation('\nhi iam charan\n
<br>\nhappy\n\n'),
+            'hi iam charan\n<br>
\nhappy' + ) + + def test_double_namespace_processing(self): + self.assertEqual( + tag_for_translation( + '[[File:pretty hello word.png | alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?' + ), + '[[File:pretty hello word.png|alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?' + ) + + def test_double_namespace_without_list_case_1(self): + self.assertEqual( + tag_for_translation( + '[[Help]]ing' + ), + '[[Special:MyLanguage/Help|Help]]ing' + ) + + def test_double_namespace_without_list_case_2(self): + self.assertEqual( + tag_for_translation( + '[[Help]] ing' + ), + '[[Special:MyLanguage/Help|Help]] ing' + ) + + def test_template_simple(self): + self.assertEqual( + tag_for_translation("{{Template Name}}"), + "{{Template Name}}" + ) + + def test_template_with_parameters(self): + self.assertEqual( + tag_for_translation("{{Template|param1=Value 1|Value 2}}"), + "{{Template|param1=Value 1|Value 2}}" + ) + + def test_template_nested_in_text(self): + self.assertEqual( + tag_for_translation('Some text with {{a template here}} and more text.'), + 'Some text with {{A template here}} and more text.' + ) + + def test_nowiki_tag(self): + self.assertEqual( + tag_for_translation("Some text with [[Raw link]] content."), + "Some text with [[Raw link]] content." + ) + + def test_blockquote_tag(self): + self.assertEqual( + tag_for_translation("
<blockquote>This is a quote.</blockquote>"),
+            "<blockquote>This is a quote.</blockquote>
" + ) + + def test_poem_tag(self): + self.assertEqual( + tag_for_translation("Line 1\nLine 2"), + "Line 1\nLine 2" + ) + + def test_code_tag_with_tvar(self): + # Assuming process_code_tag assigns tvar names sequentially starting from 0 + self.assertEqual( + tag_for_translation("Here is some code for you."), + """Here is some code for you.""" + ) + + def test_div_tag(self): + self.assertEqual( + tag_for_translation("
<div>Div content here.</div>"),
+            "<div>Div content here.</div>
" + ) + + def test_hiero_tag(self): + self.assertEqual( + tag_for_translation("hieroglyphics"), + "hieroglyphics" + ) + + def test_sub_sup_tags(self): + self.assertEqual( + tag_for_translation("H2O and E=mc2"), + "H2O and E=mc2" + ) + + def test_math_tag(self): + self.assertEqual( + tag_for_translation("x^2 + y^2 = z^2"), + "x^2 + y^2 = z^2" + ) + + def test_small_tag(self): + self.assertEqual( + tag_for_translation("Small text"), + "Small text" + ) + + def test_image_with_upright(self): + self.assertEqual( + tag_for_translation("[[File:Example.jpg|upright=1.5|A larger image]]"), + "[[File:Example.jpg|upright=1.5|A larger image]]" + ) + + def test_multiple_elements_in_one_line(self): + self.assertEqual( + tag_for_translation("Hello world! [[Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]"), + 'Hello world! [[Special:MyLanguage/Link|Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]' + ) + + def test_text_around_br_tag(self): + self.assertEqual( + tag_for_translation("First line.
<br>Second line."),
+            "First line.<br>
Second line." + ) + + def test_empty_string_input(self): + self.assertEqual( + tag_for_translation(""), + "" + ) + + def test_whitespace_only_input(self): + self.assertEqual( + tag_for_translation(" \n\t "), + "\t" + ) + + def test_list_items(self): + self.assertEqual( + tag_for_translation("* Item 1\n** Sub-item 1.1\n* Item 2"), + "* Item 1\n** Sub-item 1.1\n* Item 2" + ) + + def test_definition_list(self): + self.assertEqual( + tag_for_translation(";Term\n:Definition\n:Description"), + "; Term\n: Definition\n: Description" + ) + + def test_standard_internal_link(self): + # Standard link without prefix or pipe. Should use Special:MyLanguage. + # Assumes tag_for_translation calls the logic that produces ... + self.assertEqual( + tag_for_translation("[[Some Page]]"), + """[[Special:MyLanguage/Some Page|Some Page]]""" + ) + + def test_internal_link_with_display_text(self): + # Standard link with display text. Should use Special:MyLanguage. + self.assertEqual( + tag_for_translation("[[About|Read more here]]"), + """[[Special:MyLanguage/About|Read more here]]""" + ) + + def test_simple_language_prefix_no_pipe(self): + # Link starting with a simple language code (e.g., 'bn:'). Should NOT use Special:MyLanguage. + # Should auto-generate the display text without the prefix. + self.assertEqual( + tag_for_translation("[[:it:mozzarella]]"), + """[[:it:mozzarella|mozzarella]]""" + ) + + def test_complex_interwiki_prefix(self): + # Link using a complex interwiki prefix (e.g., :bn:s: for Bengali Wikisource). + # This tests the segment parsing fix implemented. Should NOT use Special:MyLanguage. + self.assertEqual( + tag_for_translation("[[:bn:s:article Title]]"), + """[[:bn:s:article Title|article Title]]""" + ) + + def test_simple_english_special_handling(self): + # Link with the 'en:' prefix, which has special handling using the {{lwp|...}} template. + self.assertEqual( + tag_for_translation("[[:en:kerala]]"), + """[[{{lwp|Kerala}}|kerala]]""" + ) + + def test_complex_english_special_handling(self): + # Link with the 'en:' prefix, which has special handling using the {{lwp|...}} template. 
+ self.assertEqual( + tag_for_translation("[[:en:kerala|text]]"), + """[[{{lwp|Kerala}}|text]]""" + ) + +if __name__ == '__main__': + unittest.main(exit=False, failfast=True) diff --git a/app.py b/translatable_wikitext_converter/wikitranslator.py similarity index 66% rename from app.py rename to translatable_wikitext_converter/wikitranslator.py index 0925546..cc614b7 100644 --- a/app.py +++ b/translatable_wikitext_converter/wikitranslator.py @@ -1,94 +1,22 @@ -from flask import Flask, request, render_template, jsonify -from flask_cors import CORS # Import flask-cors -import re from enum import Enum -import sys +import re, sys -app = Flask(__name__) -CORS(app) # Enable CORS for all routes +from .wikitranslator_utils import ( + capitalise_first_letter, + is_emoji_unicode, + fix_wiki_page_spacing, + _wrap_in_translate +) behaviour_switches = ['__NOTOC__', '__FORCETOC__', '__TOC__', '__NOEDITSECTION__', '__NEWSECTIONLINK__', '__NONEWSECTIONLINK__', '__NOGALLERY__', '__HIDDENCAT__', '__EXPECTUNUSEDCATEGORY__', '__NOCONTENTCONVERT__', '__NOCC__', '__NOTITLECONVERT__', '__NOTC__', '__START__', '__END__', '__INDEX__', '__NOINDEX__', '__STATICREDIRECT__', '__EXPECTUNUSEDTEMPLATE__', '__NOGLOBAL__', '__DISAMBIG__', '__EXPECTED_UNCONNECTED_PAGE__', '__ARCHIVEDTALK__', '__NOTALK__', '__EXPECTWITHOUTSCANS__'] -# --- Helper Functions for Processing Different Wikitext Elements --- -# These functions are designed to handle specific wikitext structures. -# Some will recursively call the main `convert_to_translatable_wikitext` -# function to process their internal content, ensuring nested elements -# are also handled correctly. - -def capitalise_first_letter(text): - """ - Capitalises the first letter of the given text. - If the text is empty or consists only of whitespace, it returns the text unchanged. - """ - if not text or not text.strip(): - return text - return text[0].upper() + text[1:] - -def is_emoji_unicode(char): - # This is a very simplified set of common emoji ranges. - # A comprehensive list would be much longer and more complex. - # See https://www.unicode.org/Public/emoji/ for full details. - if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons - return True - if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs - return True - if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols - return True - if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols - return True - if 0x2700 <= ord(char) <= 0x27BF: # Dingbats - return True - # Add more ranges as needed for full coverage - return False - -def _wrap_in_translate(text): - """ - Wraps the given text with tags. - It ensures that empty or whitespace-only strings are not wrapped. - The tags are added around the non-whitespace content, - preserving leading and trailing whitespace. 
- """ - if not text or not text.strip(): - return text - - # Find the first and last non-whitespace characters - first_char_index = -1 - last_char_index = -1 - for i, char in enumerate(text): - if char not in (' ', '\n', '\t', '\r', '\f', '\v'): # Check for common whitespace characters - if first_char_index == -1: - first_char_index = i - last_char_index = i - - # If no non-whitespace characters are found (should be caught by text.strip() check, but for robustness) - if first_char_index == -1: - return text - - leading_whitespace = text[:first_char_index] - content = text[first_char_index : last_char_index + 1] - trailing_whitespace = text[last_char_index + 1 :] - - return f"{leading_whitespace}{content}{trailing_whitespace}" - def process_syntax_highlight(text): """ Processes tags in the wikitext. It wraps the content in tags. """ assert(text.startswith('')), "Invalid syntax highlight tag" - # Get inside the tag - start_tag_end = text.find('>') + 1 - end_tag_start = text.rfind('<') - if start_tag_end >= end_tag_start: - return text - prefix = text[:start_tag_end] - content = text[start_tag_end:end_tag_start].strip() - suffix = text[end_tag_start:] - if not content: - return text - # Wrap the content in tags - wrapped_content = _wrap_in_translate(content) - return f"{prefix}{wrapped_content}{suffix}" + return "" + text + "" def process_table(text): """ @@ -136,7 +64,7 @@ def process_poem_tag(text): wrapped_content = _wrap_in_translate(content) return f"{prefix}{wrapped_content}{suffix}" -def process_code_tag(text, tvar_code_id=0): +def process_code_tag(text): """ Processes tags in the wikitext. It wraps the content in tags. @@ -153,7 +81,7 @@ def process_code_tag(text, tvar_code_id=0): if not content: return text # Wrap the content in tags - wrapped_content = f'{content}' + wrapped_content = f'{content}' return f"{prefix}{wrapped_content}{suffix}" def process_div(text): @@ -280,7 +208,7 @@ def process_item(text): item_content = text[offset:].strip() if not item_content: return text - return text[:offset] + ' ' + _wrap_in_translate(item_content) + '\n' + return text[:offset] + ' ' + convert_to_translatable_wikitext(item_content) + '\n' class double_brackets_types(Enum): wikilink = 1 @@ -289,8 +217,8 @@ class double_brackets_types(Enum): not_inline_icon_file = 4 special = 5 invalid_file = 6 - -def _process_file(s, tvar_inline_icon_id=0): + +def _process_file(s): # Define keywords that should NOT be translated when found as parameters NON_TRANSLATABLE_KEYWORDS = { 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', @@ -351,7 +279,7 @@ def _process_file(s, tvar_inline_icon_id=0): if is_inline_icon: # return something like: [[File:smiley.png|alt=🙂]] - returnline = f'[[' + '|'.join(tokens) + ']]' + returnline = f'[[' + '|'.join(tokens) + ']]' return returnline, double_brackets_types.inline_icon ############################ @@ -387,7 +315,7 @@ def _process_file(s, tvar_inline_icon_id=0): returnline = '[[' + '|'.join(output_parts) + ']]' return returnline, double_brackets_types.not_inline_icon_file -def process_double_brackets(text, tvar_id=0): +def process_double_brackets(text): """ Processes internal links in the wikitext. It wraps the content in tags. 
@@ -404,7 +332,9 @@ def process_double_brackets(text, tvar_id=0): category_aliases = ['Category:', 'category:', 'Cat:', 'cat:'] file_aliases = ['File:', 'file:', 'Image:', 'image:'] - parts[0] = parts[0].strip() # Clean up the first part + # strip all parts + parts = [part.strip() for part in parts] + # Check if the first part is a category or file alias if parts[0].startswith(tuple(category_aliases)): # Handle category links @@ -417,14 +347,174 @@ def process_double_brackets(text, tvar_id=0): # Handle special pages return f'[[{parts[0]}]]', double_brackets_types.special - # Assuming it's a regular internal link + ############################# + # Managing wikilinks + ############################# + + # List of recognised prefixes for Wikimedia projects (e.g., wikipedia, commons) + # and local/national chapters (e.g., wmde, wmit). + interwiki_prefixes = [ + # Main Projects + "wikipedia", "w", + "wiktionary", "wikt", + "wikinews", "n", + "wikibooks", "b", + "wikiquote", "q", + "wikisource", "s", + "oldwikisource", "s:mul", + "wikispecies", "species", + "wikiversity", "v", + "wikivoyage", "voy", + "wikimedia", "foundation", "wmf", + "commons", "c", + "metawiki", "metawikimedia", "metawikipedia", "meta", "m", + "incubator", + "strategy", + "mediawikiwiki", "mw", + "mediazilla", "bugzilla", + "phabricator", "phab", + "testwiki", + "wikidata", "d", + "wikifunctions", "f", + "wikitech", + "toolforge", + + # National Chapters + "wmar", "wmau", "wmbd", "wmbe", "wmbr", "wmca", "wmcz", "wmdk", + "wmde", "wmfi", "wmhk", "wmhu", "wmin", "wmid", "wmil", "wmit", + "wmnl", "wmmk", "wmno", "wmpl", "wmru", "wmrs", "wmes", "wmse", + "wmch", "wmtw", "wmua", "wmuk", + + # Other Wikimedia Prefixes + "betawikiversity", "v:mul", + "download", "dbdump", "gerrit", "mail", "mailarchive", + "outreach", "otrs", "OTRSwiki", "quality", "spcom", + "ticket", "tools", "tswiki", "svn", "sulutil", + "rev", "wmania", "wm2016", "wm2017" + ] + # Convert the list to a set for efficient lookup/checking. + interwiki_prefixes_set = set(interwiki_prefixes) + # Regex to identify if the link starts with a language code (e.g., 'it:', 'bn:'). + LANGUAGE_CODE_PATTERN = re.compile(r'^[a-z]{2,3}:') + + # Determine the link target (before the pipe) and the display text (after the pipe). + link_title = parts[0] + # If a pipe is present, use the part after it; otherwise, use the link target itself. + display_text = parts[1] if len(parts) > 1 else parts[0] + + # --- 1. Checking for Project/Chapter/Interwiki Prefixes --- + + # We try to extract the prefix (e.g. ":bn:" from ":bn:Page") + first_part_lower = link_title.lower() + + has_known_prefix = False + + # A. Check 1: Simple Language Code Match (e.g., ":it:", ":bn:") + # This covers the explicit requirement: "se inizia con un codice linguistico e i due punti..." + if LANGUAGE_CODE_PATTERN.match(first_part_lower): + has_known_prefix = True + + # B. Check 2: Complex Prefix Parsing (Covers "w:", "commons:", "wmde:", or combined forms) + elif ':' in first_part_lower: + # Split the link by colon, excluding the last part which is the page title. + # Example: ":bn:s:Page" -> segments: ['','bn','s'] + # Example: ":w:de:Page" -> segments: ['', 'w','de'] + # Example: ":commons:File" -> segments: ['', 'commons'] + + segments = first_part_lower.split(':') + + # We look at all segments except the last one (which is the actual page title). + # We stop the search if the last segment (the title) is empty, which happens for links ending in a colon. + # e.g., 'w:' splits to ['w', ''] -> we check 'w'. 
+ limit = len(segments) - 1 + if segments[-1] == '': + limit = len(segments) - 2 + + # Iterate through all prefix segments + for segment in segments[:limit]: + # The empty string segment resulting from a leading colon (e.g., ':w:de:Page' -> first segment is '') is ignored. + if segment: + # Check if the segment is a known project/chapter prefix. + if segment in interwiki_prefixes_set: + has_known_prefix = True + break # Stop checking once any known prefix is found + + # Check if the segment is a language code (e.g., 'de' in 'w:de:Page'). + # We can't use the regex pattern here as it checks for start-of-string. + # A quick check for typical language code length (2 or 3 chars) is used as a proxy, + # although a full language code check would be more robust. + if 2 <= len(segment) <= 3: + # Assuming a 2/3 letter segment that isn't a known prefix is treated as a language code + # for the purpose of avoiding Special:MyLanguage. + has_known_prefix = True + break + + # If the link is complex (multiple colons) or contains a known prefix, + # then it is an interwiki link and should not be routed through Special:MyLanguage. + # The check below remains the same, but 'has_known_prefix' is now robustly set. + + if has_known_prefix or ':' in link_title: + # If it has a prefix (linguistic or project/chapter), DO NOT use Special:MyLanguage. + + # --- 2. Special handling for the ":en:" prefix --- + if first_part_lower.startswith(':en:'): + # For links starting with ':en:', rewrite using the {{lwp|...}} template. + + # The suffix is the page title *without* the ":en:" prefix. + en_suffix = link_title[4:] # Removes ":en:" + capitalised_en_suffix = capitalise_first_letter(en_suffix) + # Case 1: No pipe (e.g., "[[en:About]]") + if len(parts) == 1: + # Target: {{lwp|About}}. Display text: About (en_suffix). + return f'[[{{{{lwp|{capitalised_en_suffix}}}}}|{en_suffix}]]', double_brackets_types.wikilink + + # Case 2: With pipe (e.g., "[[en:About|Read More]]") + if len(parts) == 2: + # Target: {{lwp|About}}. Display text: Read More (display_text). + return f'[[{{{{lwp|{capitalised_en_suffix}}}}}|{display_text}]]', double_brackets_types.wikilink + + # --- 3. Handling all other interwiki/prefixed links (e.g., ":it:", "w:", "wmde:") --- + + # Find the index of the *last* colon to correctly separate the page title + # from the potentially complex prefix (e.g., extract 'Page' from 'bn:Page'). + if link_title.rfind(':') != -1: + # Extract the page title by finding the content after the final colon. + title_without_prefix = link_title[link_title.rfind(':') + 1:] + else: + # Should not happen for prefixed links, but handles the fallback gracefully. + title_without_prefix = link_title + + # Case 1: No pipe (e.g., "[[bn:Page]]" or "[[w:Page]]") + if len(parts) == 1: + # Link target remains link_title (e.g., bn:Page). + # Display text is the title *without* the prefix (e.g., Page). + return f'[[{link_title}|{title_without_prefix}]]', double_brackets_types.wikilink + + # Case 2: With pipe (e.g., "[[bn:Page|Text]]") + if len(parts) == 2: + # Link target remains link_title (e.g., bn:Page). + # Display text is the text after the pipe (e.g., Text). + return f'[[{link_title}|{display_text}]]', double_brackets_types.wikilink + + # --- 4. Standard internal links (No special prefix found) --- + + # For standard internal links, the target must be prefixed with Special:MyLanguage + # to enable automatic localisation. 'capitalise_first_letter' is required here. 
+ + # Case 1: No pipe (e.g., [[Page]]) if len(parts) == 1: - return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[0]}]]', double_brackets_types.wikilink - if len(parts) == 2 : - return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[1]}]]', double_brackets_types.wikilink + # Target: Special:MyLanguage/Page. Display text: Page (link_title). + return f'[[Special:MyLanguage/{capitalise_first_letter(link_title)}|{link_title}]]', double_brackets_types.wikilink + + # Case 2: With pipe (e.g., [[Page|Text]]) + if len(parts) == 2: + # Target: Special:MyLanguage/Page. Display text: Text (display_text). + return f'[[Special:MyLanguage/{capitalise_first_letter(link_title)}|{display_text}]]', double_brackets_types.wikilink + + # Fallback for unexpected link format (e.g., more than one pipe). return text -def process_external_link(text, tvar_url_id=0): +def process_external_link(text): """ Processes external links in the format [http://example.com Description] and ensures that only the description part is wrapped in tags, leaving the URL untouched. @@ -435,7 +525,7 @@ def process_external_link(text, tvar_url_id=0): url_part = match.group(1) description_part = match.group(2) # Wrap only the description part in tags, leave the URL untouched - return f'[{url_part} {description_part}]' + return f'[{url_part} {description_part}]' return text def process_template(text): @@ -466,6 +556,9 @@ def process_raw_url(text): return text return text.strip() +def tag_for_translation(text): + converted_text = convert_to_translatable_wikitext(text) + return set_tvar_names(converted_text) # --- Main Tokenisation Logic --- @@ -479,6 +572,9 @@ def convert_to_translatable_wikitext(wikitext): if not wikitext: return "" + wikitext = wikitext.replace('\r\n', '\n').replace('\r', '\n') + wikitext = fix_wiki_page_spacing(wikitext) + # add an extra newline at the beginning, useful to process items at the beginning of the text wikitext = '\n' + wikitext @@ -496,7 +592,7 @@ def convert_to_translatable_wikitext(wikitext): if last < curr: parts.append((wikitext[last:curr], _wrap_in_translate)) parts.append((wikitext[curr:end_pattern], process_syntax_highlight)) - curr = end_pos + curr = end_pattern last = curr continue # Table block @@ -731,37 +827,33 @@ def convert_to_translatable_wikitext(wikitext): """ # Process links - tvar_id = 0 - tvar_url_id = 0 - tvar_code_id = 0 - tvar_inline_icon_id = 0 for i, (part, handler) in enumerate(parts): # Handlers for links require a tvar_id if handler == process_double_brackets: - new_part, double_brackets_type = handler(part, tvar_id) + new_part, double_brackets_type = handler(part) if double_brackets_type in [double_brackets_types.wikilink, double_brackets_types.special, double_brackets_types.inline_icon]: new_handler = _wrap_in_translate # Change handler to _wrap_in_translate else : new_handler = lambda x: x # No further processing for categories and files parts[i] = (new_part, new_handler) - tvar_id += 1 elif handler == process_external_link: - new_part = handler(part, tvar_url_id) + new_part = handler(part) new_handler = _wrap_in_translate # Change handler to _wrap_in_translate parts[i] = (new_part, new_handler) - tvar_url_id += 1 elif handler == process_code_tag: - new_part = handler(part, tvar_code_id) + new_part = handler(part) new_handler = _wrap_in_translate # Change handler to _wrap_in_translate parts[i] = (new_part, new_handler) - tvar_code_id += 1 elif handler == process_double_brackets : - new_part, double_brackets_type = handler(part, 
tvar_inline_icon_id) + new_part, double_brackets_type = handler(part) if double_brackets_type == double_brackets_types.inline_icon: new_handler = _wrap_in_translate # Change handler to _wrap_in_translate - tvar_inline_icon_id += 1 else: new_handler = lambda x: x + elif handler == process_syntax_highlight : + new_part = handler(part) + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + parts[i] = (new_part, new_handler) # Scan again the parts: merge consecutive parts handled by _wrap_in_translate _parts = [] @@ -785,54 +877,111 @@ def convert_to_translatable_wikitext(wikitext): print("Processed parts:") for i, (ppart, (part, handler)) in enumerate(zip(processed_parts, _parts)): print(f"--- Start element {i} with handler {handler.__name__} ---") - print(part) + print(f"@{part}@") print(f"---\n") - print(ppart) + print(f'@{ppart}@') print(f"---\n") """ # Join the processed parts into a single string - return ''.join(processed_parts)[1:] # Remove the leading newline added at the beginning - -@app.route('/') -def index(): - return render_template('home.html') - -@app.route('/convert', methods=['GET']) -def redirect_to_home(): - return render_template('home.html') - -@app.route('/convert', methods=['POST']) -def convert(): - wikitext = request.form.get('wikitext', '') - converted_text = convert_to_translatable_wikitext(wikitext) - return render_template('home.html', original=wikitext, converted=converted_text) - -@app.route('/api/convert', methods=['GET', 'POST']) -def api_convert(): - if request.method == 'GET': - return """ -
        <h3>Translate Tagger API</h3>
-        <p>Send a POST request with JSON data to use this API.</p>
-        <p>Example:</p>
-        <pre>
-        curl -X POST https://translatetagger.toolforge.org/api/convert \\
-        -H "Content-Type: application/json" \\
-        -d '{"wikitext": "This is a test [[link|example]]"}'
-        </pre>
+ out_wikitext = ''.join(processed_parts) + + # Keep removing all trailing and leading newlines and spaces + while out_wikitext.startswith('\n') or out_wikitext.startswith(' ') or out_wikitext.endswith('\n') or out_wikitext.endswith(' '): + out_wikitext = out_wikitext.strip('\n') + out_wikitext = out_wikitext.strip(' ') + + return out_wikitext + +def set_tvar_names(input_text: str) -> str: + """ + Sets the 'name' attribute of every tag inside a block, + using an increasing counter (starting from 1) for each block. + + This version assumes tags are initially simple, e.g., or . + + Args: + input_text: The input string containing and tags. + + Returns: + The modified string with the 'name' attributes set. + """ + + # 1. Regular expression to find all blocks, including content. + # We use re.DOTALL to ensure the match spans multiple lines. + translate_pattern = re.compile(r'(.*?<\/translate>)', re.DOTALL) + + def process_translate_block(full_block_match): """ - elif request.method == 'POST': - data = request.get_json() - if not data or 'wikitext' not in data: - return jsonify({'error': 'Missing "wikitext" in JSON payload'}), 400 + Callback function for re.sub that processes one block. + It finds all simple tags inside and gives them an incremental 'name' attribute. + """ + # The entire matched block + full_block = full_block_match.group(0) + + # Initialise the counter for the current block + count = 1 + + def substitute_simple_tvar(tvar_match): + """ + Inner callback function to substitute a simple and increment the counter. + """ + nonlocal count + + # The match group 1 captures the opening tag parts: ' becomes + # or becomes + + # This expression handles both and by replacing the final '>' or '/>' + # with the insertion plus the captured closing part (group 2). + name_attribute = f' name="{count}"' + + # Group 2 captures the closing element (either '>' or '/>') + closing_part = tvar_match.group(2) + + new_tag = f'{opening_part}{name_attribute}{closing_part}' + + # Increment the counter for the next + count += 1 + + return new_tag + + # Internal pattern: finds or where 'name' is not present. + # This is a robust pattern for HTML/XML tags where an attribute is to be inserted + # right before the closing bracket. - wikitext = data.get('wikitext', '') - converted_text = convert_to_translatable_wikitext(wikitext) + # Group 1: () - The closing angle bracket (possibly with / for self-closing) + # We need to ensure we don't accidentally match existing 'name' attributes. - return jsonify({ - 'original': wikitext, - 'converted': converted_text - }) + # Simpler pattern for *all* tags, assuming no existing name: + tvar_pattern_inner = re.compile(r'()', re.DOTALL) -if __name__ == '__main__': - app.run(debug=True) + # To strictly avoid tags that *already* contain 'name': + # We use a negative lookahead to ensure "name=" is not present inside + # This pattern is more complex but safer: + tvar_pattern_safer = re.compile(r'(]*name=)[^>]*)(>)', re.IGNORECASE | re.DOTALL) + + # We will utilise the simpler pattern, assuming the context is pre-processing before translation: + tvar_pattern_to_use = re.compile(r'()', re.DOTALL) + + # Apply the substitution to all tags within the current block + modified_block = re.sub( + tvar_pattern_to_use, + substitute_simple_tvar, + full_block + ) + + return modified_block + + # 2. Apply the block processor function to all blocks. 
+ final_result = re.sub( + translate_pattern, + process_translate_block, + input_text + ) + + return final_result \ No newline at end of file diff --git a/translatable_wikitext_converter/wikitranslator_utils.py b/translatable_wikitext_converter/wikitranslator_utils.py new file mode 100644 index 0000000..37860d8 --- /dev/null +++ b/translatable_wikitext_converter/wikitranslator_utils.py @@ -0,0 +1,221 @@ +# --- Utility Functions for Wikitext Conversion --- +# This module contains helper functions that are used across the +# wikitext conversion process. These functions handle tasks such as +# capitalising text, checking for emojis, and wrapping text in +# translation tags. + +import re, sys + +# Pattern to identify section headers (Level 2 or higher) +SECTION_HEADER_PATTERN = re.compile(r'(={2,})\s*(.+?)\s*\1', re.DOTALL) + +# --- Helper Functions for Processing Different Wikitext Elements --- +# These functions are designed to handle specific wikitext structures. +# Some will recursively call the main `convert_to_translatable_wikitext` +# function to process their internal content, ensuring nested elements +# are also handled correctly. + +def capitalise_first_letter(text): + """ + Capitalises the first letter of the given text. + If the text is empty or consists only of whitespace, it returns the text unchanged. + """ + if not text or not text.strip(): + return text + return text[0].upper() + text[1:] + +def is_emoji_unicode(char): + # This is a very simplified set of common emoji ranges. + # A comprehensive list would be much longer and more complex. + # See https://www.unicode.org/Public/emoji/ for full details. + if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons + return True + if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs + return True + if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols + return True + if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols + return True + if 0x2700 <= ord(char) <= 0x27BF: # Dingbats + return True + # Add more ranges as needed for full coverage + return False + +def _wrap_in_translate(text): + """ + Wraps the text with tags. + If the content starts or ends with a section header, it includes the preceding + or succeeding newline in the translation block. + """ + if not text or not text.strip(): + return text + + # 1. Find the indices of the non-whitespace content + first_char_index = -1 + last_char_index = -1 + + # We loop to find the first/last character that is NOT whitespace + for i, char in enumerate(text): + if char not in (' ', '\n', '\t', '\r', '\f', '\v'): + if first_char_index == -1: + first_char_index = i + last_char_index = i + + if first_char_index == -1: + # If no non-whitespace characters are found, return the original text + return text + + # Initial split + leading_whitespace = text[:first_char_index] + content = text[first_char_index : last_char_index + 1] + trailing_whitespace = text[last_char_index + 1 :] + + # 2. 
Initial adjustment (To include the newline above the header) + + # We check if the content starts with a section header + # (We use .match() on content to see if the header is at the very beginning) + match_start = SECTION_HEADER_PATTERN.match(content) + + if match_start and leading_whitespace.endswith('\n'): + # If there is a header and the line above is a '\n', we move the '\n' from leading to content + + # We subtract the '\n' from leading_whitespace + leading_whitespace = leading_whitespace[:-1] + + # We recalculate content to include the preceding '\n' + content = text[first_char_index - 1 : last_char_index + 1] + + # We update first_char_index for subsequent calculations (even if not used here) + first_char_index -= 1 + + + # 3. Final adjustment (To include the newline below the header) + + # We find the last match (to see if the header finishes the content block) + last_match = None + for m in SECTION_HEADER_PATTERN.finditer(content): + last_match = m + + if last_match and last_match.end() == len(content) and trailing_whitespace.startswith('\n'): + # If the header is the last thing and the subsequent block starts with '\n', we include it + + # We remove the '\n' from trailing_whitespace + trailing_whitespace = trailing_whitespace[1:] + + # We extend content to include the subsequent '\n' + content = text[first_char_index : last_char_index + 2] # +2 because index is 0-based + + # 4. Returning the result + return f"{leading_whitespace}{content}{trailing_whitespace}" + +############################################ +# Functions for Fixing Wiki Page Spacing # +############################################ + +def fix_section_title_spacing_internal(title: str) -> str: + """ + Detects a section title and ensures there is exactly one space + between the '=' characters and the title text. + """ + # Pattern: (={2,}) [optional space] (.+?) [optional space] \1 + pattern = SECTION_HEADER_PATTERN + + # Replacement: \1 [space] \2 [space] \1 + return pattern.sub(r'\1 \2 \1', title) + +# --- Main Function to Fix Wiki Page Spacing --- + +def fix_wiki_page_spacing(wiki_text: str) -> str: + """ + Applies the section title spacing fix and enforces consistent newlines + before (one blank line: \n\n) and after (one blank line: \n\n) + every section heading (Level 2 or higher). + + This method guarantees the output format: + ...[Content]\n\n== Title ==\n\n[Next content]... + + :param wiki_text: The full text of the wiki page. + :return: The corrected wiki page text. + """ + + # Pattern to match and replace a heading and its surrounding whitespace: + # 1. (.*?) : Group 1: Non-greedy capture of all content before the heading. + # 2. [\r\n\s]* : Non-capturing group for all existing whitespace/newlines before the heading. + # 3. (^={2,}.*?={2,}$) : Group 2: The actual heading line, anchored to the start of a line (re.M). + # 4. [\n\s]* : Non-capturing group for all existing whitespace/newlines after the heading. + + # We use re.M (multiline) and re.DOTALL (dot matches newline) + heading_and_surroundings_pattern = re.compile( + r'(.*?)[\r\n\s]*(^={2,}.*?={2,}$)[\r\n\s]*', re.M | re.DOTALL + ) + + def heading_replacer_full_format(match): + """ + Callback function for re.sub that fixes spacing and enforces \n\n separation. + """ + # Group 1: Content preceding the heading + content_before = match.group(1).rstrip() + # Group 2: The raw heading line + raw_heading = match.group(2) + + # 1. Fix the internal spacing of the heading + corrected_heading = fix_section_title_spacing_internal(raw_heading) + + # 2. 
Determine the prefix separator: \n\n
+        # If the heading is the first thing on the page (i.e., content_before is empty),
+        # we don't want to prepend \n\n. Otherwise, we do.
+        if content_before:
+            prefix = '\n\n'
+        else:
+            prefix = ''
+
+        # 3. The replacement structure:
+        #    {Content Before}{Prefix}\n{Corrected Heading}\n\n
+        #    The content that follows this match will immediately follow the final \n\n.
+        return f'{content_before}{prefix}{corrected_heading}\n\n'
+
+    # Apply the fix globally
+    corrected_text = heading_and_surroundings_pattern.sub(
+        heading_replacer_full_format,
+        wiki_text
+    )
+
+    # Clean up any residual excess newlines at the very beginning of the page
+    return corrected_text.lstrip('\n')
+
+# Added to allow running this module directly as a script
+if __name__ == '__main__':
+
+    # --- Test data ---
+    # Contains several incorrect section-spacing cases:
+    # 1. Wrong internal spacing (both too much and too little).
+    # 2. Wrong external spacing (too many newlines or none at all).
+    # 3. A heading at the very start of the page (must not get \n\n before it).
+    # 4. Content in between.
+
+    test_wikitext = """
+
+== Ciao ==
+
+ciao
+== Ciao ==
+ciao
+== Ciao ==
+
+ciao
+"""
+
+    print("--- Test of the fix_wiki_page_spacing function ---")
+    print("Original wiki text:\n" + "-"*30)
+    print(test_wikitext)
+    print("-" * 30)
+
+    # Run the function
+    corrected_wikitext = fix_wiki_page_spacing(test_wikitext)
+
+    print("\nCorrected wiki text:\n" + "="*30)
+
+    # We use repr() to show all newlines (\n) and spaces explicitly
+    print(repr(corrected_wikitext))
+    print("=" * 30)
+
\ No newline at end of file
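
Reviewer notes follow, with short runnable sketches against this patch.

The POST branch of the new `/api/convert` in `translatable_wikitext_converter/app.py` calls `convert_to_translatable_wikitext()`, which that module never imports (it only imports `tag_for_translation`), so the endpoint would raise a `NameError`; it presumably should call `tag_for_translation()` like the form-based `/convert` route does. Assuming that correction and a dev server started as in the README (port 5000), a minimal Python equivalent of the curl example:

```python
import requests  # third-party HTTP client, assumed installed

# Same payload as the curl example served by the GET handler.
payload = {"wikitext": "This is a test [[link|example]]"}

# Assumes the local dev server from the README is running on port 5000.
resp = requests.post("http://127.0.0.1:5000/api/convert", json=payload, timeout=10)
resp.raise_for_status()

data = resp.json()
print(data["original"])   # the submitted wikitext
print(data["converted"])  # the translation-tagged wikitext
```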
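
The rewritten `process_double_brackets()` distinguishes three wikilink cases: plain internal links are routed through `Special:MyLanguage`, interwiki or language-prefixed links keep their target and get a display text derived from the title, and `:en:` links are special-cased through the `{{lwp|...}}` template. The expected outputs below are taken directly from the new tests (the surrounding `<translate>` markup, stripped in this rendering of the diff, is omitted):

```python
from translatable_wikitext_converter.wikitranslator import tag_for_translation

# Plain internal link (test_standard_internal_link):
tag_for_translation("[[Some Page]]")       # -> [[Special:MyLanguage/Some Page|Some Page]]

# Language-prefixed interwiki link (test_simple_language_prefix_no_pipe):
tag_for_translation("[[:it:mozzarella]]")  # -> [[:it:mozzarella|mozzarella]]

# ':en:' special case (test_simple_english_special_handling):
tag_for_translation("[[:en:kerala]]")      # -> [[{{lwp|Kerala}}|kerala]]
```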
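
`tag_for_translation()` is a two-stage pipeline: `convert_to_translatable_wikitext()` emits `<tvar>` placeholders without `name` attributes, and `set_tvar_names()` then numbers them with a counter that restarts at 1 for every `<translate>` block. A sketch of the intended numbering, assuming simple attribute-less `<tvar>` tags as the docstring describes:

```python
from translatable_wikitext_converter.wikitranslator import set_tvar_names

text = (
    "<translate>See <tvar>[https://example.org a]</tvar> and "
    "<tvar>[https://example.com b]</tvar></translate>\n"
    "<translate>Also <tvar>[https://example.net c]</tvar></translate>"
)

# Per the docstring: block one yields name="1" and name="2",
# and the counter restarts, so block two yields name="1" again.
print(set_tvar_names(text))
```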
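
`_wrap_in_translate()` in `wikitranslator_utils.py` keeps leading and trailing whitespace outside the wrapping tags, except that a newline directly adjacent to a section heading is pulled inside the block (the behaviour `test_section_headers` exercises). A sketch, assuming the tags stripped from this rendering are `<translate>`/`</translate>`:

```python
from translatable_wikitext_converter.wikitranslator_utils import _wrap_in_translate

# Plain text: surrounding whitespace stays outside the tags.
_wrap_in_translate("  hello  ")        # '  <translate>hello</translate>  '

# A heading pulls its adjacent newlines inside the block.
_wrap_in_translate("\n== Title ==\n")  # '<translate>\n== Title ==\n</translate>'
```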
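
`fix_wiki_page_spacing()` (duplicated in `butta.py` and `wikitranslator_utils.py`) normalises every level-2-or-higher heading to exactly one space inside the `=` markers and one blank line on each side, i.e. `...content\n\n== Title ==\n\n...`. A small usage sketch with made-up input:

```python
from translatable_wikitext_converter.wikitranslator_utils import fix_wiki_page_spacing

raw = "Intro line.\n==Topic1==\nBody text.\n\n\n=== Topic2 ===\nMore text."
print(fix_wiki_page_spacing(raw))
# Intro line.
#
# == Topic1 ==
#
# Body text.
#
# === Topic2 ===
#
# More text.
```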