From 3a58ed8f26f26c6905394c5c9b2df1a8b83105e4 Mon Sep 17 00:00:00 2001
From: Pavol Mulinka
Date: Tue, 12 Jan 2021 22:40:14 +0100
Subject: [PATCH 1/2] issue fix found in editquality repository test

---
 revscoring/features/wikitext/datasources/tokenized.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/revscoring/features/wikitext/datasources/tokenized.py b/revscoring/features/wikitext/datasources/tokenized.py
index ee241048..9b8e2ba6 100644
--- a/revscoring/features/wikitext/datasources/tokenized.py
+++ b/revscoring/features/wikitext/datasources/tokenized.py
@@ -455,7 +455,7 @@ def tokenized(text_datasource, name=None, tok_strategy="Latin"):
     Constructs a :class:`revision.Datasource` that generates a list of tokens
     """
     if name is None:
-        name = "{0}({1})".format("tokenized", text_datasource)
+        name = "{0}({1!r}, {2!r})".format("tokenized", text_datasource, tok_strategy)
 
     if tok_strategy == "Latin":
         return Datasource(

From d6efb4511b737c2f346400840ee4a2d201fa953d Mon Sep 17 00:00:00 2001
From: Pavol Mulinka
Date: Tue, 12 Jan 2021 23:10:33 +0100
Subject: [PATCH 2/2] added minimal test

---
 .gitignore                                    |  1 +
 .../features/wikitext/tests/test_tokenized.py | 21 +++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/.gitignore b/.gitignore
index 93fa611e..0f91b330 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ datasets/
 models/*.model
 
 # Distribution / packaging
+.vscode/
 .Python
 env/
 bin/
diff --git a/tests/features/wikitext/tests/test_tokenized.py b/tests/features/wikitext/tests/test_tokenized.py
index 8859626e..7c9c839e 100644
--- a/tests/features/wikitext/tests/test_tokenized.py
+++ b/tests/features/wikitext/tests/test_tokenized.py
@@ -240,3 +240,24 @@ def test_cjk_tokens_features():
 def test_tokens_diff_features():
     assert (solve(revision.diff.token_delta_increase, cache={r_text: text, p_text: p_text_text}) == 0)
     assert (solve(revision.diff.token_delta_decrease, cache={r_text: text, p_text: p_text_text}) == -4)
+
+
+# related to https://github.com/wikimedia/editquality/pull/232
+def test_cjk_tokenization_naming_01():
+    r_text = revision_oriented.revision.text
+    r_text_text = 'れた'
+
+    cache = {r_text: r_text_text}
+
+    assert (list(solve([revision.cjk_chars, revision.cjk.tokens], cache=cache)) ==
+            [2, 2.0])
+
+
+def test_cjk_tokenization_naming_02():
+    r_text = revision_oriented.revision.text
+    r_text_text = 'れた'
+
+    cache = {r_text: r_text_text}
+
+    assert (list(solve([revision.cjk.tokens, revision.cjk_chars], cache=cache)) ==
+            [2.0, 2])