diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 63cf17b86..81e5eb85c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -239,15 +239,15 @@ bazel test //data/config:config_dict_validation_test 2. **`s2tw.json`** - 簡體轉臺灣正體 - 使用 `STPhrases.txt`、`STCharacters.txt` - - 額外使用 `TWVariants.txt` + - 額外使用 `TWVariantsPhrases.txt`、`TWVariants.txt` 3. **`s2twp.json`** - 簡體轉臺灣正體(含慣用詞) - 使用 `STPhrases.txt`、`STCharacters.txt` - - 額外使用 `TWPhrases.txt`、`TWVariants.txt` + - 額外使用 `TWPhrases.txt`、`TWVariantsPhrases.txt`、`TWVariants.txt` 4. **`s2hk.json`** - 簡體轉香港繁體 - 使用 `STPhrases.txt`、`STCharacters.txt` - - 額外使用 `HKVariants.txt` + - 額外使用 `HKVariantsPhrases.txt`、`HKVariants.txt` ### 測試建議 @@ -272,8 +272,8 @@ bazel test //data/config:config_dict_validation_test - **僅修改基本簡繁對應**:修改 `STCharacters.txt`,測試至少包含 `s2t` - **修改詞組轉換**:修改 `STPhrases.txt`,測試包含 `s2t`、`s2tw`、`s2twp`、`s2hk` -- **臺灣特有用詞**:修改 `TWPhrases*.txt` 或 `TWVariants.txt`,測試包含 `s2tw`、`s2twp` -- **香港特有用詞**:修改 `HKVariants*.txt`,測試包含 `s2hk` +- **臺灣特有用詞**:修改 `TWPhrases*.txt` 或 `TWVariantsPhrases.txt`、`TWVariants.txt`,測試包含 `s2tw`、`s2twp` +- **香港特有用詞**:修改 `HKVariantsPhrases.txt`、`HKVariants*.txt`,測試包含 `s2hk` ## 提交變更 diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index 3b7057aaf..562e3b183 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -14,8 +14,10 @@ set( TWPhrases TWPhrasesRev TWVariants + TWVariantsPhrases TWVariantsRevPhrases HKVariants + HKVariantsPhrases HKVariantsRevPhrases JPVariants JPShinjitaiCharacters diff --git a/data/config/s2hk.json b/data/config/s2hk.json index fcaa017ee..9bbb0cbbd 100644 --- a/data/config/s2hk.json +++ b/data/config/s2hk.json @@ -20,8 +20,14 @@ } }, { "dict": { - "type": "ocd2", - "file": "HKVariants.ocd2" + "type": "group", + "dicts": [{ + "type": "ocd2", + "file": "HKVariantsPhrases.ocd2" + }, { + "type": "ocd2", + "file": "HKVariants.ocd2" + }] } }] } diff --git a/data/config/s2tw.json b/data/config/s2tw.json index 2a3d7656b..48cc7fcaf 100644 --- a/data/config/s2tw.json +++ 
b/data/config/s2tw.json @@ -20,8 +20,14 @@ } }, { "dict": { - "type": "ocd2", - "file": "TWVariants.ocd2" + "type": "group", + "dicts": [{ + "type": "ocd2", + "file": "TWVariantsPhrases.ocd2" + }, { + "type": "ocd2", + "file": "TWVariants.ocd2" + }] } }] } diff --git a/data/config/s2twp.json b/data/config/s2twp.json index 2f36e9352..f545cf7ed 100644 --- a/data/config/s2twp.json +++ b/data/config/s2twp.json @@ -25,8 +25,14 @@ } }, { "dict": { - "type": "ocd2", - "file": "TWVariants.ocd2" + "type": "group", + "dicts": [{ + "type": "ocd2", + "file": "TWVariantsPhrases.ocd2" + }, { + "type": "ocd2", + "file": "TWVariants.ocd2" + }] } }] } diff --git a/data/config/t2hk.json b/data/config/t2hk.json index 519d4a3fd..b4c0d4066 100644 --- a/data/config/t2hk.json +++ b/data/config/t2hk.json @@ -4,13 +4,19 @@ "type": "mmseg", "dict": { "type": "ocd2", - "file": "HKVariants.ocd2" + "file": "HKVariantsPhrases.ocd2" } }, "conversion_chain": [{ "dict": { - "type": "ocd2", - "file": "HKVariants.ocd2" + "type": "group", + "dicts": [{ + "type": "ocd2", + "file": "HKVariantsPhrases.ocd2" + }, { + "type": "ocd2", + "file": "HKVariants.ocd2" + }] } }] } diff --git a/data/config/t2tw.json b/data/config/t2tw.json index 0394f600d..bf1a2379f 100644 --- a/data/config/t2tw.json +++ b/data/config/t2tw.json @@ -4,13 +4,19 @@ "type": "mmseg", "dict": { "type": "ocd2", - "file": "TWVariants.ocd2" + "file": "TWVariantsPhrases.ocd2" } }, "conversion_chain": [{ "dict": { - "type": "ocd2", - "file": "TWVariants.ocd2" + "type": "group", + "dicts": [{ + "type": "ocd2", + "file": "TWVariantsPhrases.ocd2" + }, { + "type": "ocd2", + "file": "TWVariants.ocd2" + }] } }] } diff --git a/data/dictionary/DictionaryTest.cpp b/data/dictionary/DictionaryTest.cpp index 69cbf1ecb..1028f4406 100644 --- a/data/dictionary/DictionaryTest.cpp +++ b/data/dictionary/DictionaryTest.cpp @@ -65,10 +65,10 @@ std::string DictionaryRunfilesTest::runfile_dir_; INSTANTIATE_TEST_SUITE_P( , DictionaryTest, ::testing::Values( - 
"HKVariants", "HKVariantsRev", "HKVariantsRevPhrases", + "HKVariants", "HKVariantsPhrases", "HKVariantsRev", "HKVariantsRevPhrases", "JPShinjitaiCharacters", "JPShinjitaiPhrases", "JPVariants", "JPVariantsRev", "STCharacters", "STPhrases", "TSCharacters", - "TSPhrases", "TWPhrases", "TWPhrasesRev", "TWVariants", + "TSPhrases", "TWPhrases", "TWPhrasesRev", "TWVariants", "TWVariantsPhrases", "TWVariantsRev", "TWVariantsRevPhrases"), [](const testing::TestParamInfo& info) { return info.param; diff --git a/data/dictionary/HKVariantsPhrases.txt b/data/dictionary/HKVariantsPhrases.txt new file mode 100644 index 000000000..bbcfcd4a7 --- /dev/null +++ b/data/dictionary/HKVariantsPhrases.txt @@ -0,0 +1,9 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: HKVariantsPhrases.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: s2hk.json, t2hk.json + +喫茶小舖 喫茶小舖 +純喫茶 純喫茶 diff --git a/data/dictionary/TWPhrases.txt b/data/dictionary/TWPhrases.txt index 3639158b3..d600f7418 100644 --- a/data/dictionary/TWPhrases.txt +++ b/data/dictionary/TWPhrases.txt @@ -111,7 +111,6 @@ U盤 隨身碟 哈希 雜湊 哈薩克斯坦 哈薩克 哥斯達黎加 哥斯大黎加 -喫茶小舖 喫茶小舖 單片機 微控制器 回調 回撥 固件 韌體 @@ -362,7 +361,6 @@ U盤 隨身碟 粘貼 貼上 粘貼 紅心大戰 傷心小棧 納米 奈米 -純喫茶 純喫茶 索馬里 索馬利亞 組件 元件 綁定 繫結 diff --git a/data/dictionary/TWPhrasesRev.txt b/data/dictionary/TWPhrasesRev.txt index 909e374bd..2706b19b9 100644 --- a/data/dictionary/TWPhrasesRev.txt +++ b/data/dictionary/TWPhrasesRev.txt @@ -111,7 +111,6 @@ SQL隱碼攻擊 SQL注入 SQL注入攻擊 哈薩克 哈薩克斯坦 哥斯大黎加 哥斯達黎加 啟用 激活 -喫茶小舖 喫茶小舖 喬治亞 格魯吉亞 單核心 宏內核 回撥 回調 @@ -338,7 +337,6 @@ SQL隱碼攻擊 SQL注入 SQL注入攻擊 簽帳金融卡 借記卡 粘貼 粘貼 紐西蘭 新西蘭 -純喫茶 純喫茶 索羅門群島 所羅門羣島 索馬利亞 索馬里 終端使用者 最終用戶 diff --git a/data/dictionary/TWVariantsPhrases.txt b/data/dictionary/TWVariantsPhrases.txt new file mode 100644 index 000000000..d2a352bfb --- /dev/null +++ b/data/dictionary/TWVariantsPhrases.txt @@ -0,0 +1,9 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: 
TWVariantsPhrases.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: s2tw.json, s2twp.json, t2tw.json + +喫茶小舖 喫茶小舖 +純喫茶 純喫茶 diff --git a/node/dicts.gypi b/node/dicts.gypi index 10ef4abff..2e00bb217 100644 --- a/node/dicts.gypi +++ b/node/dicts.gypi @@ -51,6 +51,14 @@ "inputs": ["<(input)"], "outputs": ["<(output_prefix)TSPhrases.ocd2"], "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] + }, { + "action_name": "TWVariantsPhrases", + "variables": { + "input": "<(input_prefix)TWVariantsPhrases.txt", + }, + "inputs": ["<(input)"], + "outputs": ["<(output_prefix)TWVariantsPhrases.ocd2"], + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "TWVariants", "variables": { @@ -107,6 +115,14 @@ "inputs": ["<(input)"], "outputs": ["<(output_prefix)TWPhrasesRev.ocd2"], "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] + }, { + "action_name": "HKVariantsPhrases", + "variables": { + "input": "<(input_prefix)HKVariantsPhrases.txt", + }, + "inputs": ["<(input)"], + "outputs": ["<(output_prefix)HKVariantsPhrases.ocd2"], + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "HKVariants", "variables": { diff --git a/test/testcases/testcases.json b/test/testcases/testcases.json index 9b6599768..d911f8aa9 100755 --- a/test/testcases/testcases.json +++ b/test/testcases/testcases.json @@ -473,6 +473,23 @@ "tw2s": "社群 索罗门群岛 核取方块 核取按钮", "t2s": "社群 索罗门群岛 核取方块 核取按钮" } + }, + { + "id": "BYVoid_OpenCC_PR_447_s", + "input": "拜访吃茶小舖和纯吃茶", + "expected": { + "s2tw": "拜訪喫茶小舖和純喫茶", + "s2twp": "拜訪喫茶小舖和純喫茶", + "s2hk": "拜訪喫茶小舖和純喫茶" + } + }, + { + "id": "BYVoid_OpenCC_PR_447_t", + "input": "拜訪喫茶小舖和純喫茶", + "expected": { + "t2tw": "拜訪喫茶小舖和純喫茶", + "t2hk": "拜訪喫茶小舖和純喫茶" + } } ] }