diff --git a/data/dictionary/DictionaryTest.cpp b/data/dictionary/DictionaryTest.cpp index 93d62ca8..69cbf1ec 100644 --- a/data/dictionary/DictionaryTest.cpp +++ b/data/dictionary/DictionaryTest.cpp @@ -135,30 +135,38 @@ TEST_F(DictionaryRunfilesTest, TWPhrasesReverseMapping) { return map; }; - LexiconPtr twPhrases = loadLexicon(twPhrasesFile); - LexiconPtr twPhrasesRev = loadLexicon(twPhrasesRevFile); - ASSERT_NE(twPhrases, nullptr); - ASSERT_NE(twPhrasesRev, nullptr); - - auto twMap = buildMap(twPhrases); - auto twRevMap = buildMap(twPhrasesRev); - - for (const auto& entry : twMap) { - const std::string& key = entry.first; - for (const auto& value : entry.second) { - auto it = twRevMap.find(value); - EXPECT_TRUE(it != twRevMap.end() && it->second.count(key) > 0) - << "Missing reverse mapping: " << key << " -> " << value; + try { + LexiconPtr twPhrases = loadLexicon(twPhrasesFile); + LexiconPtr twPhrasesRev = loadLexicon(twPhrasesRevFile); + ASSERT_NE(twPhrases, nullptr); + ASSERT_NE(twPhrasesRev, nullptr); + + auto twMap = buildMap(twPhrases); + auto twRevMap = buildMap(twPhrasesRev); + + for (const auto& entry : twMap) { + const std::string& key = entry.first; + for (const auto& value : entry.second) { + auto it = twRevMap.find(value); + EXPECT_TRUE(it != twRevMap.end() && it->second.count(key) > 0) + << "Missing reverse mapping: " << key << " -> " << value; + } } - } - for (const auto& entry : twRevMap) { - const std::string& key = entry.first; - for (const auto& value : entry.second) { - auto it = twMap.find(value); - EXPECT_TRUE(it != twMap.end() && it->second.count(key) > 0) - << "Missing reverse mapping: " << key << " -> " << value; + for (const auto& entry : twRevMap) { + const std::string& key = entry.first; + for (const auto& value : entry.second) { + auto it = twMap.find(value); + EXPECT_TRUE(it != twMap.end() && it->second.count(key) > 0) + << "Missing reverse mapping: " << key << " -> " << value; + } } + } catch (const Exception& ex) { + FAIL() << 
"Exception: " << ex.what(); + } catch (const std::exception& ex) { + FAIL() << "std::exception: " << ex.what(); + } catch (...) { + FAIL() << "Unknown exception thrown during reverse mapping check."; } } diff --git a/data/dictionary/HKVariants.txt b/data/dictionary/HKVariants.txt index e0f68813..37d77a2a 100644 --- a/data/dictionary/HKVariants.txt +++ b/data/dictionary/HKVariants.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: HKVariants.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: s2hk.json, t2hk.json + 僞 偽 兌 兑 叄 叁 diff --git a/data/dictionary/HKVariantsRevPhrases.txt b/data/dictionary/HKVariantsRevPhrases.txt index 3f03fd89..5256bd05 100644 --- a/data/dictionary/HKVariantsRevPhrases.txt +++ b/data/dictionary/HKVariantsRevPhrases.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: HKVariantsRevPhrases.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: hk2s.json, hk2t.json + 一口吃個 一口喫個 一口吃成 一口喫成 一家三口 一家三口 diff --git a/data/dictionary/JPShinjitaiCharacters.txt b/data/dictionary/JPShinjitaiCharacters.txt index 30220aa3..beaa192a 100644 --- a/data/dictionary/JPShinjitaiCharacters.txt +++ b/data/dictionary/JPShinjitaiCharacters.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: JPShinjitaiCharacters.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: jp2t.json + 両 兩 輛 弁 辨 辯 瓣 辦 弁 御 御 禦 diff --git a/data/dictionary/JPShinjitaiPhrases.txt b/data/dictionary/JPShinjitaiPhrases.txt index 3a85c886..8fcbb9e7 100644 --- a/data/dictionary/JPShinjitaiPhrases.txt +++ b/data/dictionary/JPShinjitaiPhrases.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: 
JPShinjitaiPhrases.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: jp2t.json + 一獲千金 一攫千金 丁寧 叮嚀 丁重 鄭重 diff --git a/data/dictionary/JPVariants.txt b/data/dictionary/JPVariants.txt index 3f90b90d..a9cfa000 100644 --- a/data/dictionary/JPVariants.txt +++ b/data/dictionary/JPVariants.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: JPVariants.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: t2jp.json + 乘 乗 亂 乱 亙 亘 diff --git a/data/dictionary/STCharacters.txt b/data/dictionary/STCharacters.txt index 7347645a..90604775 100644 --- a/data/dictionary/STCharacters.txt +++ b/data/dictionary/STCharacters.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: STCharacters.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: s2hk.json, s2t.json, s2tw.json, s2twp.json + 㐷 傌 㐹 㑶 㐹 㐽 偑 diff --git a/data/dictionary/STPhrases.txt b/data/dictionary/STPhrases.txt index 21aa4ccd..b92e2273 100644 --- a/data/dictionary/STPhrases.txt +++ b/data/dictionary/STPhrases.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: STPhrases.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: s2hk.json, s2t.json, s2tw.json, s2twp.json + 㓦划 㓦劃 一丝不挂 一絲不掛 一了心愿 一了心願 diff --git a/data/dictionary/TSCharacters.txt b/data/dictionary/TSCharacters.txt index a2365145..31361395 100644 --- a/data/dictionary/TSCharacters.txt +++ b/data/dictionary/TSCharacters.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: TSCharacters.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see 
LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: hk2s.json, t2s.json, tw2s.json, tw2sp.json + 㑮 𫝈 㑯 㑔 㑳 㑇 diff --git a/data/dictionary/TSPhrases.txt b/data/dictionary/TSPhrases.txt index 792a1cad..7d13948d 100644 --- a/data/dictionary/TSPhrases.txt +++ b/data/dictionary/TSPhrases.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: TSPhrases.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: hk2s.json, t2s.json, tw2s.json, tw2sp.json + 一目瞭然 一目了然 上鍊 上链 不瞭解 不了解 diff --git a/data/dictionary/TWPhrases.txt b/data/dictionary/TWPhrases.txt index be6ac7a3..9b0a7613 100644 --- a/data/dictionary/TWPhrases.txt +++ b/data/dictionary/TWPhrases.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: TWPhrases.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: s2twp.json (via TWPhrases.ocd2) + PN結 PN接面 SQL注入 SQL隱碼攻擊 SQL注入攻擊 SQL隱碼攻擊 diff --git a/data/dictionary/TWPhrasesRev.txt b/data/dictionary/TWPhrasesRev.txt index c8a3d19a..820a9140 100644 --- a/data/dictionary/TWPhrasesRev.txt +++ b/data/dictionary/TWPhrasesRev.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: TWPhrasesRev.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: tw2sp.json (via TWPhrasesRev.ocd2) + PN接面 PN結 SQL隱碼攻擊 SQL注入 SQL注入攻擊 三極體 三極管 diff --git a/data/dictionary/TWVariants.txt b/data/dictionary/TWVariants.txt index 023a0687..cadffb17 100644 --- a/data/dictionary/TWVariants.txt +++ b/data/dictionary/TWVariants.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: TWVariants.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: 
https://github.com/ByVoid/OpenCC +# Used in configs: s2tw.json, s2twp.json, t2tw.json + 僞 偽 啓 啟 喫 吃 diff --git a/data/dictionary/TWVariantsRevPhrases.txt b/data/dictionary/TWVariantsRevPhrases.txt index ec94209d..05c774d9 100644 --- a/data/dictionary/TWVariantsRevPhrases.txt +++ b/data/dictionary/TWVariantsRevPhrases.txt @@ -1,3 +1,10 @@ +# Open Chinese Convert (OpenCC) Dictionary +# File: TWVariantsRevPhrases.txt +# Format: key value(s) (values separated by spaces) +# License: Apache-2.0 (see LICENSE) +# Source: https://github.com/ByVoid/OpenCC +# Used in configs: tw2s.json, tw2sp.json, tw2t.json + 一口吃個 一口喫個 一口吃成 一口喫成 一家三口 一家三口 diff --git a/data/scripts/common.py b/data/scripts/common.py index addd3c02..83a7d440 100644 --- a/data/scripts/common.py +++ b/data/scripts/common.py @@ -6,26 +6,174 @@ def sort_items(input_filename, output_filename): input_file = codecs.open(input_filename, "r", encoding="utf-8") - dic = {} - - for line in input_file: - if len(line) == 0 or line == '\n': - continue - try: - key, value = line.split("\t") - except ValueError: - print(line) - while value[-1] == "\n" or value[-1] == "\r": - value = value[:-1] - dic[key] = value + lines = [line.rstrip("\r\n") for line in input_file] input_file.close() + def line_type(line): + if line == "" or line.strip() == "": + return "empty" + if line.startswith("#"): + return "comment" + if "\t" in line: + return "entry" + raise ValueError("Invalid dictionary line: " + line) + + parsed = [] + for line in lines: + parsed.append({"type": line_type(line), "content": line}) + + entry_lines = [i for i, p in enumerate(parsed) if p["type"] == "entry"] + if not entry_lines: + header_blocks = [] + current = [] + for p in parsed: + if p["type"] == "comment": + current.append(p["content"]) + elif p["type"] == "empty": + if current: + header_blocks.append(list(current)) + current = [] + if current: + header_blocks.append(list(current)) + + output_file = open(output_filename, "wb") + for idx, block in 
enumerate(header_blocks): + for line in block: + output_file.write((line + "\n").encode("utf-8")) + if idx < len(header_blocks) - 1: + output_file.write(b"\n") + if header_blocks: + output_file.write(b"\n") + output_file.close() + return + + first_entry = entry_lines[0] + last_entry = entry_lines[-1] + + header_end = -1 + for i in range(first_entry - 1, -1, -1): + if parsed[i]["type"] == "empty": + header_end = i + break + + header_blocks = [] + current = [] + for i in range(0, header_end + 1): + if parsed[i]["type"] == "comment": + current.append(parsed[i]["content"]) + elif parsed[i]["type"] == "empty": + if current: + header_blocks.append(list(current)) + current = [] + if current: + header_blocks.append(list(current)) + + footer_blocks = [] + current = [] + for i in range(last_entry + 1, len(parsed)): + if parsed[i]["type"] == "comment": + current.append(parsed[i]["content"]) + elif parsed[i]["type"] == "empty": + if current: + footer_blocks.append(list(current)) + current = [] + if current: + footer_blocks.append(list(current)) + + annotated_entries = [] + floating_blocks = [] + current = [] + entry_index = 0 + for i in range(header_end + 1, last_entry + 1): + p = parsed[i] + if p["type"] == "comment": + current.append(p["content"]) + continue + if p["type"] == "empty": + if current: + floating_blocks.append({"anchor": entry_index, "lines": list(current)}) + current = [] + continue + if p["type"] == "entry": + attached = None + if current: + has_empty = False + for j in range(i - 1, -1, -1): + if parsed[j]["type"] == "entry": + break + if parsed[j]["type"] == "empty": + has_empty = True + break + if has_empty: + floating_blocks.append({"anchor": entry_index, "lines": list(current)}) + else: + attached = list(current) + current = [] + + key, value = p["content"].split("\t", 1) + annotated_entries.append( + { + "key": key, + "value": value, + "attached": attached, + "original_index": entry_index, + } + ) + entry_index += 1 + + if current: + 
floating_blocks.append({"anchor": entry_index, "lines": list(current)}) + + annotated_entries.sort(key=lambda e: e["key"]) + index_map = {e["original_index"]: i for i, e in enumerate(annotated_entries)} + for block in floating_blocks: + if block["anchor"] in index_map: + block["anchor"] = index_map[block["anchor"]] + else: + block["anchor"] = len(annotated_entries) + + floating_by_anchor = {} + for block in floating_blocks: + floating_by_anchor.setdefault(block["anchor"], []).append(block["lines"]) + output_file = open(output_filename, "wb") - for key in sorted(dic.keys()): - line = key + "\t" + dic[key] + "\n" - output_file.write(line.encode('utf-8')) + for idx, block in enumerate(header_blocks): + for line in block: + output_file.write((line + "\n").encode("utf-8")) + if idx < len(header_blocks) - 1: + output_file.write(b"\n") + if header_blocks and annotated_entries: + output_file.write(b"\n") + + for i, entry in enumerate(annotated_entries): + for block in floating_by_anchor.get(i, []): + output_file.write(b"\n") + for line in block: + output_file.write((line + "\n").encode("utf-8")) + output_file.write(b"\n") + + if entry["attached"]: + for line in entry["attached"]: + output_file.write((line + "\n").encode("utf-8")) + output_file.write( + (entry["key"] + "\t" + entry["value"] + "\n").encode("utf-8") + ) + + for block in floating_by_anchor.get(len(annotated_entries), []): + output_file.write(b"\n") + for line in block: + output_file.write((line + "\n").encode("utf-8")) + + if footer_blocks: + if annotated_entries: + output_file.write(b"\n") + for idx, block in enumerate(footer_blocks): + for line in block: + output_file.write((line + "\n").encode("utf-8")) + if idx < len(footer_blocks) - 1: + output_file.write(b"\n") output_file.close() @@ -35,7 +183,8 @@ def reverse_items(input_filename, output_filename): dic = {} for line in input_file: - if len(line) == 0: + stripped = line.strip() + if not stripped or stripped.startswith("#"): continue key, value = 
line.split("\t") while value[-1] == "\n" or value[-1] == "\r": @@ -62,7 +211,8 @@ def reverse_items(input_filename, output_filename): def find_target_items(input_filename, keyword): input_file = codecs.open(input_filename, "r", encoding="utf-8") for line in input_file: - if len(line) == 0: + stripped = line.strip() + if not stripped or stripped.startswith("#"): continue key, value = line.split("\t") while value[-1] == "\n" or value[-1] == "\r": diff --git a/src/BUILD.bazel b/src/BUILD.bazel index e1e5f24d..ddc60b5f 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -275,6 +275,16 @@ cc_library( ], ) +cc_test( + name = "lexicon_annotation_test", + srcs = ["LexiconAnnotationTest.cpp"], + deps = [ + ":text_dict", + ":text_dict_test_base", + "@googletest//:gtest_main", + ], +) + cc_library( name = "marisa_dict", srcs = ["MarisaDict.cpp"], @@ -322,7 +332,10 @@ cc_library( name = "phrase_extract", srcs = ["PhraseExtract.cpp"], hdrs = ["PhraseExtract.hpp"], - visibility = ["//src/tools:__pkg__"], + visibility = [ + "//src:__pkg__", + "//src/tools:__pkg__", + ], deps = [ ":common", ":marisa_dict", @@ -330,6 +343,17 @@ cc_library( ], ) +cc_test( + name = "phrase_extract_test", + srcs = ["PhraseExtractTest.cpp"], + deps = [ + ":phrase_extract", + ":test_utils", + ":test_utils_utf8", + "@googletest//:gtest_main", + ], +) + pybind_extension( name = "opencc_clib", srcs = ["py_opencc.cpp"], @@ -470,6 +494,16 @@ cc_library( ], ) +cc_test( + name = "utf8_string_slice_test", + srcs = ["UTF8StringSliceTest.cpp"], + deps = [ + ":test_utils", + ":utf8_string_slice", + "@googletest//:gtest_main", + ], +) + cc_library( name = "utf8_util", srcs = ["UTF8Util.cpp"], diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7768c89d..227e6c65 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -72,6 +72,7 @@ set(UNITTESTS ConversionChainTest ConversionTest DictGroupTest + LexiconAnnotationTest MarisaDictTest MaxMatchSegmentationTest PhraseExtractTest diff --git 
a/src/DictConverter.cpp b/src/DictConverter.cpp index 40e46d1f..d9953f83 100644 --- a/src/DictConverter.cpp +++ b/src/DictConverter.cpp @@ -17,8 +17,11 @@ */ #include "DictConverter.hpp" +#include "Exception.hpp" +#include "Lexicon.hpp" #include "MarisaDict.hpp" #include "TextDict.hpp" +#include "UTF8Util.hpp" #ifdef ENABLE_DARTS #include "DartsDict.hpp" @@ -29,7 +32,19 @@ using namespace opencc; DictPtr LoadDictionary(const std::string& format, const std::string& inputFileName) { if (format == "text") { - return SerializableDict::NewFromFile(inputFileName); + FILE* fp = +#ifdef _MSC_VER + _wfopen(UTF8Util::GetPlatformString(inputFileName).c_str(), L"r") +#else + fopen(UTF8Util::GetPlatformString(inputFileName).c_str(), "r") +#endif + ; + if (!fp) { + throw FileNotFound(inputFileName); + } + DictPtr dict = TextDict::NewFromFile(fp); + fclose(fp); + return dict; } else if (format == "ocd") { #ifdef ENABLE_DARTS return SerializableDict::NewFromFile(inputFileName); diff --git a/src/Lexicon.cpp b/src/Lexicon.cpp index cfb215c4..4429edf7 100644 --- a/src/Lexicon.cpp +++ b/src/Lexicon.cpp @@ -81,8 +81,13 @@ LexiconPtr Lexicon::ParseLexiconFromFile(FILE* fp) { char buff[ENTRY_BUFF_SIZE]; LexiconPtr lexicon(new Lexicon); UTF8Util::SkipUtf8Bom(fp); + size_t lineNum = 1; while (fgets(buff, ENTRY_BUFF_SIZE, fp)) { + if (*buff == '#') { + lineNum++; + continue; + } DictEntry* entry = ParseKeyValues(buff, lineNum); if (entry != nullptr) { lexicon->Add(entry); diff --git a/src/LexiconAnnotationTest.cpp b/src/LexiconAnnotationTest.cpp new file mode 100644 index 00000000..9a985b29 --- /dev/null +++ b/src/LexiconAnnotationTest.cpp @@ -0,0 +1,179 @@ +/* + * Open Chinese Convert (OpenCC) LexiconAnnotationTest + * + * Copyright 2026 Frank Lin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Lexicon.hpp" +#include "SerializableDict.hpp" +#include "TestUtils.hpp" +#include "TestUtilsUTF8.hpp" +#include "TextDict.hpp" + +namespace opencc { + +class LexiconAnnotationTest : public ::testing::Test { +protected: + const std::string testFileName = "test_annotation_dict.txt"; + + void TearDown() override { remove(testFileName.c_str()); } +}; + +TEST_F(LexiconAnnotationTest, ParseCommentLines) { + FILE* fp = fopen(testFileName.c_str(), "w"); + fprintf(fp, "# This is a header comment\n"); + fprintf(fp, "# Line 2 of header\n"); + fprintf(fp, "\n"); + fprintf(fp, "A\tB\n"); + fprintf(fp, "C\tD\n"); + fclose(fp); + + FILE* readFp = fopen(testFileName.c_str(), "r"); + const TextDictPtr& dict = TextDict::NewFromFile(readFp); + fclose(readFp); + EXPECT_EQ(dict->GetLexicon()->Length(), 2); +} + +TEST_F(LexiconAnnotationTest, ParseAttachedComment) { + FILE* fp = fopen(testFileName.c_str(), "w"); + fprintf(fp, "# Header\n"); + fprintf(fp, "\n"); + fprintf(fp, "# Comment for A\n"); + fprintf(fp, "A\tB\n"); + fprintf(fp, "C\tD\n"); + fclose(fp); + + FILE* readFp = fopen(testFileName.c_str(), "r"); + const TextDictPtr& dict = TextDict::NewFromFile(readFp); + fclose(readFp); + EXPECT_EQ(dict->GetLexicon()->Length(), 2); +} + +TEST_F(LexiconAnnotationTest, ParseFloatingComment) { + FILE* fp = fopen(testFileName.c_str(), "w"); + fprintf(fp, "A\tB\n"); + fprintf(fp, "\n"); + fprintf(fp, "# This is a floating comment\n"); + fprintf(fp, "\n"); + fprintf(fp, "C\tD\n"); + fclose(fp); + + FILE* readFp = fopen(testFileName.c_str(), "r"); + const 
TextDictPtr& dict = TextDict::NewFromFile(readFp); + fclose(readFp); + EXPECT_EQ(dict->GetLexicon()->Length(), 2); +} + +TEST_F(LexiconAnnotationTest, ParseFooterComment) { + FILE* fp = fopen(testFileName.c_str(), "w"); + fprintf(fp, "A\tB\n"); + fprintf(fp, "C\tD\n"); + fprintf(fp, "\n"); + fprintf(fp, "# Footer comment\n"); + fprintf(fp, "# Line 2 of footer\n"); + fclose(fp); + + FILE* readFp = fopen(testFileName.c_str(), "r"); + const TextDictPtr& dict = TextDict::NewFromFile(readFp); + fclose(readFp); + EXPECT_EQ(dict->GetLexicon()->Length(), 2); +} + +TEST_F(LexiconAnnotationTest, SerializeIgnoresComments) { + FILE* fp = fopen(testFileName.c_str(), "w"); + fprintf(fp, "# Header\n"); + fprintf(fp, "\n"); + fprintf(fp, "# Comment for B\n"); + fprintf(fp, "B\tBB\n"); + fprintf(fp, "A\tAA\n"); + fprintf(fp, "\n"); + fprintf(fp, "# Footer\n"); + fclose(fp); + + FILE* readFp = fopen(testFileName.c_str(), "r"); + const TextDictPtr& dict = TextDict::NewFromFile(readFp); + fclose(readFp); + + // Serialize back + const std::string outputFileName = "test_annotation_dict_output.txt"; + FILE* outFp = fopen(outputFileName.c_str(), "w"); + dict->SerializeToFile(outFp); + fclose(outFp); + + // Read back and verify + FILE* outputFp = fopen(outputFileName.c_str(), "r"); + char buff[1024]; + std::vector<std::string> lines; + while (fgets(buff, sizeof(buff), outputFp)) { + std::string line(buff); + while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) { + line.pop_back(); + } + lines.push_back(line); + } + fclose(outputFp); + remove(outputFileName.c_str()); + + EXPECT_EQ(lines.size(), 2); + EXPECT_EQ(lines[0], "A\tAA"); + EXPECT_EQ(lines[1], "B\tBB"); +} + +TEST_F(LexiconAnnotationTest, SortIgnoresComments) { + FILE* fp = fopen(testFileName.c_str(), "w"); + fprintf(fp, "# Header\n"); + fprintf(fp, "\n"); + fprintf(fp, "# Comment for C\n"); + fprintf(fp, "C\tCC\n"); + fprintf(fp, "# Comment for A\n"); + fprintf(fp, "A\tAA\n"); + fprintf(fp, "B\tBB\n"); + fclose(fp); + + FILE* 
readFp = fopen(testFileName.c_str(), "r"); + const TextDictPtr& dict = TextDict::NewFromFile(readFp); + fclose(readFp); + + EXPECT_EQ(dict->GetLexicon()->Length(), 3); +} + +TEST_F(LexiconAnnotationTest, DefaultBehaviorIgnoresComments) { + FILE* fp = fopen(testFileName.c_str(), "w"); + fprintf(fp, "A\tB\n"); + fprintf(fp, "C\tD\n"); + fclose(fp); + + // Default behavior should ignore comments + FILE* readFp = fopen(testFileName.c_str(), "r"); + const TextDictPtr& dict = TextDict::NewFromFile(readFp); + fclose(readFp); + + EXPECT_EQ(dict->GetLexicon()->Length(), 2); +} + +TEST_F(LexiconAnnotationTest, DefaultBehaviorAcceptsCommentLines) { + FILE* fp = fopen(testFileName.c_str(), "w"); + fprintf(fp, "# This is a comment\n"); + fprintf(fp, "A\tB\n"); + fclose(fp); + + FILE* readFp = fopen(testFileName.c_str(), "r"); + const TextDictPtr& dict = TextDict::NewFromFile(readFp); + fclose(readFp); + + EXPECT_EQ(dict->GetLexicon()->Length(), 1); +} + +} // namespace opencc