From cbc353f7449ced19d2fb65fa8aa7a90a452f8110 Mon Sep 17 00:00:00 2001 From: Carbo Kuo Date: Thu, 25 Feb 2021 21:13:38 +0900 Subject: [PATCH] Fix a severe performance bug in `Conversion::Convert` that caused O(N^2) complexity. In `Conversion.cpp`, line 27: ``` Optional matched = dict->MatchPrefix(pstr); ``` pstr is a `const char*`. However, there is no overloaded function which parameter `const char*`. Therefore it matches `Optional MatchPrefix(const std::string& word) const`. There is an implicit type conversion from `char*` to `std::string` with time complexity O(N). I added new benchmark tests. Before the fix: 1: ------------------------------------------------------------------ 1: Benchmark Time CPU Iterations 1: ------------------------------------------------------------------ 1: BM_Initialization/hk2s 1.17 ms 1.12 ms 645 1: BM_Initialization/hk2t 0.116 ms 0.116 ms 5922 1: BM_Initialization/jp2t 0.206 ms 0.201 ms 3500 1: BM_Initialization/s2hk 18.2 ms 17.9 ms 40 1: BM_Initialization/s2t 18.2 ms 18.1 ms 39 1: BM_Initialization/s2tw 17.9 ms 17.8 ms 39 1: BM_Initialization/s2twp 18.6 ms 18.4 ms 39 1: BM_Initialization/t2hk 0.055 ms 0.054 ms 12907 1: BM_Initialization/t2jp 0.120 ms 0.117 ms 5978 1: BM_Initialization/t2s 0.988 ms 0.984 ms 710 1: BM_Initialization/tw2s 1.08 ms 1.05 ms 672 1: BM_Initialization/tw2sp 1.26 ms 1.24 ms 563 1: BM_Initialization/tw2t 0.088 ms 0.083 ms 8528 1: BM_Convert2M 413 ms 413 ms 2 1: BM_Convert/100 1.09 ms 1.09 ms 629 1: BM_Convert/1000 33.2 ms 33.2 ms 21 1: BM_Convert/10000 2822 ms 2822 ms 1 1: BM_Convert/100000 (took longer than 5 minutes, killed) Now: 1: ------------------------------------------------------------------ 1: Benchmark Time CPU Iterations 1: ------------------------------------------------------------------ 1: BM_Initialization/hk2s 1.07 ms 1.07 ms 650 1: BM_Initialization/hk2t 0.114 ms 0.114 ms 6092 1: BM_Initialization/jp2t 0.204 ms 0.200 ms 3503 1: BM_Initialization/s2hk 18.2 ms 18.0 ms 40 1: BM_Initialization/s2t 17.6 ms 17.6 ms 39 1: BM_Initialization/s2tw 18.0 ms 17.9 ms 40 1: BM_Initialization/s2twp 17.9 ms 17.9 ms 39 1: BM_Initialization/t2hk 0.055 ms 0.055 ms 12855 1: BM_Initialization/t2jp 0.125 ms 0.124 ms 5772 1: BM_Initialization/t2s 1.000 ms 0.989 ms 695 1: BM_Initialization/tw2s 1.09 ms 1.07 ms 668 1: BM_Initialization/tw2sp 1.29 ms 1.26 ms 558 1: BM_Initialization/tw2t 0.082 ms 0.082 ms 8558 1: BM_Convert2M 361 ms 361 ms 2 1: BM_Convert/100 0.741 ms 0.740 ms 948 1: BM_Convert/1000 7.54 ms 7.52 ms 94 1: BM_Convert/10000 76.3 ms 76.3 ms 9 1: BM_Convert/100000 786 ms 786 ms 1 This bug was reported in https://github.com/BYVoid/OpenCC/issues/478 and https://github.com/BYVoid/OpenCC/issues/517. Gbp-Pq: Name 0007-Fix-a-severe-performance-bug-in-Conversion-Convert-t.patch --- src/Dict.hpp | 7 ++++ src/benchmark/Performance.cpp | 77 ++++++++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/src/Dict.hpp b/src/Dict.hpp index 461d6d2..1c81034 100644 --- a/src/Dict.hpp +++ b/src/Dict.hpp @@ -49,6 +49,13 @@ public: virtual Optional MatchPrefix(const char* word, size_t len) const; + /** + * Matches the longest matched prefix of a word. + */ + Optional MatchPrefix(const char* word) const { + return MatchPrefix(word, KeyMaxLength()); + } + /** * Matches the longest matched prefix of a word. */ diff --git a/src/benchmark/Performance.cpp b/src/benchmark/Performance.cpp index cf8d3aa..d1b6468 100644 --- a/src/benchmark/Performance.cpp +++ b/src/benchmark/Performance.cpp @@ -1,7 +1,26 @@ +/* + * Open Chinese Convert + * + * Copyright 2020-2021 Carbo Kuo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include #include #include #include +#include #include #ifdef _MSC_VER @@ -44,21 +63,32 @@ static void BM_Initialization(benchmark::State& state, state.ResumeTiming(); } } -BENCHMARK_CAPTURE(BM_Initialization, hk2s, "hk2s"); -BENCHMARK_CAPTURE(BM_Initialization, hk2t, "hk2t"); -BENCHMARK_CAPTURE(BM_Initialization, jp2t, "jp2t"); -BENCHMARK_CAPTURE(BM_Initialization, s2hk, "s2hk"); -BENCHMARK_CAPTURE(BM_Initialization, s2t, "s2t"); -BENCHMARK_CAPTURE(BM_Initialization, s2tw, "s2tw"); -BENCHMARK_CAPTURE(BM_Initialization, s2twp, "s2twp"); -BENCHMARK_CAPTURE(BM_Initialization, t2hk, "t2hk"); -BENCHMARK_CAPTURE(BM_Initialization, t2jp, "t2jp"); -BENCHMARK_CAPTURE(BM_Initialization, t2s, "t2s"); -BENCHMARK_CAPTURE(BM_Initialization, tw2s, "tw2s"); -BENCHMARK_CAPTURE(BM_Initialization, tw2sp, "tw2sp"); -BENCHMARK_CAPTURE(BM_Initialization, tw2t, "tw2t"); +BENCHMARK_CAPTURE(BM_Initialization, hk2s, "hk2s") + ->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, hk2t, "hk2t") + ->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, jp2t, "jp2t") + ->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, s2hk, "s2hk") + ->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, s2t, "s2t")->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, s2tw, "s2tw") + ->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, s2twp, "s2twp") + ->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, t2hk, "t2hk") + ->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, t2jp, "t2jp") + ->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, t2s, "t2s")->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, tw2s, "tw2s") + ->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, tw2sp, "tw2sp") + ->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Initialization, tw2t, "tw2t") + ->Unit(benchmark::kMillisecond); -static void BM_Convert(benchmark::State& state) { +static void BM_Convert2M(benchmark::State& state) { const std::string config_name = "s2t"; const std::string text = ReadText("zuozhuan.txt"); const std::unique_ptr converter(Initialize(config_name)); @@ -66,7 +96,24 @@ static void BM_Convert(benchmark::State& state) { Convert(converter.get(), text); } } -BENCHMARK(BM_Convert)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_Convert2M)->Unit(benchmark::kMillisecond); + +static void BM_Convert(benchmark::State& state, int iteration) { + std::ostringstream os; + for (int i = 0; i < iteration; i++) { + os << "Open Chinese Convert 開放中文轉換" << i << std::endl; + } + const std::string text = os.str(); + const std::string config_name = "s2t"; + const std::unique_ptr converter(Initialize(config_name)); + for (auto _ : state) { + Convert(converter.get(), text); + } +} +BENCHMARK_CAPTURE(BM_Convert, 100, 100)->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Convert, 1000, 1000)->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Convert, 10000, 10000)->Unit(benchmark::kMillisecond); +BENCHMARK_CAPTURE(BM_Convert, 100000, 100000)->Unit(benchmark::kMillisecond); } // namespace opencc -- 2.30.2