From: Taku Kudo Date: Sun, 12 Jun 2022 18:20:23 +0000 (+0900) Subject: allow tab character to be used in user_defined_symbols. X-Git-Tag: archive/raspbian/0.1.97-3+rpi1^2~25 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=3a74b266c43ef1aab88dbb59a3e1d40a43d659a8;p=sentencepiece.git allow tab character to be used in user_defined_symbols. Signed-off-by: Kentaro Hayashi Gbp-Pq: Name 0003-allow-tab-character-to-be-used-in-user_defined_symbo.patch --- diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc index ef0c370..5e26b75 100644 --- a/src/trainer_interface.cc +++ b/src/trainer_interface.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License.! +#include "trainer_interface.h" + #include #include #include @@ -35,7 +37,6 @@ #include "third_party/absl/strings/str_format.h" #include "third_party/absl/strings/str_join.h" #include "third_party/absl/strings/str_split.h" -#include "trainer_interface.h" #include "unicode_script.h" #include "util.h" @@ -699,6 +700,14 @@ util::Status TrainerInterface::SaveVocab(absl::string_view filename) const { auto output = filesystem::NewWritableFile(filename); RETURN_IF_ERROR(output->status()); + for (const auto &piece : model_proto.pieces()) { + if (piece.piece().find_first_of(" \t\r\n") != std::string::npos) { + LOG(WARNING) << "The piece [" << piece.piece() + << "] contains escaped characters that break the format of " + << filename; + } + } + if (trainer_spec_.vocabulary_output_piece_score()) { for (const auto &piece : model_proto.pieces()) { std::ostringstream os; diff --git a/src/util.cc b/src/util.cc index 8424448..8da16c4 100644 --- a/src/util.cc +++ b/src/util.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License.! -#include - #include "util.h" +#include + namespace sentencepiece { namespace { @@ -217,7 +217,6 @@ std::vector StrSplitAsCSV(absl::string_view text) { std::vector result; for (; str < eos; ++str) { - while (*str == ' ' || *str == '\t') ++str; if (*str == '"') { start = ++str; end = start;