Allow tab character to be used in user_defined_symbols.
author Taku Kudo <taku@google.com>
Sun, 12 Jun 2022 18:20:23 +0000 (03:20 +0900)
committer Kentaro Hayashi <kenhys@xdump.org>
Mon, 21 Nov 2022 13:43:46 +0000 (13:43 +0000)
Signed-off-by: Kentaro Hayashi <kenhys@gmail.com>
Gbp-Pq: Name 0003-allow-tab-character-to-be-used-in-user_defined_symbo.patch

src/trainer_interface.cc
src/util.cc

index ef0c3704412442ca167562f1349da3556a8dda89..5e26b7521c944a8e16d44048ddb0511a20274994 100644 (file)
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.!
 
+#include "trainer_interface.h"
+
 #include <algorithm>
 #include <cstdlib>
 #include <memory>
@@ -35,7 +37,6 @@
 #include "third_party/absl/strings/str_format.h"
 #include "third_party/absl/strings/str_join.h"
 #include "third_party/absl/strings/str_split.h"
-#include "trainer_interface.h"
 #include "unicode_script.h"
 #include "util.h"
 
@@ -699,6 +700,14 @@ util::Status TrainerInterface::SaveVocab(absl::string_view filename) const {
   auto output = filesystem::NewWritableFile(filename);
   RETURN_IF_ERROR(output->status());
 
+  for (const auto &piece : model_proto.pieces()) {
+    if (piece.piece().find_first_of(" \t\r\n") != std::string::npos) {
+      LOG(WARNING) << "The piece [" << piece.piece()
+                   << "] contains escaped characters that break the format of "
+                   << filename;
+    }
+  }
+
   if (trainer_spec_.vocabulary_output_piece_score()) {
     for (const auto &piece : model_proto.pieces()) {
       std::ostringstream os;
index 8424448c44e691c92399628d134620af70fdabb8..8da16c42e60806c3f551390d8061364f7cdd234f 100644 (file)
 // See the License for the specific language governing permissions and
 // limitations under the License.!
 
-#include <iostream>
-
 #include "util.h"
 
+#include <iostream>
+
 namespace sentencepiece {
 
 namespace {
@@ -217,7 +217,6 @@ std::vector<std::string> StrSplitAsCSV(absl::string_view text) {
 
   std::vector<std::string> result;
   for (; str < eos; ++str) {
-    while (*str == ' ' || *str == '\t') ++str;
     if (*str == '"') {
       start = ++str;
       end = start;