Add test to use tab as user defined symbols.

author Taku Kudo <taku@google.com>

Mon, 13 Jun 2022 07:46:18 +0000 (16:46 +0900)

committer Kentaro Hayashi <kenhys@xdump.org>

Mon, 21 Nov 2022 13:43:46 +0000 (13:43 +0000)
author Taku Kudo <taku@google.com>
Mon, 13 Jun 2022 07:46:18 +0000 (16:46 +0900)
committer Kentaro Hayashi <kenhys@xdump.org>
Mon, 21 Nov 2022 13:43:46 +0000 (13:43 +0000)
diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py

index 99e36f3ac0f7fb8eed41249f97326a463c5fd113..6c48bcd9b55e524235d4ff54fbe0ad40294dd20f 100755 (executable)
--- a/python/test/sentencepiece_test.py
+++ b/python/test/sentencepiece_test.py
@@ -240,16 +240,18 @@ class TestSentencepieceProcessor(unittest.TestCase):
          input=[os.path.join(data_dir, 'botchan.txt')],
          model_prefix='m',
          vocab_size=1002,
-        user_defined_symbols=['foo', 'bar', ','],
+        user_defined_symbols=['foo', 'bar', ',', ' ', '\t', '\b', '\n', '\r'],
          logstream=open(os.devnull, 'w'))
      sp = spm.SentencePieceProcessor()
      sp.Load('m.model')
-    with open(
-        os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(data_dir, 'botchan.txt'), 'r') as file:
        for line in file:
          sp.DecodePieces(sp.EncodeAsPieces(line))
          sp.DecodeIds(sp.EncodeAsIds(line))
  
+    s = 'hello\tworld\r\nthis\tis a \b pen'
+    self.assertEqual(s, sp.decode(sp.encode(s)))
+
    def test_serialized_proto(self):
      text = 'I saw a girl with a telescope.'
      s1 = self.sp_.EncodeAsSerializedProto(text)
@@ -419,8 +421,7 @@ class TestSentencepieceProcessor(unittest.TestCase):
    def test_batch(self):
      sp = spm.SentencePieceProcessor(
          model_file=os.path.join('test', 'test_model.model'))
-    with open(
-        os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(data_dir, 'botchan.txt'), 'r') as file:
        texts = file.readlines()
  
      r1 = sp.encode(texts, out_type=str, num_threads=None)
author	Taku Kudo <taku@google.com>
	Mon, 13 Jun 2022 07:46:18 +0000 (16:46 +0900)
committer	Kentaro Hayashi <kenhys@xdump.org>
	Mon, 21 Nov 2022 13:43:46 +0000 (13:43 +0000)