From 0bd72de5b3f108bd201fa3c02bc4939a70000d9c Mon Sep 17 00:00:00 2001
From: Taku Kudo
Date: Mon, 13 Jun 2022 16:46:18 +0900
Subject: [PATCH] add test to use tab as user defined symbols..

Signed-off-by: Kentaro Hayashi
Gbp-Pq: Name 0004-add-test-to-use-tab-as-user-defined-symbols.patch
---
 python/test/sentencepiece_test.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py
index 99e36f3..6c48bcd 100755
--- a/python/test/sentencepiece_test.py
+++ b/python/test/sentencepiece_test.py
@@ -240,16 +240,18 @@ class TestSentencepieceProcessor(unittest.TestCase):
         input=[os.path.join(data_dir, 'botchan.txt')],
         model_prefix='m',
         vocab_size=1002,
-        user_defined_symbols=['foo', 'bar', ','],
+        user_defined_symbols=['foo', 'bar', ',', ' ', '\t', '\b', '\n', '\r'],
         logstream=open(os.devnull, 'w'))
     sp = spm.SentencePieceProcessor()
     sp.Load('m.model')
-    with open(
-        os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(data_dir, 'botchan.txt'), 'r') as file:
       for line in file:
         sp.DecodePieces(sp.EncodeAsPieces(line))
         sp.DecodeIds(sp.EncodeAsIds(line))
 
+    s = 'hello\tworld\r\nthis\tis a \b pen'
+    self.assertEqual(s, sp.decode(sp.encode(s)))
+
   def test_serialized_proto(self):
     text = 'I saw a girl with a telescope.'
     s1 = self.sp_.EncodeAsSerializedProto(text)
@@ -419,8 +421,7 @@ class TestSentencepieceProcessor(unittest.TestCase):
   def test_batch(self):
     sp = spm.SentencePieceProcessor(
         model_file=os.path.join('test', 'test_model.model'))
-    with open(
-        os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(data_dir, 'botchan.txt'), 'r') as file:
       texts = file.readlines()
 
     r1 = sp.encode(texts, out_type=str, num_threads=None)
-- 
2.30.2