From: Taku Kudo
Date: Mon, 13 Jun 2022 07:46:18 +0000 (+0900)
Subject: add test to use tab as user defined symbols..
X-Git-Tag: archive/raspbian/0.1.97-3+rpi1^2~24
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=0bd72de5b3f108bd201fa3c02bc4939a70000d9c;p=sentencepiece.git

add test to use tab as user defined symbols..

Signed-off-by: Kentaro Hayashi

Gbp-Pq: Name 0004-add-test-to-use-tab-as-user-defined-symbols.patch
---

diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py
index 99e36f3..6c48bcd 100755
--- a/python/test/sentencepiece_test.py
+++ b/python/test/sentencepiece_test.py
@@ -240,16 +240,18 @@ class TestSentencepieceProcessor(unittest.TestCase):
         input=[os.path.join(data_dir, 'botchan.txt')],
         model_prefix='m',
         vocab_size=1002,
-        user_defined_symbols=['foo', 'bar', ','],
+        user_defined_symbols=['foo', 'bar', ',', ' ', '\t', '\b', '\n', '\r'],
         logstream=open(os.devnull, 'w'))
     sp = spm.SentencePieceProcessor()
     sp.Load('m.model')
-    with open(
-        os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(data_dir, 'botchan.txt'), 'r') as file:
       for line in file:
         sp.DecodePieces(sp.EncodeAsPieces(line))
         sp.DecodeIds(sp.EncodeAsIds(line))
 
+    s = 'hello\tworld\r\nthis\tis a \b pen'
+    self.assertEqual(s, sp.decode(sp.encode(s)))
+
   def test_serialized_proto(self):
     text = 'I saw a girl with a telescope.'
     s1 = self.sp_.EncodeAsSerializedProto(text)
@@ -419,8 +421,7 @@ class TestSentencepieceProcessor(unittest.TestCase):
   def test_batch(self):
     sp = spm.SentencePieceProcessor(
         model_file=os.path.join('test', 'test_model.model'))
-    with open(
-        os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(data_dir, 'botchan.txt'), 'r') as file:
       texts = file.readlines()
 
     r1 = sp.encode(texts, out_type=str, num_threads=None)