From 97e13679fd49c3501198a079d3bc2cfae442ffc7 Mon Sep 17 00:00:00 2001
From: Taku Kudo <taku@google.com>
Date: Fri, 5 Aug 2022 19:05:52 +0900
Subject: [PATCH] Updated the document

Signed-off-by: Kentaro Hayashi <kenhys@gmail.com>

Gbp-Pq: Name 0020-Updated-the-document.patch
---
 README.md                                |   1 -
 doc/api.md                               |  22 +--
 doc/options.md                           | 102 +++++++-------
 python/README.md                         | 168 ++++++++++-------------
 python/src/sentencepiece/__init__.py     |  22 ++-
 python/src/sentencepiece/sentencepiece.i |  22 ++-
 python/test/sentencepiece_test.py        |  20 ++-
 7 files changed, 199 insertions(+), 158 deletions(-)

diff --git a/README.md b/README.md
index dc71b64..1986047 100644
--- a/README.md
+++ b/README.md
@@ -276,6 +276,5 @@ Then segment train/test corpus with ```--vocabulary``` option
 * [Use custom text normalization rules](doc/normalization.md)
 * [Use custom symbols](doc/special_symbols.md)
 * [Python Module](python/README.md)
-* [TensorFlow Module](tensorflow/README.md)
 * [Segmentation and training algorithms in detail]
 
diff --git a/doc/api.md b/doc/api.md
index 797074c..ebde880 100644
--- a/doc/api.md
+++ b/doc/api.md
@@ -14,9 +14,9 @@ if (!status.ok()) {
    // error
 }
 
-// You can also load a model from std::ifstream.
-// std::ifstream in("//path/to/model.model");
-// auto status = processor.Load(in);
+// You can also load a serialized model from std::string.
+// const std::string str = // Load blob contents from a file.
+// auto status = processor.LoadFromSerializedProto(str);
 ```
 
 ## Tokenize text (preprocessing)
@@ -75,16 +75,20 @@ Calls `SentencePieceTrainer::Train` function to train sentencepiece model. You c
 sentencepiece::SentencePieceTrainer::Train("--input=test/botchan.txt --model_prefix=m --vocab_size=1000");
 ```
 
-## SentencePieceText proto
-You will want to use `SentencePieceText` class to obtain the pieces and ids at the same time. This proto also encodes a utf8-byte offset of each piece over user input or detokenized text.
+## ImmutableSentencePieceText
+You will want to use `ImmutableSentencePieceText` class to obtain the pieces and ids at the same time.
+This proto also encodes a utf8-byte offset of each piece over user input or detokenized text.
 
 ```C++
-#include <sentencepiece.pb.h>
+#include <sentencepiece_processor.h>
 
-sentencepiece::SentencePieceText spt;
+sentencepiece::ImmutableSentencePieceText spt;
 
 // Encode
-processor.Encode("This is a test.", &spt);
+processor.Encode("This is a test.", spt.mutable_proto());
+
+// or
+// spt = processor.EncodeAsImmutableProto("This is a test.");
 
 std::cout << spt.text() << std::endl;   // This is the same as the input.
 for (const auto &piece : spt.pieces()) {
@@ -96,7 +100,7 @@ for (const auto &piece : spt.pieces()) {
 }
 
 // Decode
-processor.Decode({10, 20, 30}, &spt);
+processor.Decode({10, 20, 30}, spt.mutable_proto());
 std::cout << spt.text() << std::endl;   // This is the same as the decoded string.
 for (const auto &piece : spt.pieces()) {
    // the same as above.
diff --git a/doc/options.md b/doc/options.md
index 26cf681..6cdc0f9 100644
--- a/doc/options.md
+++ b/doc/options.md
@@ -3,50 +3,60 @@
 The training options for the `spm_train` can be listed using `spm_train --help`. Since the standard `pip install` of sentencepiece does not necessarily install `spm_train`, the options are also listed here.
 
 ```
---help (show help)  type: bool default: false
---version (show version)  type: bool default: false
---minloglevel (Messages logged at a lower level than this don't actually get logged anywhere)  type: int default: 0
---input (comma separated list of input sentences)  type: std::string default: ""
---input_format (Input format. Supported format is `text` or `tsv`.)  type: std::string default: ""
---model_prefix (output model prefix)  type: std::string default: ""
---model_type (model algorithm: unigram, bpe, word or char)  type: std::string default: "unigram"
---vocab_size (vocabulary size)  type: int32 default: 8000
---accept_language (comma-separated list of languages this model can accept)  type: std::string default: ""
---self_test_sample_size (the size of self test samples)  type: int32 default: 0
---character_coverage (character coverage to determine the minimum symbols)  type: double default: 0.9995
---input_sentence_size (maximum size of sentences the trainer loads)  type: int32 default: 0
---shuffle_input_sentence (Randomly sample input sentences in advance. Valid when --input_sentence_size > 0)  type: bool default: true
---seed_sentencepiece_size (the size of seed sentencepieces)  type: int32 default: 1000000
---shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss)  type: double default: 0.75
---num_threads (number of threads for training)  type: int32 default: 16
---num_sub_iterations (number of EM sub-iterations)  type: int32 default: 2
---max_sentencepiece_length (maximum length of sentence piece)  type: int32 default: 16
---max_sentence_length (maximum length of sentence in byte)  type: int32 default: 4192
---split_by_unicode_script (use Unicode script to split sentence pieces)  type: bool default: true
---split_by_number (split tokens by numbers (0-9))  type: bool default: true
---split_by_whitespace (use a white space to split sentence pieces)  type: bool default: true
---split_digits (split all digits (0-9) into separate pieces)  type: bool default: false
---treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.)  type: bool default: false
---control_symbols (comma separated list of control symbols)  type: std::string default: ""
---user_defined_symbols (comma separated list of user defined symbols)  type: std::string default: ""
---required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage)  type: std::string default: ""
---byte_fallback (decompose unknown pieces into UTF-8 byte pieces)  type: bool default: false
---vocabulary_output_piece_score (Define score in vocab file)  type: bool default: true
---normalization_rule_name (Normalization rule name. Choose from nfkc or identity)  type: std::string default: "nmt_nfkc"
---normalization_rule_tsv (Normalization rule TSV file. )  type: std::string default: ""
---denormalization_rule_tsv (Denormalization rule TSV file.)  type: std::string default: ""
---add_dummy_prefix (Add dummy whitespace at the beginning of text)  type: bool default: true
---remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace)  type: bool default: true
---hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.)  type: bool default: true
---use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.)  type: bool default: false
---unk_id (Override UNK (<unk>) id.)  type: int32 default: 0
---bos_id (Override BOS (<s>) id. Set -1 to disable BOS.)  type: int32 default: 1
---eos_id (Override EOS (</s>) id. Set -1 to disable EOS.)  type: int32 default: 2
---pad_id (Override PAD (<pad>) id. Set -1 to disable PAD.)  type: int32 default: -1
---unk_piece (Override UNK (<unk>) piece.)  type: std::string default: "<unk>"
---bos_piece (Override BOS (<s>) piece.)  type: std::string default: "<s>"
---eos_piece (Override EOS (</s>) piece.)  type: std::string default: "</s>"
---pad_piece (Override PAD (<pad>) piece.)  type: std::string default: "<pad>"
---unk_surface (Dummy surface string for <unk>. In decoding <unk> is decoded to `unk_surface`.)  type: std::string default: " ⁇ "
---train_extremely_large_corpus (Increase bit depth for unigram tokenization.)  type: bool default: false
+Usage: ../build/src/spm_train [options] files
+
+   --input (comma separated list of input sentences)  type: std::string default: ""
+   --input_format (Input format. Supported format is `text` or `tsv`.)  type: std::string default: ""
+   --model_prefix (output model prefix)  type: std::string default: ""
+   --model_type (model algorithm: unigram, bpe, word or char)  type: std::string default: "unigram"
+   --vocab_size (vocabulary size)  type: int32 default: 8000
+   --accept_language (comma-separated list of languages this model can accept)  type: std::string default: ""
+   --self_test_sample_size (the size of self test samples)  type: int32 default: 0
+   --character_coverage (character coverage to determine the minimum symbols)  type: double default: 0.9995
+   --input_sentence_size (maximum size of sentences the trainer loads)  type: std::uint64_t default: 0
+   --shuffle_input_sentence (Randomly sample input sentences in advance. Valid when --input_sentence_size > 0)  type: bool default: true
+   --seed_sentencepiece_size (the size of seed sentencepieces)  type: int32 default: 1000000
+   --shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss)  type: double default: 0.75
+   --num_threads (number of threads for training)  type: int32 default: 16
+   --num_sub_iterations (number of EM sub-iterations)  type: int32 default: 2
+   --max_sentencepiece_length (maximum length of sentence piece)  type: int32 default: 16
+   --max_sentence_length (maximum length of sentence in byte)  type: int32 default: 4192
+   --split_by_unicode_script (use Unicode script to split sentence pieces)  type: bool default: true
+   --split_by_number (split tokens by numbers (0-9))  type: bool default: true
+   --split_by_whitespace (use a white space to split sentence pieces)  type: bool default: true
+   --split_digits (split all digits (0-9) into separate pieces)  type: bool default: false
+   --treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.)  type: bool default: false
+   --allow_whitespace_only_pieces (allow pieces that only contain (consecutive) whitespace tokens)  type: bool default: false
+   --control_symbols (comma separated list of control symbols)  type: std::string default: ""
+   --control_symbols_file (load control_symbols from file.)  type: std::string default: ""
+   --user_defined_symbols (comma separated list of user defined symbols)  type: std::string default: ""
+   --user_defined_symbols_file (load user_defined_symbols from file.)  type: std::string default: ""
+   --required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage)  type: std::string default: ""
+   --required_chars_file (load required_chars from file.)  type: std::string default: ""
+   --byte_fallback (decompose unknown pieces into UTF-8 byte pieces)  type: bool default: false
+   --vocabulary_output_piece_score (Define score in vocab file)  type: bool default: true
+   --normalization_rule_name (Normalization rule name. Choose from nfkc or identity)  type: std::string default: "nmt_nfkc"
+   --normalization_rule_tsv (Normalization rule TSV file. )  type: std::string default: ""
+   --denormalization_rule_tsv (Denormalization rule TSV file.)  type: std::string default: ""
+   --add_dummy_prefix (Add dummy whitespace at the beginning of text)  type: bool default: true
+   --remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace)  type: bool default: true
+   --hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.)  type: bool default: true
+   --use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.)  type: bool default: false
+   --unk_id (Override UNK (<unk>) id.)  type: int32 default: 0
+   --bos_id (Override BOS (<s>) id. Set -1 to disable BOS.)  type: int32 default: 1
+   --eos_id (Override EOS (</s>) id. Set -1 to disable EOS.)  type: int32 default: 2
+   --pad_id (Override PAD (<pad>) id. Set -1 to disable PAD.)  type: int32 default: -1
+   --unk_piece (Override UNK (<unk>) piece.)  type: std::string default: "<unk>"
+   --bos_piece (Override BOS (<s>) piece.)  type: std::string default: "<s>"
+   --eos_piece (Override EOS (</s>) piece.)  type: std::string default: "</s>"
+   --pad_piece (Override PAD (<pad>) piece.)  type: std::string default: "<pad>"
+   --unk_surface (Dummy surface string for <unk>. In decoding <unk> is decoded to `unk_surface`.)  type: std::string default: " ⁇ "
+   --train_extremely_large_corpus (Increase bit depth for unigram tokenization.)  type: bool default: false
+   --random_seed (Seed value for random generator.)  type: uint32 default: 4294967295
+   --enable_differential_privacy (Whether to add DP while training. Currently supported only by UNIGRAM model.)  type: bool default: false
+   --differential_privacy_noise_level (Amount of noise to add for DP)  type: float default: 0
+   --differential_privacy_clipping_threshold (Threshold for clipping the counts for DP)  type: std::uint64_t default: 0
+   --help (show help)  type: bool default: false
+   --version (show version)  type: bool default: false
+   --minloglevel (Messages logged at a lower level than this don't actually get logged anywhere)  type: int default: 0
 ```
diff --git a/python/README.md b/python/README.md
index b683082..bc5a59a 100644
--- a/python/README.md
+++ b/python/README.md
@@ -9,10 +9,17 @@ For Linux (x64/i686), macOS, and Windows(win32/x64) environment, you can simply
 % pip install sentencepiece
 ```
 
-To build and install the Python wrapper from source, please install [SentencePiece C++](https://github.com/google/sentencepiece#c-from-source) and try the following commands:
+To build and install the Python wrapper from source, run the following commands to build and install the wheel package:
 ```
-% python setup.py build
-% sudo python setup.py install
+% git clone https://github.com/google/sentencepiece.git 
+% cd sentencepiece
+% mkdir build
+% cd build
+% cmake .. -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=./root
+% make install
+% cd ../python
+% python setup.py bdist_wheel
+% pip install dist/sentencepiece*.whl
 ```
 
 If you don’t have write permission to the global site-packages directory or don’t want to install into it, please try:
@@ -22,21 +29,50 @@ If you donât have write permission to the global site-packages directory or do
 
 ## Usage
 
-See [this google colab page](https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb) to run sentencepiece interactively. (Note: this sample is written in old interface.)
+See [this google colab page](https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb) to run sentencepiece interactively.
 
 ### Segmentation
 ```
 % python
 >>> import sentencepiece as spm
 >>> sp = spm.SentencePieceProcessor(model_file='test/test_model.model')
+
 >>> sp.encode('This is a test')
 [284, 47, 11, 4, 15, 400]
+
 >>> sp.encode(['This is a test', 'Hello world'], out_type=int)
 [[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]]
+
+>>> sp.encode_as_ids(['This is a test', 'Hello world'])
+[[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]]
+
 >>> sp.encode('This is a test', out_type=str)
 ['▁This', '▁is', '▁a', '▁', 't', 'est']
+
 >>> sp.encode(['This is a test', 'Hello world'], out_type=str)
 [['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']]
+
+>>> sp.encode_as_pieces(['This is a test', 'Hello world'])
+[['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']]
+
+>>> proto = sp.encode('This is a test', out_type='immutable_proto')
+>>> for n in proto.pieces:
+...     print('piece="{}" surface="{}" id={} begin={} end={}'.format(n.piece, n.surface, n.id, n.begin, n.end))
+... 
+piece="▁This" surface="This" id=284 begin=0 end=4
+piece="▁is" surface=" is" id=47 begin=4 end=7
+piece="▁a" surface=" a" id=11 begin=7 end=9
+piece="▁" surface=" " id=4 begin=9 end=10
+piece="t" surface="t" id=15 begin=10 end=11
+piece="est" surface="est" id=400 begin=11 end=14
+
+>>> [[x.id for x in proto.pieces], [x.piece for x in proto.pieces], [x.begin for x in proto.pieces], [x.end for x in proto.pieces]]
+[[284, 47, 11, 4, 15, 400], ['▁This', '▁is', '▁a', '▁', 't', 'est'], [0, 4, 7, 9, 10, 11], [4, 7, 9, 10, 11, 14]]
+
+>>> proto2 = sp.encode_as_immutable_proto('This is a test')
+>>> proto2 == proto
+True
+
 >>> for _ in range(10):
 ...     sp.encode('This is a test', out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1)
 ... 
@@ -50,26 +86,55 @@ See [this google colab page](https://github.com/google/sentencepiece/blob/master
 ['â', 'T', 'h', 'is', 'â', 'is', 'â', 'a', 'â', 'te', 'st']
 ['â', 'This', 'â', 'i', 's', 'âa', 'â', 't', 'e', 'st']
 ['âThis', 'â', 'is', 'âa', 'â', 't', 'est']
+
+>>> sp.nbest_encode('This is a test', nbest_size=5, out_type=str)
+[['▁This', '▁is', '▁a', '▁', 't', 'est'],
+['▁This', '▁is', '▁a', '▁', 'te', 'st'],
+['▁This', '▁is', '▁a', '▁', 'te', 's', 't'],
+['▁This', '▁is', '▁a', '▁', 't', 'e', 'st'],
+['▁This', '▁is', '▁a', '▁', 't', 'es', 't']]
+
+>>> sp.sample_encode_and_score('This is a test', num_samples=5, alpha=0.1, out_type=str, wor=True)
+[(['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 's', 't'], -3.043105125427246),
+(['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 'st'], -2.8475849628448486),
+(['▁', 'This', '▁is', '▁', 'a', '▁', 'te', 'st'], -3.043248176574707),
+(['▁', 'This', '▁is', '▁a', '▁', 't', 'e', 'st'], -2.87727689743042),
+(['▁', 'This', '▁', 'i', 's', '▁', 'a', '▁', 't', 'est'], -3.6284031867980957)]
+
 >>> sp.decode([284, 47, 11, 4, 15, 400])
 'This is a test'
+
 >>> sp.decode([[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]])
 ['This is a test', 'Hello world']
+
+>>> proto = sp.decode([284, 47, 11, 4, 15, 400], out_type='immutable_proto') 
+>>> proto.text
+'This is a test'
+
 >>> sp.decode(['â', 'This', 'â', 'is', 'âa', 'â', 't', 'e', 'st'])
 'This is a test'
+
 >>> sp.decode([['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']])
 ['This is a test', 'Hello world']
+
 >>> sp.get_piece_size()
 1000
+
 >>> sp.id_to_piece(2)
 '</s>'
+
 >>> sp.id_to_piece([2, 3, 4])
 ['</s>', '\r', '▁']
+
 >>> sp.piece_to_id('<s>')
 1
+
 >>> sp.piece_to_id(['</s>', '\r', '▁'])
 [2, 3, 4]
+
 >>> len(sp)
 1000
+
 >>> sp['</s>']
 2
 ```
@@ -116,98 +181,3 @@ with urllib.request.urlopen(
 sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
 print(sp.encode('this is test'))
 ```
-
-
-### Segmentation (old interface)
-```
-% python
->>> import sentencepiece as spm
->>> sp = spm.SentencePieceProcessor()
->>> sp.Load("test/test_model.model")
-True
->>> sp.EncodeAsPieces("This is a test")
-['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est']
->>> sp.EncodeAsIds("This is a test")
-[284, 47, 11, 4, 15, 400]
->>> sp.DecodePieces(['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'])
-'This is a test'
->>> sp.NBestEncodeAsPieces("This is a test", 5)
-[['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'es', 't']]
->>> for x in range(10):
-...     sp.SampleEncodeAsPieces("This is a test", -1, 0.1)
-...
-['\xe2\x96\x81', 'T', 'h', 'i', 's', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't']
-['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'est']
-['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st']
-['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st']
-['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't']
-['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't']
-['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't']
-['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st']
-['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st']
-['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 'te', 's', 't']
->>> sp.DecodeIds([284, 47, 11, 4, 15, 400])
-'This is a test'
->>> sp.GetPieceSize()
-1000
->>> sp.IdToPiece(2)
-'</s>'
->>> sp.PieceToId('</s>')
-2
->>> len(sp)
-1000
->>> sp['</s>']
-2
-```
-
-### Model Training (old interface)
-Training is performed by passing parameters of [spm_train](https://github.com/google/sentencepiece#train-sentencepiece-model) to  SentencePieceTrainer.Train() function.
-
-```
->>> import sentencepiece as spm
->>> spm.SentencePieceTrainer.Train('--input=test/botchan.txt --model_prefix=m --vocab_size=1000')
-unigram_model_trainer.cc(494) LOG(INFO) Starts training with : 
-input: "test/botchan.txt"
-model_prefix: "m"
-model_type: UNIGRAM
-..snip..
-unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1239 obj=10.4055 num_tokens=36256 num_tokens/piece=29.2623
-unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1239 obj=10.3187 num_tokens=36256 num_tokens/piece=29.2623
-unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1100 obj=10.5285 num_tokens=37633 num_tokens/piece=34.2118
-unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4973 num_tokens=37630 num_tokens/piece=34.2091
-trainer_interface.cc(284) LOG(INFO) Saving model: m.model
-trainer_interface.cc(293) LOG(INFO) Saving vocabs: m.vocab
->>>
-```
-
-## Python2/3 String/Unicode compatibility
-Sentencepiece python wrapper accepts both Unicode string and legacy byte string.
-The output string type is determined by the input string type.
-The output type of IdToPiece/DecodeIds methods is *str*, but note that it is a legacy byte string in Python2 and Unicode string in Python3 respectively.
-
-* Python2:
-```
->>> sp.EncodeAsPieces('吾輩は猫である')
-['\xe2\x96\x81', '\xe5\x90\xbe', '\xe8\xbc\xa9', '\xe3\x81\xaf', '\xe7\x8c\xab', '\xe3\x81\xa7\xe3\x81\x82\xe3\x82\x8b']
->>> sp.EncodeAsPieces(u'吾輩は猫である')
-[u'\u2581', u'\u543e', u'\u8f29', u'\u306f', u'\u732b', u'\u3067\u3042\u308b']
->>> sp.EncodeAsPieces(u'吾輩は猫である'.encode('utf-8'))
-['\xe2\x96\x81', '\xe5\x90\xbe', '\xe8\xbc\xa9', '\xe3\x81\xaf', '\xe7\x8c\xab', '\xe3\x81\xa7\xe3\x81\x82\xe3\x82\x8b']
->>> sp.IdToPiece(10)
-'\xe3\x81\xab'
->>> type(sp.IdToPiece(10))
-<type 'str'>
-```
-
-* Python3:
-```
->>> sp.EncodeAsPieces('吾輩は猫である')
-['▁', '吾', '輩', 'は', '猫', 'である']
->>> sp.EncodeAsPieces('吾輩は猫である'.encode('utf-8'))
-[b'\xe2\x96\x81', b'\xe5\x90\xbe', b'\xe8\xbc\xa9', b'\xe3\x81\xaf', b'\xe7\x8c\xab', b'\xe3\x81\xa7\xe3\x81\x82\xe3\x82\x8b']
->>>
->>> sp.IdToPiece(10)
-'ã«'
->>> type(sp.IdToPiece(10))
-<class 'str'>
-```
diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py
index cf06830..911a2cb 100644
--- a/python/src/sentencepiece/__init__.py
+++ b/python/src/sentencepiece/__init__.py
@@ -635,7 +635,7 @@ class SentencePieceProcessor(object):
       return _encode(input)
 
 
-    def NBestEncodeAsPieces(self, input, nbest_size=None,  **kwargs):
+    def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs):
       return self.NBestEncode(input=input, nbest_size=nbest_size,
                               out_type=str, **kwargs)
 
@@ -732,6 +732,26 @@ class SentencePieceProcessor(object):
       return _encode(input)
 
 
+    def SampleEncodeAndScoreAsPieces(self, input, num_samples=None, alpha=None, **kwargs):
+      return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
+                                       out_type=str, **kwargs)
+
+
+    def SampleEncodeAndScoreAsIds(self, input, num_samples=None, alpha=None, **kwargs):
+      return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
+                                       out_type=int, **kwargs)
+
+
+    def SampleEncodeAndScoreAsSerializedProto(self, input, num_samples=None, alpha=None, **kwargs):
+      return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
+                                       out_type='serialized_proto', **kwargs)
+
+
+    def SampleEncodeAndScoreAsImmutableProto(self, input, num_samples=None, alpha=None, **kwargs):
+      return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
+                                       out_type='immutable_proto', **kwargs)
+
+
     def Decode(self, input, out_type=str, num_threads=None):
       """Decode processed id or token sequences.
 
diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i
index 2ac68a8..fc773e2 100644
--- a/python/src/sentencepiece/sentencepiece.i
+++ b/python/src/sentencepiece/sentencepiece.i
@@ -903,7 +903,7 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
     return _encode(input)
 
 
-  def NBestEncodeAsPieces(self, input, nbest_size=None,  **kwargs):
+  def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs):
     return self.NBestEncode(input=input, nbest_size=nbest_size,
                             out_type=str, **kwargs)
 
@@ -1000,6 +1000,26 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
     return _encode(input)
 
 
+  def SampleEncodeAndScoreAsPieces(self, input, num_samples=None, alpha=None, **kwargs):
+    return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
+                                     out_type=str, **kwargs)
+
+
+  def SampleEncodeAndScoreAsIds(self, input, num_samples=None, alpha=None, **kwargs):
+    return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
+                                     out_type=int, **kwargs)
+
+
+  def SampleEncodeAndScoreAsSerializedProto(self, input, num_samples=None, alpha=None, **kwargs):
+    return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
+                                     out_type='serialized_proto', **kwargs)
+        
+
+  def SampleEncodeAndScoreAsImmutableProto(self, input, num_samples=None, alpha=None, **kwargs):
+    return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
+                                     out_type='immutable_proto', **kwargs)
+          
+
   def Decode(self, input, out_type=str, num_threads=None):
     """Decode processed id or token sequences.
 
diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py
index 92327ac..2b9ad28 100755
--- a/python/test/sentencepiece_test.py
+++ b/python/test/sentencepiece_test.py
@@ -566,7 +566,7 @@ class TestSentencepieceProcessor(unittest.TestCase):
         for n in sp.decode(results):
           self.assertEqual(n, text)
 
-    # batch test
+      # batch test
       results = sp.nbest_encode([text, text2], nbest_size=10, out_type=out_type)
       self.assertEqual(
           results,
@@ -589,6 +589,19 @@ class TestSentencepieceProcessor(unittest.TestCase):
         for n in decoded:
           self.assertEqual(n, text2)
 
+    self.assertEqual(
+        sp.nbest_encode(text, nbest_size=10, out_type=str),
+        sp.nbest_encode_as_pieces(text, nbest_size=10))
+    self.assertEqual(
+        sp.nbest_encode(text, nbest_size=10, out_type=int),
+        sp.nbest_encode_as_ids(text, nbest_size=10))
+    self.assertEqual(
+        sp.nbest_encode(text, nbest_size=10, out_type='serialized_proto'),
+        sp.nbest_encode_as_serialized_proto(text, nbest_size=10))
+    self.assertEqual(
+        sp.nbest_encode(text, nbest_size=10, out_type='immutable_proto'),
+        sp.nbest_encode_as_immutable_proto(text, nbest_size=10))
+
   def test_sample_and_score(self):
     sp = self.sp_
     text = 'hello world'
@@ -618,6 +631,11 @@ class TestSentencepieceProcessor(unittest.TestCase):
         for n in results[1]:
           self.assertEqual(sp.decode(n[0]), text2)
 
+    sp.sample_encode_and_score_as_pieces(text, 10)
+    sp.sample_encode_and_score_as_ids(text, 10)
+    sp.sample_encode_and_score_as_immutable_proto(text, 10)
+    sp.sample_encode_and_score_as_serialized_proto(text, 10)
+
   def test_valid_range(self):
     size = self.sp_.piece_size()
     funcs = [
-- 
2.30.2