From 97e13679fd49c3501198a079d3bc2cfae442ffc7 Mon Sep 17 00:00:00 2001 From: Taku Kudo Date: Fri, 5 Aug 2022 19:05:52 +0900 Subject: [PATCH] Updated the document Signed-off-by: Kentaro Hayashi Gbp-Pq: Name 0020-Updated-the-document.patch --- README.md | 1 - doc/api.md | 22 +-- doc/options.md | 102 +++++++------- python/README.md | 168 ++++++++++------------- python/src/sentencepiece/__init__.py | 22 ++- python/src/sentencepiece/sentencepiece.i | 22 ++- python/test/sentencepiece_test.py | 20 ++- 7 files changed, 199 insertions(+), 158 deletions(-) diff --git a/README.md b/README.md index dc71b64..1986047 100644 --- a/README.md +++ b/README.md @@ -276,6 +276,5 @@ Then segment train/test corpus with ```--vocabulary``` option * [Use custom text normalization rules](doc/normalization.md) * [Use custom symbols](doc/special_symbols.md) * [Python Module](python/README.md) -* [TensorFlow Module](tensorflow/README.md) * [Segmentation and training algorithms in detail] diff --git a/doc/api.md b/doc/api.md index 797074c..ebde880 100644 --- a/doc/api.md +++ b/doc/api.md @@ -14,9 +14,9 @@ if (!status.ok()) { // error } -// You can also load a model from std::ifstream. -// std::ifstream in("//path/to/model.model"); -// auto status = processor.Load(in); +// You can also load a serialized model from std::string. +// const std::string str = // Load blob contents from a file. +// auto status = processor.LoadFromSerializedProto(str); ``` ## Tokenize text (preprocessing) @@ -75,16 +75,20 @@ Calls `SentencePieceTrainer::Train` function to train sentencepiece model. You c sentencepiece::SentencePieceTrainer::Train("--input=test/botchan.txt --model_prefix=m --vocab_size=1000"); ``` -## SentencePieceText proto -You will want to use `SentencePieceText` class to obtain the pieces and ids at the same time. This proto also encodes a utf8-byte offset of each piece over user input or detokenized text. 
+## ImmutableSentencePieceText +You will want to use `ImmutableSentencePieceText` class to obtain the pieces and ids at the same time. +This proto also encodes a utf8-byte offset of each piece over user input or detokenized text. ```C++ -#include <sentencepiece.pb.h> +#include <sentencepiece_processor.h> -sentencepiece::SentencePieceText spt; +sentencepiece::ImmutableSentencePieceText spt; // Encode -processor.Encode("This is a test.", &spt); +processor.Encode("This is a test.", spt.mutable_proto()); + +// or +// spt = processor.EncodeAsImmutableProto("This is a test."); std::cout << spt.text() << std::endl; // This is the same as the input. for (const auto &piece : spt.pieces()) { @@ -96,7 +100,7 @@ for (const auto &piece : spt.pieces()) { } // Decode -processor.Decode({10, 20, 30}, &spt); +processor.Decode({10, 20, 30}, spt.mutable_proto()); std::cout << spt.text() << std::endl; // This is the same as the decoded string. for (const auto &piece : spt.pieces()) { // the same as above. diff --git a/doc/options.md b/doc/options.md index 26cf681..6cdc0f9 100644 --- a/doc/options.md +++ b/doc/options.md @@ -3,50 +3,60 @@ The training options for the `spm_train` can be listed using `spm_train --help`. Since the standard `pip install` of sentencepiece does not necessarily install `spm_train`, the options are also listed here. ``` ---help (show help) type: bool default: false ---version (show version) type: bool default: false ---minloglevel (Messages logged at a lower level than this don't actually get logged anywhere) type: int default: 0 ---input (comma separated list of input sentences) type: std::string default: "" ---input_format (Input format. Supported format is `text` or `tsv`.) 
type: std::string default: "" ---model_prefix (output model prefix) type: std::string default: "" ---model_type (model algorithm: unigram, bpe, word or char) type: std::string default: "unigram" ---vocab_size (vocabulary size) type: int32 default: 8000 ---accept_language (comma-separated list of languages this model can accept) type: std::string default: "" ---self_test_sample_size (the size of self test samples) type: int32 default: 0 ---character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995 ---input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 0 ---shuffle_input_sentence (Randomly sample input sentences in advance. Valid when --input_sentence_size > 0) type: bool default: true ---seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000 ---shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75 ---num_threads (number of threads for training) type: int32 default: 16 ---num_sub_iterations (number of EM sub-iterations) type: int32 default: 2 ---max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16 ---max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192 ---split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true ---split_by_number (split tokens by numbers (0-9)) type: bool default: true ---split_by_whitespace (use a white space to split sentence pieces) type: bool default: true ---split_digits (split all digits (0-9) into separate pieces) type: bool default: false ---treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) 
type: bool default: false ---control_symbols (comma separated list of control symbols) type: std::string default: "" ---user_defined_symbols (comma separated list of user defined symbols) type: std::string default: "" ---required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage) type: std::string default: "" ---byte_fallback (decompose unknown pieces into UTF-8 byte pieces) type: bool default: false ---vocabulary_output_piece_score (Define score in vocab file) type: bool default: true ---normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: std::string default: "nmt_nfkc" ---normalization_rule_tsv (Normalization rule TSV file. ) type: std::string default: "" ---denormalization_rule_tsv (Denormalization rule TSV file.) type: std::string default: "" ---add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true ---remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true ---hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true ---use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.) type: bool default: false ---unk_id (Override UNK () id.) type: int32 default: 0 ---bos_id (Override BOS () id. Set -1 to disable BOS.) type: int32 default: 1 ---eos_id (Override EOS () id. Set -1 to disable EOS.) type: int32 default: 2 ---pad_id (Override PAD () id. Set -1 to disable PAD.) type: int32 default: -1 ---unk_piece (Override UNK () piece.) type: std::string default: "" ---bos_piece (Override BOS () piece.) type: std::string default: "" ---eos_piece (Override EOS () piece.) type: std::string default: "" ---pad_piece (Override PAD () piece.) type: std::string default: "" ---unk_surface (Dummy surface string for . In decoding is decoded to `unk_surface`.) 
type: std::string default: " ⁇ " ---train_extremely_large_corpus (Increase bit depth for unigram tokenization.) type: bool default: false +Usage: ../build/src/spm_train [options] files + + --input (comma separated list of input sentences) type: std::string default: "" + --input_format (Input format. Supported format is `text` or `tsv`.) type: std::string default: "" + --model_prefix (output model prefix) type: std::string default: "" + --model_type (model algorithm: unigram, bpe, word or char) type: std::string default: "unigram" + --vocab_size (vocabulary size) type: int32 default: 8000 + --accept_language (comma-separated list of languages this model can accept) type: std::string default: "" + --self_test_sample_size (the size of self test samples) type: int32 default: 0 + --character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995 + --input_sentence_size (maximum size of sentences the trainer loads) type: std::uint64_t default: 0 + --shuffle_input_sentence (Randomly sample input sentences in advance. 
Valid when --input_sentence_size > 0) type: bool default: true + --seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000 + --shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75 + --num_threads (number of threads for training) type: int32 default: 16 + --num_sub_iterations (number of EM sub-iterations) type: int32 default: 2 + --max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16 + --max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192 + --split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true + --split_by_number (split tokens by numbers (0-9)) type: bool default: true + --split_by_whitespace (use a white space to split sentence pieces) type: bool default: true + --split_digits (split all digits (0-9) into separate pieces) type: bool default: false + --treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) type: bool default: false + --allow_whitespace_only_pieces (allow pieces that only contain (consecutive) whitespace tokens) type: bool default: false + --control_symbols (comma separated list of control symbols) type: std::string default: "" + --control_symbols_file (load control_symbols from file.) type: std::string default: "" + --user_defined_symbols (comma separated list of user defined symbols) type: std::string default: "" + --user_defined_symbols_file (load user_defined_symbols from file.) type: std::string default: "" + --required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage) type: std::string default: "" + --required_chars_file (load required_chars from file.) 
type: std::string default: "" + --byte_fallback (decompose unknown pieces into UTF-8 byte pieces) type: bool default: false + --vocabulary_output_piece_score (Define score in vocab file) type: bool default: true + --normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: std::string default: "nmt_nfkc" + --normalization_rule_tsv (Normalization rule TSV file. ) type: std::string default: "" + --denormalization_rule_tsv (Denormalization rule TSV file.) type: std::string default: "" + --add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true + --remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true + --hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true + --use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.) type: bool default: false + --unk_id (Override UNK (<unk>) id.) type: int32 default: 0 + --bos_id (Override BOS (<s>) id. Set -1 to disable BOS.) type: int32 default: 1 + --eos_id (Override EOS (</s>) id. Set -1 to disable EOS.) type: int32 default: 2 + --pad_id (Override PAD (<pad>) id. Set -1 to disable PAD.) type: int32 default: -1 + --unk_piece (Override UNK (<unk>) piece.) type: std::string default: "<unk>" + --bos_piece (Override BOS (<s>) piece.) type: std::string default: "<s>" + --eos_piece (Override EOS (</s>) piece.) type: std::string default: "</s>" + --pad_piece (Override PAD (<pad>) piece.) type: std::string default: "<pad>" + --unk_surface (Dummy surface string for <unk>. In decoding <unk> is decoded to `unk_surface`.) type: std::string default: " ⁇ " + --train_extremely_large_corpus (Increase bit depth for unigram tokenization.) type: bool default: false + --random_seed (Seed value for random generator.) type: uint32 default: 4294967295 + --enable_differential_privacy (Whether to add DP while training. Currently supported only by UNIGRAM model.) 
type: bool default: false + --differential_privacy_noise_level (Amount of noise to add for DP) type: float default: 0 + --differential_privacy_clipping_threshold (Threshold for clipping the counts for DP) type: std::uint64_t default: 0 + --help (show help) type: bool default: false + --version (show version) type: bool default: false + --minloglevel (Messages logged at a lower level than this don't actually get logged anywhere) type: int default: 0 ``` diff --git a/python/README.md b/python/README.md index b683082..bc5a59a 100644 --- a/python/README.md +++ b/python/README.md @@ -9,10 +9,17 @@ For Linux (x64/i686), macOS, and Windows(win32/x64) environment, you can simply % pip install sentencepiece ``` -To build and install the Python wrapper from source, please install [SentencePiece C++](https://github.com/google/sentencepiece#c-from-source) and try the following commands: +To build and install the Python wrapper from source, try the following commands to build and install wheel package. ``` -% python setup.py build -% sudo python setup.py install +% git clone https://github.com/google/sentencepiece.git +% cd sentencepiece +% mkdir build +% cd build +% cmake .. -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=./root +% make install +% cd ../python +% python setup.py bdist_wheel +% pip install dist/sentencepiece*.whl ``` If you don’t have write permission to the global site-packages directory or don’t want to install into it, please try: @@ -22,21 +29,50 @@ If you don’t have write permission to the global site-packages directory or do ## Usage -See [this google colab page](https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb) to run sentencepiece interactively. (Note: this sample is written in old interface.) +See [this google colab page](https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb) to run sentencepiece interactively. 
### Segmentation ``` % python >>> import sentencepiece as spm >>> sp = spm.SentencePieceProcessor(model_file='test/test_model.model') + >>> sp.encode('This is a test') [284, 47, 11, 4, 15, 400] + >>> sp.encode(['This is a test', 'Hello world'], out_type=int) [[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]] + +>>> sp.encode_as_ids(['This is a test', 'Hello world']) +[[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]] + >>> sp.encode('This is a test', out_type=str) ['▁This', '▁is', '▁a', '▁', 't', 'est'] + >>> sp.encode(['This is a test', 'Hello world'], out_type=str) [['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']] + +>>> sp.encode_as_pieces(['This is a test', 'Hello world']) +[['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']] + +>>> proto = sp.encode('This is a test', out_type='immutable_proto') +>>> for n in proto.pieces: +... print('piece="{}" surface="{}" id={} begin={} end={}'.format(n.piece, n.surface, n.id, n.begin, n.end)) +... +piece="▁This" surface="This" id=284 begin=0 end=4 +piece="▁is" surface=" is" id=47 begin=4 end=7 +piece="▁a" surface=" a" id=11 begin=7 end=9 +piece="▁" surface=" " id=4 begin=9 end=10 +piece="t" surface="t" id=15 begin=10 end=11 +piece="est" surface="est" id=400 begin=11 end=14 + +>>> [[x.id for x in proto.pieces], [x.piece for x in proto.pieces], [x.begin for x in proto.pieces], [x.end for x in proto.pieces]] +[[284, 47, 11, 4, 15, 400], ['▁This', '▁is', '▁a', '▁', 't', 'est'], [0, 4, 7, 9, 10, 11], [4, 7, 9, 10, 11, 14]] + +>>> proto2 = sp.encode_as_immutable_proto('This is a test') +>>> proto2 == proto +True + >>> for _ in range(10): ... sp.encode('This is a test', out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1) ... 
@@ -50,26 +86,55 @@ See [this google colab page](https://github.com/google/sentencepiece/blob/master ['▁', 'T', 'h', 'is', '▁', 'is', '▁', 'a', '▁', 'te', 'st'] ['▁', 'This', '▁', 'i', 's', '▁a', '▁', 't', 'e', 'st'] ['▁This', '▁', 'is', '▁a', '▁', 't', 'est'] + +>>> sp.nbest_encode('This is a test', nbest_size=5, out_type=str) +[['▁This', '▁is', '▁a', '▁', 't', 'est'], +['▁This', '▁is', '▁a', '▁', 'te', 'st'], +['▁This', '▁is', '▁a', '▁', 'te', 's', 't'], +['▁This', '▁is', '▁a', '▁', 't', 'e', 'st'], +['▁This', '▁is', '▁a', '▁', 't', 'es', 't']] + +>>> sp.sample_encode_and_score('This is a test', num_samples=5, alpha=0.1, out_type=str, wor=True) +[(['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 's', 't'], -3.043105125427246), +(['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 'st'], -2.8475849628448486), +(['▁', 'This', '▁is', '▁', 'a', '▁', 'te', 'st'], -3.043248176574707), +(['▁', 'This', '▁is', '▁a', '▁', 't', 'e', 'st'], -2.87727689743042), +(['▁', 'This', '▁', 'i', 's', '▁', 'a', '▁', 't', 'est'], -3.6284031867980957)] + >>> sp.decode([284, 47, 11, 4, 15, 400]) 'This is a test' + >>> sp.decode([[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]]) ['This is a test', 'Hello world'] + +>>> proto = sp.decode([284, 47, 11, 4, 15, 400], out_type='immutable_proto') +>>> proto.text +'This is a test' + >>> sp.decode(['▁', 'This', '▁', 'is', '▁a', '▁', 't', 'e', 'st']) 'This is a test' + >>> sp.decode([['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']]) ['This is a test', 'Hello world'] + >>> sp.get_piece_size() 1000 + >>> sp.id_to_piece(2) '</s>' + >>> sp.id_to_piece([2, 3, 4]) ['</s>', '\r', '▁'] + >>> sp.piece_to_id('<s>') 1 + >>> sp.piece_to_id(['</s>', '\r', '▁']) [2, 3, 4] + >>> len(sp) 1000 + >>> sp['</s>'] 2 ``` @@ -116,98 +181,3 @@ with urllib.request.urlopen( sp = spm.SentencePieceProcessor(model_proto=model.getvalue()) print(sp.encode('this is test')) ``` - - -### Segmentation (old interface) -``` -% python ->>> import sentencepiece as spm ->>> sp = 
spm.SentencePieceProcessor() ->>> sp.Load("test/test_model.model") -True ->>> sp.EncodeAsPieces("This is a test") -['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'] ->>> sp.EncodeAsIds("This is a test") -[284, 47, 11, 4, 15, 400] ->>> sp.DecodePieces(['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est']) -'This is a test' ->>> sp.NBestEncodeAsPieces("This is a test", 5) -[['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'es', 't']] ->>> for x in range(10): -... sp.SampleEncodeAsPieces("This is a test", -1, 0.1) -... -['\xe2\x96\x81', 'T', 'h', 'i', 's', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't'] -['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'est'] -['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st'] -['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'] -['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't'] -['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'] -['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'] -['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'] -['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st'] -['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 'te', 's', 't'] ->>> sp.DecodeIds([284, 47, 11, 4, 15, 400]) -'This is a 
test' ->>> sp.GetPieceSize() -1000 ->>> sp.IdToPiece(2) -'' ->>> sp.PieceToId('') -2 ->>> len(sp) -1000 ->>> sp[''] -2 -``` - -### Model Training (old interface) -Training is performed by passing parameters of [spm_train](https://github.com/google/sentencepiece#train-sentencepiece-model) to SentencePieceTrainer.Train() function. - -``` ->>> import sentencepiece as spm ->>> spm.SentencePieceTrainer.Train('--input=test/botchan.txt --model_prefix=m --vocab_size=1000') -unigram_model_trainer.cc(494) LOG(INFO) Starts training with : -input: "test/botchan.txt" -model_prefix: "m" -model_type: UNIGRAM -..snip.. -unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1239 obj=10.4055 num_tokens=36256 num_tokens/piece=29.2623 -unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1239 obj=10.3187 num_tokens=36256 num_tokens/piece=29.2623 -unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1100 obj=10.5285 num_tokens=37633 num_tokens/piece=34.2118 -unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4973 num_tokens=37630 num_tokens/piece=34.2091 -trainer_interface.cc(284) LOG(INFO) Saving model: m.model -trainer_interface.cc(293) LOG(INFO) Saving vocabs: m.vocab ->>> -``` - -## Python2/3 String/Unicode compatibility -Sentencepiece python wrapper accepts both Unicode string and legacy byte string. -The output string type is determined by the input string type. -The output type of IdToPiece/DecodeIds methods is *str*, but note that it is a legacy byte string in Python2 and Unicode string in Python3 respectively. 
- -* Python2: -``` ->>> sp.EncodeAsPieces('吾輩は猫である') -['\xe2\x96\x81', '\xe5\x90\xbe', '\xe8\xbc\xa9', '\xe3\x81\xaf', '\xe7\x8c\xab', '\xe3\x81\xa7\xe3\x81\x82\xe3\x82\x8b'] ->>> sp.EncodeAsPieces(u'吾輩は猫である') -[u'\u2581', u'\u543e', u'\u8f29', u'\u306f', u'\u732b', u'\u3067\u3042\u308b'] ->>> sp.EncodeAsPieces(u'吾輩は猫である'.encode('utf-8')) -['\xe2\x96\x81', '\xe5\x90\xbe', '\xe8\xbc\xa9', '\xe3\x81\xaf', '\xe7\x8c\xab', '\xe3\x81\xa7\xe3\x81\x82\xe3\x82\x8b'] ->>> sp.IdToPiece(10) -'\xe3\x81\xab' ->>> type(sp.IdToPiece(10)) - -``` - -* Python3: -``` ->>> sp.EncodeAsPieces('吾輩は猫である') -['▁', '吾', '輩', 'は', '猫', 'である'] ->>> sp.EncodeAsPieces('吾輩は猫である'.encode('utf-8')) -[b'\xe2\x96\x81', b'\xe5\x90\xbe', b'\xe8\xbc\xa9', b'\xe3\x81\xaf', b'\xe7\x8c\xab', b'\xe3\x81\xa7\xe3\x81\x82\xe3\x82\x8b'] ->>> ->>> sp.IdToPiece(10) -'に' ->>> type(sp.IdToPiece(10)) - -``` diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py index cf06830..911a2cb 100644 --- a/python/src/sentencepiece/__init__.py +++ b/python/src/sentencepiece/__init__.py @@ -635,7 +635,7 @@ class SentencePieceProcessor(object): return _encode(input) - def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): + def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): return self.NBestEncode(input=input, nbest_size=nbest_size, out_type=str, **kwargs) @@ -732,6 +732,26 @@ class SentencePieceProcessor(object): return _encode(input) + def SampleEncodeAndScoreAsPieces(self, input, num_samples=None, alpha=None, **kwargs): + return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, + out_type=str, **kwargs) + + + def SampleEncodeAndScoreAsIds(self, input, num_samples=None, alpha=None, **kwargs): + return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, + out_type=int, **kwargs) + + + def SampleEncodeAndScoreAsSerializedProto(self, input, num_samples=None, alpha=None, **kwargs): + return 
self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, + out_type='serialized_proto', **kwargs) + + + def SampleEncodeAndScoreAsImmutableProto(self, input, num_samples=None, alpha=None, **kwargs): + return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, + out_type='immutable_proto', **kwargs) + + def Decode(self, input, out_type=str, num_threads=None): """Decode processed id or token sequences. diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i index 2ac68a8..fc773e2 100644 --- a/python/src/sentencepiece/sentencepiece.i +++ b/python/src/sentencepiece/sentencepiece.i @@ -903,7 +903,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { return _encode(input) - def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): + def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): return self.NBestEncode(input=input, nbest_size=nbest_size, out_type=str, **kwargs) @@ -1000,6 +1000,26 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { return _encode(input) + def SampleEncodeAndScoreAsPieces(self, input, num_samples=None, alpha=None, **kwargs): + return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, + out_type=str, **kwargs) + + + def SampleEncodeAndScoreAsIds(self, input, num_samples=None, alpha=None, **kwargs): + return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, + out_type=int, **kwargs) + + + def SampleEncodeAndScoreAsSerializedProto(self, input, num_samples=None, alpha=None, **kwargs): + return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, + out_type='serialized_proto', **kwargs) + + + def SampleEncodeAndScoreAsImmutableProto(self, input, num_samples=None, alpha=None, **kwargs): + return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, + out_type='immutable_proto', **kwargs) + + def Decode(self, 
input, out_type=str, num_threads=None): """Decode processed id or token sequences. diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py index 92327ac..2b9ad28 100755 --- a/python/test/sentencepiece_test.py +++ b/python/test/sentencepiece_test.py @@ -566,7 +566,7 @@ class TestSentencepieceProcessor(unittest.TestCase): for n in sp.decode(results): self.assertEqual(n, text) - # batch test + # batch test results = sp.nbest_encode([text, text2], nbest_size=10, out_type=out_type) self.assertEqual( results, @@ -589,6 +589,19 @@ class TestSentencepieceProcessor(unittest.TestCase): for n in decoded: self.assertEqual(n, text2) + self.assertEqual( + sp.nbest_encode(text, nbest_size=10, out_type=str), + sp.nbest_encode_as_pieces(text, nbest_size=10)) + self.assertEqual( + sp.nbest_encode(text, nbest_size=10, out_type=int), + sp.nbest_encode_as_ids(text, nbest_size=10)) + self.assertEqual( + sp.nbest_encode(text, nbest_size=10, out_type='serialized_proto'), + sp.nbest_encode_as_serialized_proto(text, nbest_size=10)) + self.assertEqual( + sp.nbest_encode(text, nbest_size=10, out_type='immutable_proto'), + sp.nbest_encode_as_immutable_proto(text, nbest_size=10)) + def test_sample_and_score(self): sp = self.sp_ text = 'hello world' @@ -618,6 +631,11 @@ class TestSentencepieceProcessor(unittest.TestCase): for n in results[1]: self.assertEqual(sp.decode(n[0]), text2) + sp.sample_encode_and_score_as_pieces(text, 10) + sp.sample_encode_and_score_as_ids(text, 10) + sp.sample_encode_and_score_as_immutable_proto(text, 10) + sp.sample_encode_and_score_as_serialized_proto(text, 10) + def test_valid_range(self): size = self.sp_.piece_size() funcs = [ -- 2.30.2