From: Taku Kudo Date: Mon, 1 Aug 2022 08:19:09 +0000 (+0900) Subject: Supports ImmutableSentencePieceText from python module X-Git-Tag: archive/raspbian/0.1.97-3+rpi1^2~16 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=b357088e0fe99d513bcdb768d249789cebc0b0d0;p=sentencepiece.git Supports ImmutableSentencePieceText from python module Signed-off-by: Kentaro Hayashi Gbp-Pq: Name 0012-Supports-ImmutableSentencePieceText-from-python-modu.patch --- diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py index 1543d32..69a9825 100644 --- a/python/src/sentencepiece/__init__.py +++ b/python/src/sentencepiece/__init__.py @@ -61,6 +61,98 @@ class _SwigNonDynamicMeta(type): __setattr__ = _swig_setattr_nondynamic_class_variable(type.__setattr__) +class ImmutableSentencePieceText_ImmutableSentencePiece(object): + thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") + __repr__ = _swig_repr + + def __init__(self): + _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText_ImmutableSentencePiece()) + __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText_ImmutableSentencePiece + + def piece(self): + return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_piece(self) + + def surface(self): + return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_surface(self) + + def id(self): + return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_id(self) + + def begin(self): + return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_begin(self) + + def end(self): + return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_end(self) + +# Register ImmutableSentencePieceText_ImmutableSentencePiece in _sentencepiece: +_sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swigregister(ImmutableSentencePieceText_ImmutableSentencePiece) + +class ImmutableSentencePieceText(object): + thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") + __repr__ = _swig_repr + + def __init__(self): + _sentencepiece.ImmutableSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText()) + __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText + + def pieces_size(self): + return _sentencepiece.ImmutableSentencePieceText_pieces_size(self) + + def text(self): + return _sentencepiece.ImmutableSentencePieceText_text(self) + + def score(self): + return _sentencepiece.ImmutableSentencePieceText_score(self) + + def SerializeAsString(self): + return _sentencepiece.ImmutableSentencePieceText_SerializeAsString(self) + + def pieces(self, index): + return _sentencepiece.ImmutableSentencePieceText_pieces(self, index) + + def __len__(self): + return self.pieces_size() + + def __getitem__(self, i): + return self.pieces(i) + + def __eq__(self, other): + return self.SerializeAsString() == other.SerializeAsString() + + +# Register ImmutableSentencePieceText in _sentencepiece: +_sentencepiece.ImmutableSentencePieceText_swigregister(ImmutableSentencePieceText) + +class ImmutableNBestSentencePieceText(object): + thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") + __repr__ = _swig_repr + + def __init__(self): + _sentencepiece.ImmutableNBestSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableNBestSentencePieceText()) + __swig_destroy__ = _sentencepiece.delete_ImmutableNBestSentencePieceText + + def nbests_size(self): + return _sentencepiece.ImmutableNBestSentencePieceText_nbests_size(self) + + def SerializeAsString(self): + return _sentencepiece.ImmutableNBestSentencePieceText_SerializeAsString(self) + + def nbests(self, index): + return _sentencepiece.ImmutableNBestSentencePieceText_nbests(self, index) + + def __len__(self): + return self.nbests_size() + + def __getitem__(self, i): + return self.nbests(i) + + def __eq__(self, other): + return self.SerializeAsString() == other.SerializeAsString() + + +# Register ImmutableNBestSentencePieceText in _sentencepiece: +_sentencepiece.ImmutableNBestSentencePieceText_swigregister(ImmutableNBestSentencePieceText) + class SentencePieceProcessor(object): thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") __repr__ = _swig_repr @@ -87,12 +179,6 @@ class SentencePieceProcessor(object): def LoadVocabulary(self, filename, threshold): return _sentencepiece.SentencePieceProcessor_LoadVocabulary(self, filename, threshold) - def SampleEncodeAndScoreAsPieces(self, input, num_samples, theta, wor, include_best): - return _sentencepiece.SentencePieceProcessor_SampleEncodeAndScoreAsPieces(self, input, num_samples, theta, wor, include_best) - - def SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, include_best): - return _sentencepiece.SentencePieceProcessor_SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, include_best) - def CalculateEntropy(self, *args): return _sentencepiece.SentencePieceProcessor_CalculateEntropy(self, *args) @@ -147,6 +233,9 @@ class SentencePieceProcessor(object): def _EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + def _EncodeAsImmutableProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__EncodeAsImmutableProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + def _EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): return _sentencepiece.SentencePieceProcessor__EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) @@ -156,6 +245,9 @@ class SentencePieceProcessor(object): def _EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + def _EncodeAsImmutableProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__EncodeAsImmutableProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + def _DecodeIds(self, ids): return _sentencepiece.SentencePieceProcessor__DecodeIds(self, ids) @@ -168,6 +260,12 @@ class SentencePieceProcessor(object): def _DecodePiecesAsSerializedProto(self, pieces): return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProto(self, pieces) + def _DecodeIdsAsImmutableProto(self, ids): + return _sentencepiece.SentencePieceProcessor__DecodeIdsAsImmutableProto(self, ids) + + def _DecodePiecesAsImmutableProto(self, pieces): + return _sentencepiece.SentencePieceProcessor__DecodePiecesAsImmutableProto(self, pieces) + def _DecodeIdsBatch(self, ins, num_threads): return _sentencepiece.SentencePieceProcessor__DecodeIdsBatch(self, ins, num_threads) @@ -180,6 +278,9 @@ class SentencePieceProcessor(object): def _DecodePiecesAsSerializedProtoBatch(self, ins, num_threads): return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(self, ins, num_threads) + def _DecodePiecesAsImmutableProtoBatch(self, ins, num_threads): + return _sentencepiece.SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch(self, ins, num_threads) + def _NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): return _sentencepiece.SentencePieceProcessor__NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) @@ -189,17 +290,26 @@ class SentencePieceProcessor(object): def _NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): return _sentencepiece.SentencePieceProcessor__NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) - def _SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + def _NBestEncodeAsImmutableProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__NBestEncodeAsImmutableProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + + def _SampleEncodeAndScoreAsIds(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) - def _SampleEncodeAndScoreAsPieces(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsPieces(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + def _SampleEncodeAndScoreAsPieces(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsPieces(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) - def _CalculateEntropy(self, text, theta): - return _sentencepiece.SentencePieceProcessor__CalculateEntropy(self, text, theta) + def _SampleEncodeAndScoreAsSerializedProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) - def _CalculateEntropyBatch(self, ins, theta, num_threads): - return _sentencepiece.SentencePieceProcessor__CalculateEntropyBatch(self, ins, theta, num_threads) + def _SampleEncodeAndScoreAsImmutableProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + + def _CalculateEntropy(self, text, alpha): + return _sentencepiece.SentencePieceProcessor__CalculateEntropy(self, text, alpha) + + def _CalculateEntropyBatch(self, ins, alpha, num_threads): + return _sentencepiece.SentencePieceProcessor__CalculateEntropyBatch(self, ins, alpha, num_threads) def Init(self, model_file=None, @@ -319,9 +429,12 @@ class SentencePieceProcessor(object): if out_type is str: return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) - if out_type == 'proto': + if out_type == 'serialized_proto' or out_type == 'proto': return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'immutable_proto': + return self._EncodeAsImmutableProtoBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) if out_type is int: return self._EncodeAsIds(input, enable_sampling, nbest_size, @@ -329,9 +442,12 @@ class SentencePieceProcessor(object): if out_type is str: return self._EncodeAsPieces(input, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) - if out_type == 'proto': + if out_type == 'serialized_proto' or out_type == 'proto': return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'immutable_proto': + return self._EncodeAsImmutableProto(input, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) raise RuntimeError('unknown out_type={}'.format(out_type)) return None @@ -346,7 +462,11 @@ class SentencePieceProcessor(object): def EncodeAsSerializedProto(self, input, **kwargs): - return self.Encode(input=input, out_type='proto', **kwargs) + return self.Encode(input=input, out_type='serialized_proto', **kwargs) + + + def EncodeAsImmutableProto(self, input, **kwargs): + return self.Encode(input=input, out_type='immutable_proto', **kwargs) def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs): @@ -361,7 +481,12 @@ class SentencePieceProcessor(object): def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs): return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, - out_type='proto', enable_sampling=True, **kwargs) + out_type='serialized_proto', enable_sampling=True, **kwargs) + + + def SampleEncodeAsImmutableProto(self, input, nbest_size=None, alpha=None, **kwargs): + return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, + out_type='immutable_proto', enable_sampling=True, **kwargs) def NBestEncode(self, @@ -407,9 +532,12 @@ class SentencePieceProcessor(object): if out_type is str: return self._NBestEncodeAsPieces(text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) - if out_type == 'proto': + if out_type == 'serialized_proto' or out_type == 'proto': return self._NBestEncodeAsSerializedProto(text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'immutable_proto': + return self._NBestEncodeAsImmutableProto(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) if type(input) is list: return [_encode(n) for n in input] @@ -429,7 +557,12 @@ class SentencePieceProcessor(object): def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs): return self.NBestEncode(input=input, nbest_size=nbest_size, - out_type='proto', **kwargs) + out_type='serialized_proto', **kwargs) + + + def NBestEncodeAsImmutableProto(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, + out_type='immutable_proto', **kwargs) def SampleEncodeAndScore(self, @@ -440,20 +573,20 @@ class SentencePieceProcessor(object): reverse=None, emit_unk_piece=None, num_samples=None, - theta=None, + alpha=None, wor=None, include_best=None): """SampleEncodeAndScore text input to segmented ids or tokens. Args: input: input string. accepsts list of string. - out_type: output type. int or str or 'proto'. + out_type: output type. int or str or 'serialized_proto' or 'immutable_proto' add_bos: Add to the result (Default = false) add_eos: Add to the result (Default = false) / is added after reversing (if enabled). reverse: Reverses the tokenized sequence (Default = false) emit_unk_piece: Emits the unk literal string (Default = false) num_samples: How many samples to return (Default = 1) - theta: inverse temperature for sampling + alpha: inverse temperature for sampling wor: whether to sample without replacement (Default = false) include_best: whether to include the best tokenization, requires wor=True (Default = false) """ @@ -470,8 +603,8 @@ class SentencePieceProcessor(object): emit_unk_piece = self._emit_unk_piece if num_samples is None: num_samples = 1 - if theta is None: - theta = 1. + if alpha is None: + alpha = 1. if wor is None: wor = False if include_best is None: @@ -486,10 +619,10 @@ class SentencePieceProcessor(object): def _encode(text): if out_type is int: - return self._SampleEncodeAndScoreAsIds(text, num_samples, theta, wor, include_best, + return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) else: - return self._SampleEncodeAndScoreAsPieces(text, num_samples, theta, wor, include_best, + return self._SampleEncodeAndScoreAsPieces(text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) if type(input) is list: @@ -502,7 +635,7 @@ class SentencePieceProcessor(object): """Decode processed id or token sequences. Args: - out_type: output type. str or 'proto' (Default = str) + out_type: output type. str or 'serialized_proto' or 'immutable_proto' (Default = str) num_threads: the number of threads used in the batch processin (Default = 1). """ @@ -533,7 +666,7 @@ class SentencePieceProcessor(object): if type(input[0][0]) is str: return self._DecodePiecesBatch(input, num_threads) - if out_type == 'proto': + if out_type == 'serialized_proto': if type(input) is int: return self._DecodeIdsAsSerializedProto([input]) if type(input) is str: @@ -552,6 +685,25 @@ class SentencePieceProcessor(object): return self._DecodePiecesAsSerializedProtoBatch(input, num_threads) + if out_type == 'immutable_proto': + if type(input) is int: + return self._DecodeIdsAsImmutableProto([input]) + if type(input) is str: + return self._DecodePiecesAsImmutableProto([input]) + + if type(input) is list: + if len(input) == 0 or type(input[0]) is int: + return self._DecodeIdsAsImmutableProto(input) + if type(input[0]) is str: + return self._DecodePiecesAsImmutableProto(input) + + if type(input[0]) is list: + if len(input[0]) == 0 or type(input[0][0]) is int: + return self._DecodeIdsAsImmutableProtoBatch(input, num_threads) + if type(input[0][0]) is str: + return self._DecodePiecesAsImmutableProtoBatch(input, num_threads) + + raise RuntimeError('unknown output or input type') return None @@ -564,24 +716,32 @@ class SentencePieceProcessor(object): return self.Decode(input=input, out_type=out_type, **kwargs) - def DecodePiecesAsSerializedProto(self, input, out_type='proto', **kwargs): + def DecodePiecesAsSerializedProto(self, input, out_type='serialized_proto', **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) + + + def DecodeIdsAsSerializedProto(self, input, out_type='serialized_proto', **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) + + + def DecodePiecesAsImmutableProto(self, input, out_type='immutable_proto', **kwargs): return self.Decode(input=input, out_type=out_type, **kwargs) - def DecodeIdsAsSerializedProto(self, input, out_type='proto', **kwargs): + def DecodeIdsAsImmutableProto(self, input, out_type='immutable_proto', **kwargs): return self.Decode(input=input, out_type=out_type, **kwargs) - def CalculateEntropy(self, input, theta, num_threads=None): + def CalculateEntropy(self, input, alpha, num_threads=None): """Calculate sentence entropy""" if type(input) is list: if num_threads is None: num_threads = self._num_threads if num_threads is None or type(num_threads) is not int: raise RuntimeError('num_threads must be int') - return self._CalculateEntropyBatch(input, theta, num_threads) + return self._CalculateEntropyBatch(input, alpha, num_threads) - return self._CalculateEntropy(input, theta) + return self._CalculateEntropy(input, alpha) def piece_size(self): diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i index 40373ce..1e2e1e0 100644 --- a/python/src/sentencepiece/sentencepiece.i +++ b/python/src/sentencepiece/sentencepiece.i @@ -166,7 +166,17 @@ inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, if (add_bos || add_eos || reverse || emit_unk_piece) { throw sentencepiece::util::Status( sentencepiece::util::StatusCode::kUnimplemented, - "add_bos, add_eos, reverse, and emit_unk_piece is not supported in AsSerialize API"); + "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API"); + } +} + +inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, + sentencepiece::ImmutableSentencePieceText *proto, + bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { + if (add_bos || add_eos || reverse || emit_unk_piece) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kUnimplemented, + "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API"); } } @@ -216,7 +226,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { #define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ std::vector outs(ins.size()); \ - InitNumThreads(ins, &num_threads); \ + InitNumThreads(ins, &num_threads); \ { \ ThreadPool pool(ins.size()); \ for (int n = 0; n < num_threads; ++n) { \ @@ -237,7 +247,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { #define DEFINE_DECODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ std::vector outs(ins.size()); \ - InitNumThreads(ins, &num_threads); \ + InitNumThreads(ins, &num_threads); \ { \ ThreadPool pool(ins.size()); \ for (int n = 0; n < num_threads; ++n) { \ @@ -264,6 +274,8 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { } } +%apply unsigned int { uint32_t } + %ignore sentencepiece::util::Status; %ignore sentencepiece::util::StatusCode; %ignore absl::string_view; @@ -272,32 +284,48 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { %ignore sentencepiece::NormalizerSpec; %ignore sentencepiece::TrainerSpec; %ignore sentencepiece::SentencePieceProcessor::status; +%ignore sentencepiece::ImmutableSentencePieceText::mutable_proto; +%ignore sentencepiece::ImmutableSentencePieceText::pieces() const; +%ignore sentencepiece::ImmutableNBestSentencePieceText::mutable_proto; +%ignore sentencepiece::ImmutableNBestSentencePieceText::nbests() const; %ignore sentencepiece::SentencePieceProcessor::Encode; +%ignore sentencepiece::SentencePieceProcessor::SampleEncode; +%ignore sentencepiece::SentencePieceProcessor::NBestEncode; +%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScore; +%ignore sentencepiece::SentencePieceProcessor::Decode; + %ignore sentencepiece::SentencePieceProcessor::EncodeAsPieces; %ignore sentencepiece::SentencePieceProcessor::EncodeAsIds; -%ignore sentencepiece::SentencePieceProcessor::EncodeAsSerializedProto; -%ignore sentencepiece::SentencePieceProcessor::SampleEncode; %ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsIds; %ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsPieces; -%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsSerializedProto; -%ignore sentencepiece::SentencePieceProcessor::NBestEncode; -%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsPieces; %ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsIds; -%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsSerializedProto; -%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScore; - -%ignore sentencepiece::SentencePieceProcessor::Decode; +%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsPieces; +%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsIds; +%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsPieces; %ignore sentencepiece::SentencePieceProcessor::DecodeIds; %ignore sentencepiece::SentencePieceProcessor::DecodePieces; + +%ignore sentencepiece::SentencePieceProcessor::EncodeAsSerializedProto; +%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsSerializedProto; +%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsSerializedProto; +%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsSerializedProto; %ignore sentencepiece::SentencePieceProcessor::DecodePiecesAsSerializedProto; %ignore sentencepiece::SentencePieceProcessor::DecodeIdsAsSerializedProto; +%ignore sentencepiece::SentencePieceProcessor::EncodeAsImmutableProto; +%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsImmutableProto; +%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsImmutableProto; +%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsImmutableProto; +%ignore sentencepiece::SentencePieceProcessor::DecodePiecesAsImmutableProto; +%ignore sentencepiece::SentencePieceProcessor::DecodeIdsAsImmutableProto; + %ignore sentencepiece::SentencePieceProcessor::model_proto; %ignore sentencepiece::SentencePieceProcessor::Load; %ignore sentencepiece::SentencePieceProcessor::LoadOrDie; %ignore sentencepiece::pretokenizer::PretokenizerForTrainingInterface; %ignore sentencepiece::SentenceIterator; +%ignore sentencepiece::ConvertToUnicodeSpans; %ignore sentencepiece::SentencePieceTrainer::Train; %ignore sentencepiece::SentencePieceTrainer::GetNormalizerSpec; %ignore sentencepiece::SentencePieceTrainer::PopulateNormalizerSpec; @@ -351,6 +379,19 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { return proto; } + sentencepiece::ImmutableSentencePieceText + _EncodeAsImmutableProto(absl::string_view text, + bool enable_sampling, + int nbest_size, float alpha, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + auto proto = enable_sampling ? + $self->SampleEncodeAsImmutableProto(text, nbest_size, alpha) : + $self->EncodeAsImmutableProto(text); + RewriteIds(*$self, &proto, add_bos, add_eos, reverse, emit_unk_piece); + return proto; + } + ///////////////////////////////////////////////////////////////////////////// // EncodeAs* (Batch request) std::vector> _EncodeAsIdsBatch( @@ -381,6 +422,17 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { sentencepiece::util::bytes); } + std::vector + _EncodeAsImmutableProtoBatch( + const std::vector &ins, int num_threads, + bool enable_sampling, int nbest_size, float alpha, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsImmutableProto, + absl::string_view, + sentencepiece::ImmutableSentencePieceText); + } + ///////////////////////////////////////////////////////////////////////////// // DecodeAs* (Single request) std::string _DecodeIds(const std::vector &ids) const { @@ -404,6 +456,18 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { return $self->DecodePiecesAsSerializedProto(pieces); } + sentencepiece::ImmutableSentencePieceText _DecodeIdsAsImmutableProto( + const std::vector &ids) const { + CheckIds(ids, $self->GetPieceSize()); + return $self->DecodeIdsAsImmutableProto(ids); + } + + sentencepiece::ImmutableSentencePieceText _DecodePiecesAsImmutableProto( + const std::vector &pieces) const { + CheckIds(pieces, $self->GetPieceSize()); + return $self->DecodePiecesAsImmutableProto(pieces); + } + ///////////////////////////////////////////////////////////////////////////// // DecodeAs* (Batch request) std::vector _DecodeIdsBatch( @@ -428,6 +492,13 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { sentencepiece::util::bytes); } + std::vector + _DecodePiecesAsImmutableProtoBatch( + const std::vector> &ins, int num_threads) const { + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsImmutableProto, std::string, + sentencepiece::ImmutableSentencePieceText); + } + //////////////////////////////////////////////////////////////////////////// // NBestEncodeAs* (Single request) std::vector> @@ -454,25 +525,37 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { return piecess; } - sentencepiece::util::bytes _NBestEncodeAsSerializedProto(absl::string_view text, - int nbest_size, - bool add_bos, bool add_eos, bool reverse, - bool emit_unk_piece) const { + sentencepiece::util::bytes + _NBestEncodeAsSerializedProto(absl::string_view text, + int nbest_size, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { RewriteIds(*$self, static_cast(nullptr), add_bos, add_eos, reverse, emit_unk_piece); return $self->NBestEncodeAsSerializedProto(text, nbest_size); } + sentencepiece::ImmutableNBestSentencePieceText + _NBestEncodeAsImmutableProto(absl::string_view text, + int nbest_size, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + RewriteIds(*$self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); + return $self->NBestEncodeAsImmutableProto(text, nbest_size); + } + + ///////////////////////////////////////////////////////////////////////////// // SampleEncodeAndScoreAs* (Single request) std::vector, float>> _SampleEncodeAndScoreAsIds(absl::string_view text, - int num_samples, float theta, bool wor, + int num_samples, float alpha, bool wor, bool include_best, bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) const { auto idss = $self->SampleEncodeAndScoreAsIds(text, num_samples, - theta, wor, include_best); + alpha, wor, include_best); for (auto &ids : idss) { RewriteIds(*$self, &ids.first, add_bos, add_eos, reverse, emit_unk_piece); } @@ -481,25 +564,50 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { std::vector, float>> _SampleEncodeAndScoreAsPieces(absl::string_view text, - int num_samples, float theta, bool wor, + int num_samples, float alpha, bool wor, bool include_best, bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) const { auto piecess = $self->SampleEncodeAndScoreAsPieces(text, num_samples, - theta, wor, include_best); + alpha, wor, include_best); for (auto &pieces : piecess) { RewriteIds(*$self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); } return piecess; } + sentencepiece::util::bytes + _SampleEncodeAndScoreAsSerializedProto(absl::string_view text, + int num_samples, float alpha, bool wor, + bool include_best, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + RewriteIds(*$self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); + return $self->SampleEncodeAndScoreAsSerializedProto(text, num_samples, + alpha, wor, include_best); + } + + sentencepiece::ImmutableNBestSentencePieceText + _SampleEncodeAndScoreAsImmutableProto(absl::string_view text, + int num_samples, float alpha, bool wor, + bool include_best, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + RewriteIds(*$self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); + return $self->SampleEncodeAndScoreAsImmutableProto(text, num_samples, + alpha, wor, include_best); + } + + // Calculate Entropy - float _CalculateEntropy(absl::string_view text, float theta) { - return $self->CalculateEntropy(text, theta); + float _CalculateEntropy(absl::string_view text, float alpha) { + return $self->CalculateEntropy(text, alpha); } std::vector _CalculateEntropyBatch(const std::vector &ins, - float theta, int num_threads) { + float alpha, int num_threads) { std::vector outs(ins.size()); InitNumThreads(ins, &num_threads); { @@ -507,7 +615,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { for (int n = 0; n < num_threads; ++n) { pool.Schedule([&, n]() { for (size_t i = n; i < ins.size(); i += num_threads) { - outs[i] = self->CalculateEntropy(ins[i], theta); + outs[i] = self->CalculateEntropy(ins[i], alpha); } }); } @@ -634,9 +742,12 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { if out_type is str: return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) - if out_type == 'proto': + if out_type == 'serialized_proto' or out_type == 'proto': return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'immutable_proto': + return self._EncodeAsImmutableProtoBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) if out_type is int: return self._EncodeAsIds(input, enable_sampling, nbest_size, @@ -644,9 +755,12 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { if out_type is str: return self._EncodeAsPieces(input, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) - if out_type == 'proto': + if out_type == 'serialized_proto' or out_type == 'proto': return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'immutable_proto': + return self._EncodeAsImmutableProto(input, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) raise RuntimeError('unknown out_type={}'.format(out_type)) return None @@ -661,7 +775,11 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { def EncodeAsSerializedProto(self, input, **kwargs): - return self.Encode(input=input, out_type='proto', **kwargs) + return self.Encode(input=input, out_type='serialized_proto', **kwargs) + + + def EncodeAsImmutableProto(self, input, **kwargs): + return self.Encode(input=input, out_type='immutable_proto', **kwargs) def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs): @@ -676,7 +794,12 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs): return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, - out_type='proto', enable_sampling=True, **kwargs) + out_type='serialized_proto', enable_sampling=True, **kwargs) + + + def SampleEncodeAsImmutableProto(self, input, nbest_size=None, alpha=None, **kwargs): + return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, + out_type='immutable_proto', enable_sampling=True, **kwargs) def NBestEncode(self, @@ -722,9 +845,12 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { if out_type is str: return self._NBestEncodeAsPieces(text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) - if out_type == 'proto': + if out_type == 'serialized_proto' or out_type == 'proto': return self._NBestEncodeAsSerializedProto(text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'immutable_proto': + return self._NBestEncodeAsImmutableProto(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) if type(input) is list: return [_encode(n) for n in input] @@ -744,7 +870,12 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs): return self.NBestEncode(input=input, nbest_size=nbest_size, - out_type='proto', **kwargs) + out_type='serialized_proto', **kwargs) + + + def NBestEncodeAsImmutableProto(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, + out_type='immutable_proto', **kwargs) def SampleEncodeAndScore(self, @@ -755,20 +886,20 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { reverse=None, emit_unk_piece=None, num_samples=None, - theta=None, + alpha=None, wor=None, include_best=None): """SampleEncodeAndScore text input to segmented ids or tokens. Args: input: input string. accepsts list of string. - out_type: output type. int or str or 'proto'. + out_type: output type. int or str or 'serialized_proto' or 'immutable_proto' add_bos: Add to the result (Default = false) add_eos: Add to the result (Default = false) / is added after reversing (if enabled). reverse: Reverses the tokenized sequence (Default = false) emit_unk_piece: Emits the unk literal string (Default = false) num_samples: How many samples to return (Default = 1) - theta: inverse temperature for sampling + alpha: inverse temperature for sampling wor: whether to sample without replacement (Default = false) include_best: whether to include the best tokenization, requires wor=True (Default = false) """ @@ -785,8 +916,8 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { emit_unk_piece = self._emit_unk_piece if num_samples is None: num_samples = 1 - if theta is None: - theta = 1. + if alpha is None: + alpha = 1. if wor is None: wor = False if include_best is None: @@ -801,10 +932,10 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { def _encode(text): if out_type is int: - return self._SampleEncodeAndScoreAsIds(text, num_samples, theta, wor, include_best, + return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) else: - return self._SampleEncodeAndScoreAsPieces(text, num_samples, theta, wor, include_best, + return self._SampleEncodeAndScoreAsPieces(text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) if type(input) is list: @@ -817,7 +948,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { """Decode processed id or token sequences. Args: - out_type: output type. str or 'proto' (Default = str) + out_type: output type. str or 'serialized_proto' or 'immutable_proto' (Default = str) num_threads: the number of threads used in the batch processin (Default = 1). """ @@ -848,7 +979,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { if type(input[0][0]) is str: return self._DecodePiecesBatch(input, num_threads) - if out_type == 'proto': + if out_type == 'serialized_proto': if type(input) is int: return self._DecodeIdsAsSerializedProto([input]) if type(input) is str: @@ -867,6 +998,25 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { return self._DecodePiecesAsSerializedProtoBatch(input, num_threads) + if out_type == 'immutable_proto': + if type(input) is int: + return self._DecodeIdsAsImmutableProto([input]) + if type(input) is str: + return self._DecodePiecesAsImmutableProto([input]) + + if type(input) is list: + if len(input) == 0 or type(input[0]) is int: + return self._DecodeIdsAsImmutableProto(input) + if type(input[0]) is str: + return self._DecodePiecesAsImmutableProto(input) + + if type(input[0]) is list: + if len(input[0]) == 0 or type(input[0][0]) is int: + return self._DecodeIdsAsImmutableProtoBatch(input, num_threads) + if type(input[0][0]) is str: + return self._DecodePiecesAsImmutableProtoBatch(input, num_threads) + + raise RuntimeError('unknown output or input type') return None @@ -879,24 +1029,32 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { return self.Decode(input=input, out_type=out_type, **kwargs) - def DecodePiecesAsSerializedProto(self, input, out_type='proto', **kwargs): + def DecodePiecesAsSerializedProto(self, input, out_type='serialized_proto', **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) + + + def DecodeIdsAsSerializedProto(self, input, out_type='serialized_proto', **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) + + + def DecodePiecesAsImmutableProto(self, input, out_type='immutable_proto', **kwargs): return self.Decode(input=input, out_type=out_type, **kwargs) - def DecodeIdsAsSerializedProto(self, input, out_type='proto', **kwargs): + def DecodeIdsAsImmutableProto(self, input, out_type='immutable_proto', **kwargs): return self.Decode(input=input, out_type=out_type, **kwargs) - def CalculateEntropy(self, input, theta, num_threads=None): + def CalculateEntropy(self, input, alpha, num_threads=None): """Calculate sentence entropy""" if type(input) is list: if num_threads is None: num_threads = self._num_threads if num_threads is None or type(num_threads) is not int: raise RuntimeError('num_threads must be int') - return self._CalculateEntropyBatch(input, theta, num_threads) + return self._CalculateEntropyBatch(input, alpha, num_threads) - return self._CalculateEntropy(input, theta) + return self._CalculateEntropy(input, alpha) def piece_size(self): @@ -1028,6 +1186,50 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { } } +%extend sentencepiece::ImmutableSentencePieceText { + ImmutableSentencePieceText_ImmutableSentencePiece pieces(int index) const { + if (index < 0 || index >= static_cast($self->pieces_size())) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kOutOfRange, + "piece index is out of range."); + } + return $self->pieces(index); + } + +%pythoncode { + def __len__(self): + return self.pieces_size() + + def __getitem__(self, i): + return self.pieces(i) + + def __eq__(self, other): + return self.SerializeAsString() == other.SerializeAsString() +} +} + +%extend sentencepiece::ImmutableNBestSentencePieceText { + ImmutableSentencePieceText nbests(int index) const { + if (index < 0 || index >= static_cast($self->nbests_size())) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kOutOfRange, + "nbest index is out of range."); + } + return $self->nbests(index); + } + +%pythoncode { + def __len__(self): + return self.nbests_size() + + def __getitem__(self, i): + return self.nbests(i) + + def __eq__(self, other): + return self.SerializeAsString() == other.SerializeAsString() +} +} + %typemap(out) std::vector { $result = PyList_New($1.size()); for (size_t i = 0; i < $1.size(); ++i) { @@ -1277,6 +1479,14 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { } } +%typemap(out) std::vector { + $result = PyList_New($1.size()); + for (size_t i = 0; i < $1.size(); ++i) { + PyObject *obj = SWIG_NewPointerObj(new sentencepiece::ImmutableSentencePieceText($1.at(i)), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0); + PyList_SET_ITEM($result, i, obj); + } +} + %typemap(in) sentencepiece::SentenceIterator * { sentencepiece::SentenceIterator *out = nullptr; if (PyIter_Check($input)) { @@ -1324,6 +1534,18 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { delete $1; } +%typemap(freearg) sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece { + delete $1; +} + +%typemap(freearg) sentencepiece::ImmutableSentencePieceText { + delete $1; +} + +%typemap(freearg) sentencepiece::ImmutableNBestSentencePieceText { + delete $1; +} + %include %include diff --git a/python/src/sentencepiece/sentencepiece_wrap.cxx b/python/src/sentencepiece/sentencepiece_wrap.cxx index 36ce38c..9776b0f 100644 --- a/python/src/sentencepiece/sentencepiece_wrap.cxx +++ b/python/src/sentencepiece/sentencepiece_wrap.cxx @@ -2694,17 +2694,20 @@ SWIGINTERN PyObject *SWIG_PyStaticMethod_New(PyObject *SWIGUNUSEDPARM(self), PyO #define SWIGTYPE_p_char swig_types[0] #define SWIGTYPE_p_float swig_types[1] -#define SWIGTYPE_p_sentencepiece__SentenceIterator swig_types[2] -#define SWIGTYPE_p_sentencepiece__SentencePieceProcessor swig_types[3] -#define SWIGTYPE_p_sentencepiece__SentencePieceTrainer swig_types[4] -#define SWIGTYPE_p_std__string swig_types[5] -#define SWIGTYPE_p_std__unordered_mapT_std__string_std__string_t swig_types[6] -#define SWIGTYPE_p_std__vectorT_absl__string_view_t swig_types[7] -#define SWIGTYPE_p_std__vectorT_int_t swig_types[8] -#define SWIGTYPE_p_std__vectorT_std__vectorT_absl__string_view_t_t swig_types[9] -#define SWIGTYPE_p_std__vectorT_std__vectorT_int_t_t swig_types[10] -static swig_type_info *swig_types[12]; -static swig_module_info swig_module = {swig_types, 11, 0, 0, 0, 0}; +#define SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText swig_types[2] +#define SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText swig_types[3] +#define SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece swig_types[4] +#define SWIGTYPE_p_sentencepiece__SentenceIterator swig_types[5] +#define SWIGTYPE_p_sentencepiece__SentencePieceProcessor swig_types[6] +#define SWIGTYPE_p_sentencepiece__SentencePieceTrainer swig_types[7] +#define SWIGTYPE_p_std__string swig_types[8] +#define SWIGTYPE_p_std__unordered_mapT_std__string_std__string_t swig_types[9] +#define SWIGTYPE_p_std__vectorT_absl__string_view_t swig_types[10] +#define SWIGTYPE_p_std__vectorT_int_t swig_types[11] +#define SWIGTYPE_p_std__vectorT_std__vectorT_absl__string_view_t_t swig_types[12] +#define SWIGTYPE_p_std__vectorT_std__vectorT_int_t_t swig_types[13] +static swig_type_info *swig_types[15]; +static swig_module_info swig_module = {swig_types, 14, 0, 0, 0, 0}; #define SWIG_TypeQuery(name) SWIG_TypeQueryModule(&swig_module, &swig_module, name) #define SWIG_MangledTypeQuery(name) SWIG_MangledTypeQueryModule(&swig_module, &swig_module, name) @@ -2972,7 +2975,17 @@ inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, if (add_bos || add_eos || reverse || emit_unk_piece) { throw sentencepiece::util::Status( sentencepiece::util::StatusCode::kUnimplemented, - "add_bos, add_eos, reverse, and emit_unk_piece is not supported in AsSerialize API"); + "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API"); + } +} + +inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, + sentencepiece::ImmutableSentencePieceText *proto, + bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { + if (add_bos || add_eos || reverse || emit_unk_piece) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kUnimplemented, + "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API"); } } @@ -3022,7 +3035,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { #define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ std::vector outs(ins.size()); \ - InitNumThreads(ins, &num_threads); \ + InitNumThreads(ins, &num_threads); \ { \ ThreadPool pool(ins.size()); \ for (int n = 0; n < num_threads; ++n) { \ @@ -3043,7 +3056,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { #define DEFINE_DECODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ std::vector outs(ins.size()); \ - InitNumThreads(ins, &num_threads); \ + InitNumThreads(ins, &num_threads); \ { \ ThreadPool pool(ins.size()); \ for (int n = 0; n < num_threads; ++n) { \ @@ -3060,131 +3073,24 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { } // namespace -SWIGINTERN swig_type_info* -SWIG_pchar_descriptor(void) +SWIGINTERNINLINE PyObject* + SWIG_From_unsigned_SS_int (unsigned int value) { - static int init = 0; - static swig_type_info* info = 0; - if (!init) { - info = SWIG_TypeQuery("_p_char"); - init = 1; - } - return info; + return PyInt_FromSize_t((size_t) value); } -SWIGINTERN int -SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc) -{ -#if PY_VERSION_HEX>=0x03000000 -#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) - if (PyBytes_Check(obj)) -#else - if (PyUnicode_Check(obj)) -#endif -#else - if (PyString_Check(obj)) -#endif - { - char *cstr; Py_ssize_t len; - int ret = SWIG_OK; -#if PY_VERSION_HEX>=0x03000000 -#if !defined(SWIG_PYTHON_STRICT_BYTE_CHAR) - if (!alloc && cptr) { - /* We can't allow converting without allocation, since the internal - representation of string in Python 3 is UCS-2/UCS-4 but we require - a UTF-8 representation. - TODO(bhy) More detailed explanation */ - return SWIG_RuntimeError; - } - obj = PyUnicode_AsUTF8String(obj); - if (!obj) - return SWIG_TypeError; - if (alloc) - *alloc = SWIG_NEWOBJ; -#endif - if (PyBytes_AsStringAndSize(obj, &cstr, &len) == -1) - return SWIG_TypeError; -#else - if (PyString_AsStringAndSize(obj, &cstr, &len) == -1) - return SWIG_TypeError; -#endif - if (cptr) { - if (alloc) { - if (*alloc == SWIG_NEWOBJ) { - *cptr = reinterpret_cast< char* >(memcpy(new char[len + 1], cstr, sizeof(char)*(len + 1))); - *alloc = SWIG_NEWOBJ; - } else { - *cptr = cstr; - *alloc = SWIG_OLDOBJ; - } - } else { -#if PY_VERSION_HEX>=0x03000000 -#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) - *cptr = PyBytes_AsString(obj); -#else - assert(0); /* Should never reach here with Unicode strings in Python 3 */ -#endif -#else - *cptr = SWIG_Python_str_AsChar(obj); - if (!*cptr) - ret = SWIG_TypeError; -#endif - } - } - if (psize) *psize = len + 1; -#if PY_VERSION_HEX>=0x03000000 && !defined(SWIG_PYTHON_STRICT_BYTE_CHAR) - Py_XDECREF(obj); -#endif - return ret; - } else { -#if defined(SWIG_PYTHON_2_UNICODE) -#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) -#error "Cannot use both SWIG_PYTHON_2_UNICODE and SWIG_PYTHON_STRICT_BYTE_CHAR at once" -#endif -#if PY_VERSION_HEX<0x03000000 - if (PyUnicode_Check(obj)) { - char *cstr; Py_ssize_t len; - if (!alloc && cptr) { - return SWIG_RuntimeError; - } - obj = PyUnicode_AsUTF8String(obj); - if (!obj) - return SWIG_TypeError; - if (PyString_AsStringAndSize(obj, &cstr, &len) != -1) { - if (cptr) { - if (alloc) *alloc = SWIG_NEWOBJ; - *cptr = reinterpret_cast< char* >(memcpy(new char[len + 1], cstr, sizeof(char)*(len + 1))); - } - if (psize) *psize = len + 1; + #define SWIG_From_long PyInt_FromLong - Py_XDECREF(obj); - return SWIG_OK; - } else { - Py_XDECREF(obj); - } - } -#endif -#endif - swig_type_info* pchar_descriptor = SWIG_pchar_descriptor(); - if (pchar_descriptor) { - void* vptr = 0; - if (SWIG_ConvertPtr(obj, &vptr, pchar_descriptor, 0) == SWIG_OK) { - if (cptr) *cptr = (char *) vptr; - if (psize) *psize = vptr ? (strlen((char *)vptr) + 1) : 0; - if (alloc) *alloc = SWIG_OLDOBJ; - return SWIG_OK; - } - } - } - return SWIG_TypeError; +SWIGINTERNINLINE PyObject* +SWIG_From_unsigned_SS_long (unsigned long value) +{ + return (value > LONG_MAX) ? + PyLong_FromUnsignedLong(value) : PyInt_FromLong(static_cast< long >(value)); } - - - #include #if !defined(SWIG_NO_LLONG_MAX) # if !defined(LLONG_MAX) && defined(__GNUC__) && defined (__LONG_LONG_MAX__) @@ -3195,6 +3101,47 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc) #endif +#if defined(LLONG_MAX) && !defined(SWIG_LONG_LONG_AVAILABLE) +# define SWIG_LONG_LONG_AVAILABLE +#endif + + +#ifdef SWIG_LONG_LONG_AVAILABLE +SWIGINTERNINLINE PyObject* +SWIG_From_unsigned_SS_long_SS_long (unsigned long long value) +{ + return (value > LONG_MAX) ? + PyLong_FromUnsignedLongLong(value) : PyInt_FromLong(static_cast< long >(value)); +} +#endif + + +SWIGINTERNINLINE PyObject * +SWIG_From_size_t (size_t value) +{ +#ifdef SWIG_LONG_LONG_AVAILABLE + if (sizeof(size_t) <= sizeof(unsigned long)) { +#endif + return SWIG_From_unsigned_SS_long (static_cast< unsigned long >(value)); +#ifdef SWIG_LONG_LONG_AVAILABLE + } else { + /* assume sizeof(size_t) <= sizeof(unsigned long long) */ + return SWIG_From_unsigned_SS_long_SS_long (static_cast< unsigned long long >(value)); + } +#endif +} + + + #define SWIG_From_double PyFloat_FromDouble + + +SWIGINTERNINLINE PyObject * +SWIG_From_float (float value) +{ + return SWIG_From_double (value); +} + + SWIGINTERN int SWIG_AsVal_double (PyObject *obj, double *val) { @@ -3335,98 +3282,215 @@ SWIG_AsVal_int (PyObject * obj, int *val) return res; } - -/* Getting isfinite working pre C99 across multiple platforms is non-trivial. Users can provide SWIG_isfinite on older platforms. */ -#ifndef SWIG_isfinite -/* isfinite() is a macro for C99 */ -# if defined(isfinite) -# define SWIG_isfinite(X) (isfinite(X)) -# elif defined(__cplusplus) && __cplusplus >= 201103L -/* Use a template so that this works whether isfinite() is std::isfinite() or - * in the global namespace. The reality seems to vary between compiler - * versions. - * - * Make sure namespace std exists to avoid compiler warnings. - * - * extern "C++" is required as this fragment can end up inside an extern "C" { } block - */ -namespace std { } -extern "C++" template -inline int SWIG_isfinite_func(T x) { - using namespace std; - return isfinite(x); -} -# define SWIG_isfinite(X) (SWIG_isfinite_func(X)) -# elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) -# define SWIG_isfinite(X) (__builtin_isfinite(X)) -# elif defined(__clang__) && defined(__has_builtin) -# if __has_builtin(__builtin_isfinite) -# define SWIG_isfinite(X) (__builtin_isfinite(X)) -# endif -# elif defined(_MSC_VER) -# define SWIG_isfinite(X) (_finite(X)) -# elif defined(__sun) && defined(__SVR4) -# include -# define SWIG_isfinite(X) (finite(X)) -# endif -#endif - - -/* Accept infinite as a valid float value unless we are unable to check if a value is finite */ -#ifdef SWIG_isfinite -# define SWIG_Float_Overflow_Check(X) ((X < -FLT_MAX || X > FLT_MAX) && SWIG_isfinite(X)) -#else -# define SWIG_Float_Overflow_Check(X) ((X < -FLT_MAX || X > FLT_MAX)) -#endif - - -SWIGINTERN int -SWIG_AsVal_float (PyObject * obj, float *val) -{ - double v; - int res = SWIG_AsVal_double (obj, &v); - if (SWIG_IsOK(res)) { - if (SWIG_Float_Overflow_Check(v)) { - return SWIG_OverflowError; - } else { - if (val) *val = static_cast< float >(v); +SWIGINTERN sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece sentencepiece_ImmutableSentencePieceText_pieces(sentencepiece::ImmutableSentencePieceText const *self,int index){ + if (index < 0 || index >= static_cast(self->pieces_size())) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kOutOfRange, + "piece index is out of range."); } - } - return res; -} - + return self->pieces(index); + } +SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_ImmutableNBestSentencePieceText_nbests(sentencepiece::ImmutableNBestSentencePieceText const *self,int index){ + if (index < 0 || index >= static_cast(self->nbests_size())) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kOutOfRange, + "nbest index is out of range."); + } + return self->nbests(index); + } -SWIGINTERN int -SWIG_AsVal_bool (PyObject *obj, bool *val) +SWIGINTERN swig_type_info* +SWIG_pchar_descriptor(void) { - int r; - if (!PyBool_Check(obj)) - return SWIG_ERROR; - r = PyObject_IsTrue(obj); - if (r == -1) - return SWIG_ERROR; - if (val) *val = r ? true : false; - return SWIG_OK; -} - - - #define SWIG_From_double PyFloat_FromDouble - - -SWIGINTERNINLINE PyObject * -SWIG_From_float (float value) -{ - return SWIG_From_double (value); + static int init = 0; + static swig_type_info* info = 0; + if (!init) { + info = SWIG_TypeQuery("_p_char"); + init = 1; + } + return info; } -SWIGINTERNINLINE PyObject* - SWIG_From_int (int value) +SWIGINTERN int +SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc) { - return PyInt_FromLong((long) value); -} - - +#if PY_VERSION_HEX>=0x03000000 +#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) + if (PyBytes_Check(obj)) +#else + if (PyUnicode_Check(obj)) +#endif +#else + if (PyString_Check(obj)) +#endif + { + char *cstr; Py_ssize_t len; + int ret = SWIG_OK; +#if PY_VERSION_HEX>=0x03000000 +#if !defined(SWIG_PYTHON_STRICT_BYTE_CHAR) + if (!alloc && cptr) { + /* We can't allow converting without allocation, since the internal + representation of string in Python 3 is UCS-2/UCS-4 but we require + a UTF-8 representation. + TODO(bhy) More detailed explanation */ + return SWIG_RuntimeError; + } + obj = PyUnicode_AsUTF8String(obj); + if (!obj) + return SWIG_TypeError; + if (alloc) + *alloc = SWIG_NEWOBJ; +#endif + if (PyBytes_AsStringAndSize(obj, &cstr, &len) == -1) + return SWIG_TypeError; +#else + if (PyString_AsStringAndSize(obj, &cstr, &len) == -1) + return SWIG_TypeError; +#endif + if (cptr) { + if (alloc) { + if (*alloc == SWIG_NEWOBJ) { + *cptr = reinterpret_cast< char* >(memcpy(new char[len + 1], cstr, sizeof(char)*(len + 1))); + *alloc = SWIG_NEWOBJ; + } else { + *cptr = cstr; + *alloc = SWIG_OLDOBJ; + } + } else { +#if PY_VERSION_HEX>=0x03000000 +#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) + *cptr = PyBytes_AsString(obj); +#else + assert(0); /* Should never reach here with Unicode strings in Python 3 */ +#endif +#else + *cptr = SWIG_Python_str_AsChar(obj); + if (!*cptr) + ret = SWIG_TypeError; +#endif + } + } + if (psize) *psize = len + 1; +#if PY_VERSION_HEX>=0x03000000 && !defined(SWIG_PYTHON_STRICT_BYTE_CHAR) + Py_XDECREF(obj); +#endif + return ret; + } else { +#if defined(SWIG_PYTHON_2_UNICODE) +#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) +#error "Cannot use both SWIG_PYTHON_2_UNICODE and SWIG_PYTHON_STRICT_BYTE_CHAR at once" +#endif +#if PY_VERSION_HEX<0x03000000 + if (PyUnicode_Check(obj)) { + char *cstr; Py_ssize_t len; + if (!alloc && cptr) { + return SWIG_RuntimeError; + } + obj = PyUnicode_AsUTF8String(obj); + if (!obj) + return SWIG_TypeError; + if (PyString_AsStringAndSize(obj, &cstr, &len) != -1) { + if (cptr) { + if (alloc) *alloc = SWIG_NEWOBJ; + *cptr = reinterpret_cast< char* >(memcpy(new char[len + 1], cstr, sizeof(char)*(len + 1))); + } + if (psize) *psize = len + 1; + + Py_XDECREF(obj); + return SWIG_OK; + } else { + Py_XDECREF(obj); + } + } +#endif +#endif + + swig_type_info* pchar_descriptor = SWIG_pchar_descriptor(); + if (pchar_descriptor) { + void* vptr = 0; + if (SWIG_ConvertPtr(obj, &vptr, pchar_descriptor, 0) == SWIG_OK) { + if (cptr) *cptr = (char *) vptr; + if (psize) *psize = vptr ? (strlen((char *)vptr) + 1) : 0; + if (alloc) *alloc = SWIG_OLDOBJ; + return SWIG_OK; + } + } + } + return SWIG_TypeError; +} + + + + + +/* Getting isfinite working pre C99 across multiple platforms is non-trivial. Users can provide SWIG_isfinite on older platforms. */ +#ifndef SWIG_isfinite +/* isfinite() is a macro for C99 */ +# if defined(isfinite) +# define SWIG_isfinite(X) (isfinite(X)) +# elif defined(__cplusplus) && __cplusplus >= 201103L +/* Use a template so that this works whether isfinite() is std::isfinite() or + * in the global namespace. The reality seems to vary between compiler + * versions. + * + * Make sure namespace std exists to avoid compiler warnings. + * + * extern "C++" is required as this fragment can end up inside an extern "C" { } block + */ +namespace std { } +extern "C++" template +inline int SWIG_isfinite_func(T x) { + using namespace std; + return isfinite(x); +} +# define SWIG_isfinite(X) (SWIG_isfinite_func(X)) +# elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +# define SWIG_isfinite(X) (__builtin_isfinite(X)) +# elif defined(__clang__) && defined(__has_builtin) +# if __has_builtin(__builtin_isfinite) +# define SWIG_isfinite(X) (__builtin_isfinite(X)) +# endif +# elif defined(_MSC_VER) +# define SWIG_isfinite(X) (_finite(X)) +# elif defined(__sun) && defined(__SVR4) +# include +# define SWIG_isfinite(X) (finite(X)) +# endif +#endif + + +/* Accept infinite as a valid float value unless we are unable to check if a value is finite */ +#ifdef SWIG_isfinite +# define SWIG_Float_Overflow_Check(X) ((X < -FLT_MAX || X > FLT_MAX) && SWIG_isfinite(X)) +#else +# define SWIG_Float_Overflow_Check(X) ((X < -FLT_MAX || X > FLT_MAX)) +#endif + + +SWIGINTERN int +SWIG_AsVal_float (PyObject * obj, float *val) +{ + double v; + int res = SWIG_AsVal_double (obj, &v); + if (SWIG_IsOK(res)) { + if (SWIG_Float_Overflow_Check(v)) { + return SWIG_OverflowError; + } else { + if (val) *val = static_cast< float >(v); + } + } + return res; +} + + +SWIGINTERNINLINE PyObject* + SWIG_From_int (int value) +{ + return PyInt_FromLong((long) value); +} + + SWIGINTERNINLINE PyObject* SWIG_From_bool (bool value) { @@ -3436,6 +3500,20 @@ SWIGINTERNINLINE PyObject* SWIGINTERN sentencepiece::util::Status sentencepiece_SentencePieceProcessor_LoadFromFile(sentencepiece::SentencePieceProcessor *self,absl::string_view arg){ return self->Load(arg); } + +SWIGINTERN int +SWIG_AsVal_bool (PyObject *obj, bool *val) +{ + int r; + if (!PyBool_Check(obj)) + return SWIG_ERROR; + r = PyObject_IsTrue(obj); + if (r == -1) + return SWIG_ERROR; + if (val) *val = r ? true : false; + return SWIG_OK; +} + SWIGINTERN std::vector< int > sentencepiece_SentencePieceProcessor__EncodeAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ auto ids = enable_sampling ? self->SampleEncodeAsIds(text, nbest_size, alpha) : @@ -3457,6 +3535,13 @@ SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__Enco RewriteIds(*self, &proto, add_bos, add_eos, reverse, emit_unk_piece); return proto; } +SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_SentencePieceProcessor__EncodeAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto proto = enable_sampling ? + self->SampleEncodeAsImmutableProto(text, nbest_size, alpha) : + self->EncodeAsImmutableProto(text); + RewriteIds(*self, &proto, add_bos, add_eos, reverse, emit_unk_piece); + return proto; + } SWIGINTERN std::vector< std::vector< int > > sentencepiece_SentencePieceProcessor__EncodeAsIdsBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &ins,int num_threads,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsIds, absl::string_view, std::vector); @@ -3470,6 +3555,11 @@ SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__EncodeAsSerializedPr absl::string_view, sentencepiece::util::bytes); } +SWIGINTERN std::vector< sentencepiece::ImmutableSentencePieceText > sentencepiece_SentencePieceProcessor__EncodeAsImmutableProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &ins,int num_threads,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsImmutableProto, + absl::string_view, + sentencepiece::ImmutableSentencePieceText); + } SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodeIds(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ CheckIds(ids, self->GetPieceSize()); return self->DecodeIds(ids); @@ -3485,6 +3575,14 @@ SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__Deco CheckIds(pieces, self->GetPieceSize()); return self->DecodePiecesAsSerializedProto(pieces); } +SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_SentencePieceProcessor__DecodeIdsAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ + CheckIds(ids, self->GetPieceSize()); + return self->DecodeIdsAsImmutableProto(ids); + } +SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_SentencePieceProcessor__DecodePiecesAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &pieces){ + CheckIds(pieces, self->GetPieceSize()); + return self->DecodePiecesAsImmutableProto(pieces); + } SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodeIdsBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< int > > const &ins,int num_threads){ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string); } @@ -3499,6 +3597,10 @@ SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodePiecesAsSerial DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string, sentencepiece::util::bytes); } +SWIGINTERN std::vector< sentencepiece::ImmutableSentencePieceText > sentencepiece_SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< absl::string_view > > const &ins,int num_threads){ + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsImmutableProto, std::string, + sentencepiece::ImmutableSentencePieceText); + } SWIGINTERN std::vector< std::vector< int > > sentencepiece_SentencePieceProcessor__NBestEncodeAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ auto idss = self->NBestEncodeAsIds(text, nbest_size); for (auto &ids : idss) { @@ -3518,26 +3620,43 @@ SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__NBes add_bos, add_eos, reverse, emit_unk_piece); return self->NBestEncodeAsSerializedProto(text, nbest_size); } -SWIGINTERN std::vector< std::pair< std::vector< int >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float theta,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ +SWIGINTERN sentencepiece::ImmutableNBestSentencePieceText sentencepiece_SentencePieceProcessor__NBestEncodeAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + RewriteIds(*self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); + return self->NBestEncodeAsImmutableProto(text, nbest_size); + } +SWIGINTERN std::vector< std::pair< std::vector< int >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ auto idss = self->SampleEncodeAndScoreAsIds(text, num_samples, - theta, wor, include_best); + alpha, wor, include_best); for (auto &ids : idss) { RewriteIds(*self, &ids.first, add_bos, add_eos, reverse, emit_unk_piece); } return idss; } -SWIGINTERN std::vector< std::pair< std::vector< std::string >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float theta,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ +SWIGINTERN std::vector< std::pair< std::vector< std::string >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ auto piecess = self->SampleEncodeAndScoreAsPieces(text, num_samples, - theta, wor, include_best); + alpha, wor, include_best); for (auto &pieces : piecess) { RewriteIds(*self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); } return piecess; } -SWIGINTERN float sentencepiece_SentencePieceProcessor__CalculateEntropy(sentencepiece::SentencePieceProcessor *self,absl::string_view text,float theta){ - return self->CalculateEntropy(text, theta); +SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + RewriteIds(*self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); + return self->SampleEncodeAndScoreAsSerializedProto(text, num_samples, + alpha, wor, include_best); + } +SWIGINTERN sentencepiece::ImmutableNBestSentencePieceText sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + RewriteIds(*self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); + return self->SampleEncodeAndScoreAsImmutableProto(text, num_samples, + alpha, wor, include_best); + } +SWIGINTERN float sentencepiece_SentencePieceProcessor__CalculateEntropy(sentencepiece::SentencePieceProcessor *self,absl::string_view text,float alpha){ + return self->CalculateEntropy(text, alpha); } -SWIGINTERN std::vector< float > sentencepiece_SentencePieceProcessor__CalculateEntropyBatch(sentencepiece::SentencePieceProcessor *self,std::vector< absl::string_view > const &ins,float theta,int num_threads){ +SWIGINTERN std::vector< float > sentencepiece_SentencePieceProcessor__CalculateEntropyBatch(sentencepiece::SentencePieceProcessor *self,std::vector< absl::string_view > const &ins,float alpha,int num_threads){ std::vector outs(ins.size()); InitNumThreads(ins, &num_threads); { @@ -3545,7 +3664,7 @@ SWIGINTERN std::vector< float > sentencepiece_SentencePieceProcessor__CalculateE for (int n = 0; n < num_threads; ++n) { pool.Schedule([&, n]() { for (size_t i = n; i < ins.size(); i += num_threads) { - outs[i] = self->CalculateEntropy(ins[i], theta); + outs[i] = self->CalculateEntropy(ins[i], alpha); } }); } @@ -3596,56 +3715,672 @@ SWIG_AsVal_unsigned_SS_long (PyObject *obj, unsigned long *val) } } } -#endif - return SWIG_TypeError; +#endif + return SWIG_TypeError; +} + + +SWIGINTERN int +SWIG_AsVal_unsigned_SS_int (PyObject * obj, unsigned int *val) +{ + unsigned long v; + int res = SWIG_AsVal_unsigned_SS_long (obj, &v); + if (SWIG_IsOK(res)) { + if ((v > UINT_MAX)) { + return SWIG_OverflowError; + } else { + if (val) *val = static_cast< unsigned int >(v); + } + } + return res; +} + +SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromString(absl::string_view arg){ + const auto _status = sentencepiece::SentencePieceTrainer::Train(arg); + if (!_status.ok()) throw _status; + return; + } +SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromMap(std::unordered_map< std::string,std::string > const &args){ + const auto _status = sentencepiece::SentencePieceTrainer::Train(args); + if (!_status.ok()) throw _status; + return; + } +SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromMap2(std::unordered_map< std::string,std::string > const &args,sentencepiece::SentenceIterator *iter){ + const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter); + if (!_status.ok()) throw _status; + return; + } +SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceTrainer__TrainFromMap3(std::unordered_map< std::string,std::string > const &args){ + sentencepiece::util::bytes model_proto; + const auto _status = sentencepiece::SentencePieceTrainer::Train(args, nullptr, &model_proto); + if (!_status.ok()) throw _status; + return model_proto; + } +SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceTrainer__TrainFromMap4(std::unordered_map< std::string,std::string > const &args,sentencepiece::SentenceIterator *iter){ + sentencepiece::util::bytes model_proto; + const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter, &model_proto); + if (!_status.ok()) throw _status; + return model_proto; + } +#ifdef __cplusplus +extern "C" { +#endif +SWIGINTERN PyObject *_wrap_new_ImmutableSentencePieceText_ImmutableSentencePiece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *result = 0 ; + + if (!SWIG_Python_UnpackTuple(args, "new_ImmutableSentencePieceText_ImmutableSentencePiece", 0, 0, 0)) SWIG_fail; + { + try { + result = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *)new sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, SWIG_POINTER_NEW | 0 ); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_delete_ImmutableSentencePieceText_ImmutableSentencePiece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, SWIG_POINTER_DISOWN | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_ImmutableSentencePieceText_ImmutableSentencePiece" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); + { + try { + delete arg1; + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_Py_Void(); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_piece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + std::string *result = 0 ; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_piece" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); + { + try { + result = (std::string *) &((sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *)arg1)->piece(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + { + PyObject *input_type = resultobj; + resultobj = MakePyOutputString(*result, input_type); + } + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_surface(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + std::string *result = 0 ; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_surface" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); + { + try { + result = (std::string *) &((sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *)arg1)->surface(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + { + PyObject *input_type = resultobj; + resultobj = MakePyOutputString(*result, input_type); + } + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + uint32_t result; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_id" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); + { + try { + result = ((sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *)arg1)->id(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_From_unsigned_SS_int(static_cast< unsigned int >(result)); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + uint32_t result; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_begin" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); + { + try { + result = ((sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *)arg1)->begin(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_From_unsigned_SS_int(static_cast< unsigned int >(result)); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + uint32_t result; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_end" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); + { + try { + result = ((sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *)arg1)->end(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_From_unsigned_SS_int(static_cast< unsigned int >(result)); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *ImmutableSentencePieceText_ImmutableSentencePiece_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *obj; + if (!SWIG_Python_UnpackTuple(args, "swigregister", 1, 1, &obj)) return NULL; + SWIG_TypeNewClientData(SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, SWIG_NewClientData(obj)); + return SWIG_Py_Void(); +} + +SWIGINTERN PyObject *ImmutableSentencePieceText_ImmutableSentencePiece_swiginit(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + return SWIG_Python_InitShadowInstance(args); +} + +SWIGINTERN PyObject *_wrap_new_ImmutableSentencePieceText(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *result = 0 ; + + if (!SWIG_Python_UnpackTuple(args, "new_ImmutableSentencePieceText", 0, 0, 0)) SWIG_fail; + { + try { + result = (sentencepiece::ImmutableSentencePieceText *)new sentencepiece::ImmutableSentencePieceText(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_NEW | 0 ); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_delete_ImmutableSentencePieceText(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_DISOWN | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_ImmutableSentencePieceText" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); + { + try { + delete arg1; + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_Py_Void(); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_pieces_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + size_t result; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_pieces_size" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); + { + try { + result = ((sentencepiece::ImmutableSentencePieceText const *)arg1)->pieces_size(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_From_size_t(static_cast< size_t >(result)); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_text(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + std::string *result = 0 ; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_text" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); + { + try { + result = (std::string *) &((sentencepiece::ImmutableSentencePieceText const *)arg1)->text(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + { + PyObject *input_type = resultobj; + resultobj = MakePyOutputString(*result, input_type); + } + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_score(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + float result; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_score" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); + { + try { + result = (float)((sentencepiece::ImmutableSentencePieceText const *)arg1)->score(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_From_float(static_cast< float >(result)); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_SerializeAsString(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + sentencepiece::util::bytes result; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_SerializeAsString" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); + { + try { + result = ((sentencepiece::ImmutableSentencePieceText const *)arg1)->SerializeAsString(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + { + resultobj = MakePyOutputBytes(result); + } + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_pieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + PyObject *swig_obj[2] ; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece result; + + if (!SWIG_Python_UnpackTuple(args, "ImmutableSentencePieceText_pieces", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_pieces" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); + ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableSentencePieceText_pieces" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + { + try { + result = sentencepiece_ImmutableSentencePieceText_pieces((sentencepiece::ImmutableSentencePieceText const *)arg1,arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece(static_cast< const sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, SWIG_POINTER_OWN | 0 ); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *ImmutableSentencePieceText_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *obj; + if (!SWIG_Python_UnpackTuple(args, "swigregister", 1, 1, &obj)) return NULL; + SWIG_TypeNewClientData(SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_NewClientData(obj)); + return SWIG_Py_Void(); +} + +SWIGINTERN PyObject *ImmutableSentencePieceText_swiginit(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + return SWIG_Python_InitShadowInstance(args); +} + +SWIGINTERN PyObject *_wrap_new_ImmutableNBestSentencePieceText(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableNBestSentencePieceText *result = 0 ; + + if (!SWIG_Python_UnpackTuple(args, "new_ImmutableNBestSentencePieceText", 0, 0, 0)) SWIG_fail; + { + try { + result = (sentencepiece::ImmutableNBestSentencePieceText *)new sentencepiece::ImmutableNBestSentencePieceText(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, SWIG_POINTER_NEW | 0 ); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_delete_ImmutableNBestSentencePieceText(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, SWIG_POINTER_DISOWN | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_ImmutableNBestSentencePieceText" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); + { + try { + delete arg1; + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_Py_Void(); + return resultobj; +fail: + return NULL; } -SWIGINTERN int -SWIG_AsVal_unsigned_SS_int (PyObject * obj, unsigned int *val) -{ - unsigned long v; - int res = SWIG_AsVal_unsigned_SS_long (obj, &v); - if (SWIG_IsOK(res)) { - if ((v > UINT_MAX)) { - return SWIG_OverflowError; - } else { - if (val) *val = static_cast< unsigned int >(v); +SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_nbests_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + size_t result; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText_nbests_size" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); + { + try { + result = ((sentencepiece::ImmutableNBestSentencePieceText const *)arg1)->nbests_size(); + ReleaseResultObject(resultobj); } - } - return res; + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_From_size_t(static_cast< size_t >(result)); + return resultobj; +fail: + return NULL; } -SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromString(absl::string_view arg){ - const auto _status = sentencepiece::SentencePieceTrainer::Train(arg); - if (!_status.ok()) throw _status; - return; + +SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_SerializeAsString(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[1] ; + sentencepiece::util::bytes result; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText_SerializeAsString" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); } -SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromMap(std::unordered_map< std::string,std::string > const &args){ - const auto _status = sentencepiece::SentencePieceTrainer::Train(args); - if (!_status.ok()) throw _status; - return; + arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); + { + try { + result = ((sentencepiece::ImmutableNBestSentencePieceText const *)arg1)->SerializeAsString(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } } -SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromMap2(std::unordered_map< std::string,std::string > const &args,sentencepiece::SentenceIterator *iter){ - const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter); - if (!_status.ok()) throw _status; - return; + { + resultobj = MakePyOutputBytes(result); } -SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceTrainer__TrainFromMap3(std::unordered_map< std::string,std::string > const &args){ - sentencepiece::util::bytes model_proto; - const auto _status = sentencepiece::SentencePieceTrainer::Train(args, nullptr, &model_proto); - if (!_status.ok()) throw _status; - return model_proto; + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_nbests(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + PyObject *swig_obj[2] ; + sentencepiece::ImmutableSentencePieceText result; + + if (!SWIG_Python_UnpackTuple(args, "ImmutableNBestSentencePieceText_nbests", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText_nbests" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); } -SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceTrainer__TrainFromMap4(std::unordered_map< std::string,std::string > const &args,sentencepiece::SentenceIterator *iter){ - sentencepiece::util::bytes model_proto; - const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter, &model_proto); - if (!_status.ok()) throw _status; - return model_proto; + arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); + ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableNBestSentencePieceText_nbests" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + { + try { + result = sentencepiece_ImmutableNBestSentencePieceText_nbests((sentencepiece::ImmutableNBestSentencePieceText const *)arg1,arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } } -#ifdef __cplusplus -extern "C" { -#endif + resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText(static_cast< const sentencepiece::ImmutableSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0 ); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *ImmutableNBestSentencePieceText_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *obj; + if (!SWIG_Python_UnpackTuple(args, "swigregister", 1, 1, &obj)) return NULL; + SWIG_TypeNewClientData(SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, SWIG_NewClientData(obj)); + return SWIG_Py_Void(); +} + +SWIGINTERN PyObject *ImmutableNBestSentencePieceText_swiginit(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + return SWIG_Python_InitShadowInstance(args); +} + SWIGINTERN PyObject *_wrap_new_SentencePieceProcessor(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *result = 0 ; @@ -3992,165 +4727,16 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy__SWIG_0(PyObj float *arg4 = (float *) 0 ; void *argp1 = 0 ; int res1 = 0 ; - float val3 ; - int ecode3 = 0 ; - void *argp4 = 0 ; - int res4 = 0 ; - sentencepiece::util::Status result; - - if ((nobjs < 4) || (nobjs > 4)) SWIG_fail; - res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); - } - arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = ustring.str(); - } - ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "3"" of type '" "float""'"); - } - arg3 = static_cast< float >(val3); - res4 = SWIG_ConvertPtr(swig_obj[3], &argp4,SWIGTYPE_p_float, 0 | 0 ); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "4"" of type '" "float *""'"); - } - arg4 = reinterpret_cast< float * >(argp4); - { - try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->CalculateEntropy(arg2,arg3,arg4); - ReleaseResultObject(resultobj); - } - catch (const sentencepiece::util::Status &status) { - SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); - } - } - { - if (!(&result)->ok()) { - SWIG_exception(ToSwigError((&result)->code()), (&result)->ToString().c_str()); - } - resultobj = SWIG_From_bool((&result)->ok()); - } - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; - int arg3 ; - float arg4 ; - bool arg5 ; - bool arg6 ; - void *argp1 = 0 ; - int res1 = 0 ; - int val3 ; - int ecode3 = 0 ; - float val4 ; - int ecode4 = 0 ; - bool val5 ; - int ecode5 = 0 ; - bool val6 ; - int ecode6 = 0 ; - PyObject *swig_obj[6] ; - std::vector< std::pair< std::vector< std::string >,float > > result; - - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsPieces", 6, 6, swig_obj)) SWIG_fail; - res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); - } - arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = ustring.str(); - } - ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "3"" of type '" "int""'"); - } - arg3 = static_cast< int >(val3); - ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "4"" of type '" "float""'"); - } - arg4 = static_cast< float >(val4); - ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "5"" of type '" "bool""'"); - } - arg5 = static_cast< bool >(val5); - ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "6"" of type '" "bool""'"); - } - arg6 = static_cast< bool >(val6); - { - try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsPieces(arg2,arg3,arg4,arg5,arg6); - ReleaseResultObject(resultobj); - } - catch (const sentencepiece::util::Status &status) { - SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); - } - } - { - PyObject *input_type = resultobj; - resultobj = PyList_New((&result)->size()); - for (size_t i = 0; i < (&result)->size(); ++i) { - PyObject *obj = PyList_New(result[i].first.size()); - for (size_t j = 0; j < result[i].first.size(); ++j) { - PyList_SET_ITEM(obj, j, MakePyOutputString(result[i].first[j], input_type)); - } - PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); - } - } - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; - int arg3 ; - float arg4 ; - bool arg5 ; - bool arg6 ; - void *argp1 = 0 ; - int res1 = 0 ; - int val3 ; + float val3 ; int ecode3 = 0 ; - float val4 ; - int ecode4 = 0 ; - bool val5 ; - int ecode5 = 0 ; - bool val6 ; - int ecode6 = 0 ; - PyObject *swig_obj[6] ; - std::vector< std::pair< std::vector< int >,float > > result; + void *argp4 = 0 ; + int res4 = 0 ; + sentencepiece::util::Status result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsIds", 6, 6, swig_obj)) SWIG_fail; + if ((nobjs < 4) || (nobjs > 4)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -4162,29 +4748,19 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyOb resultobj = ustring.input_type(); arg2 = ustring.str(); } - ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "3"" of type '" "int""'"); - } - arg3 = static_cast< int >(val3); - ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "4"" of type '" "float""'"); - } - arg4 = static_cast< float >(val4); - ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "5"" of type '" "bool""'"); - } - arg5 = static_cast< bool >(val5); - ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "6"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "3"" of type '" "float""'"); } - arg6 = static_cast< bool >(val6); + arg3 = static_cast< float >(val3); + res4 = SWIG_ConvertPtr(swig_obj[3], &argp4,SWIGTYPE_p_float, 0 | 0 ); + if (!SWIG_IsOK(res4)) { + SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "4"" of type '" "float *""'"); + } + arg4 = reinterpret_cast< float * >(argp4); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsIds(arg2,arg3,arg4,arg5,arg6); + result = ((sentencepiece::SentencePieceProcessor const *)arg1)->CalculateEntropy(arg2,arg3,arg4); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -4192,14 +4768,10 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyOb } } { - resultobj = PyList_New((&result)->size()); - for (size_t i = 0; i < (&result)->size(); ++i) { - PyObject *obj = PyList_New(result[i].first.size()); - for (size_t j = 0; j < result[i].first.size(); ++j) { - PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); - } - PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); + if (!(&result)->ok()) { + SWIG_exception(ToSwigError((&result)->code()), (&result)->ToString().c_str()); } + resultobj = SWIG_From_bool((&result)->ok()); } return resultobj; fail: @@ -5112,15 +5684,242 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProto(PyObj } } { - resultobj = MakePyOutputBytes(result); + resultobj = MakePyOutputBytes(result); + } + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsImmutableProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; + bool arg3 ; + int arg4 ; + float arg5 ; + bool arg6 ; + bool arg7 ; + bool arg8 ; + bool arg9 ; + void *argp1 = 0 ; + int res1 = 0 ; + bool val3 ; + int ecode3 = 0 ; + int val4 ; + int ecode4 = 0 ; + float val5 ; + int ecode5 = 0 ; + bool val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; + bool val8 ; + int ecode8 = 0 ; + bool val9 ; + int ecode9 = 0 ; + PyObject *swig_obj[9] ; + sentencepiece::ImmutableSentencePieceText result; + + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsImmutableProto", 9, 9, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + const PyInputString ustring(swig_obj[1]); + if (!ustring.IsAvalable()) { + PyErr_SetString(PyExc_TypeError, "not a string"); + SWIG_fail; + } + resultobj = ustring.input_type(); + arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "3"" of type '" "bool""'"); + } + arg3 = static_cast< bool >(val3); + ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "4"" of type '" "int""'"); + } + arg4 = static_cast< int >(val4); + ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "5"" of type '" "float""'"); + } + arg5 = static_cast< float >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); + { + try { + result = sentencepiece_SentencePieceProcessor__EncodeAsImmutableProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText(static_cast< const sentencepiece::ImmutableSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0 ); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< absl::string_view > *arg2 = 0 ; + int arg3 ; + bool arg4 ; + int arg5 ; + float arg6 ; + bool arg7 ; + bool arg8 ; + bool arg9 ; + bool arg10 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val3 ; + int ecode3 = 0 ; + bool val4 ; + int ecode4 = 0 ; + int val5 ; + int ecode5 = 0 ; + float val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; + bool val8 ; + int ecode8 = 0 ; + bool val9 ; + int ecode9 = 0 ; + bool val10 ; + int ecode10 = 0 ; + PyObject *swig_obj[10] ; + std::vector< std::vector< int > > result; + + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsIdsBatch", 10, 10, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { + (*out)[i] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + arg2 = out; + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "4"" of type '" "bool""'"); + } + arg4 = static_cast< bool >(val4); + ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "5"" of type '" "int""'"); + } + arg5 = static_cast< int >(val5); + ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "6"" of type '" "float""'"); + } + arg6 = static_cast< float >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); + ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); + if (!SWIG_IsOK(ecode10)) { + SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "10"" of type '" "bool""'"); + } + arg10 = static_cast< bool >(val10); + { + try { + result = sentencepiece_SentencePieceProcessor__EncodeAsIdsBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { + PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i][j]))); + } + PyList_SET_ITEM(resultobj, i, obj); + } + } + { + delete arg2; } return resultobj; fail: + { + delete arg2; + } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; std::vector< absl::string_view > *arg2 = 0 ; @@ -5151,12 +5950,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SW bool val10 ; int ecode10 = 0 ; PyObject *swig_obj[10] ; - std::vector< std::vector< int > > result; + std::vector< std::vector< std::string > > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsIdsBatch", 10, 10, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsPiecesBatch", 10, 10, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -5182,47 +5981,47 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SW } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "3"" of type '" "int""'"); + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "3"" of type '" "int""'"); } arg3 = static_cast< int >(val3); ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "4"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "4"" of type '" "bool""'"); } arg4 = static_cast< bool >(val4); ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "5"" of type '" "int""'"); + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "5"" of type '" "int""'"); } arg5 = static_cast< int >(val5); ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "6"" of type '" "float""'"); + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "6"" of type '" "float""'"); } arg6 = static_cast< float >(val6); ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "7"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "7"" of type '" "bool""'"); } arg7 = static_cast< bool >(val7); ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); if (!SWIG_IsOK(ecode8)) { - SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "8"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "8"" of type '" "bool""'"); } arg8 = static_cast< bool >(val8); ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); if (!SWIG_IsOK(ecode9)) { - SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "9"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "9"" of type '" "bool""'"); } arg9 = static_cast< bool >(val9); ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); if (!SWIG_IsOK(ecode10)) { - SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "10"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "10"" of type '" "bool""'"); } arg10 = static_cast< bool >(val10); { try { - result = sentencepiece_SentencePieceProcessor__EncodeAsIdsBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + result = sentencepiece_SentencePieceProcessor__EncodeAsPiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5230,11 +6029,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SW } } { + PyObject *input_type = resultobj; resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { PyObject *obj = PyList_New(result[i].size()); for (size_t j = 0; j < result[i].size(); ++j) { - PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i][j]))); + PyList_SET_ITEM(obj, j, MakePyOutputString(result[i][j], input_type)); } PyList_SET_ITEM(resultobj, i, obj); } @@ -5251,7 +6051,7 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; std::vector< absl::string_view > *arg2 = 0 ; @@ -5282,12 +6082,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject bool val10 ; int ecode10 = 0 ; PyObject *swig_obj[10] ; - std::vector< std::vector< std::string > > result; + BytesArray result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsPiecesBatch", 10, 10, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsSerializedProtoBatch", 10, 10, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -5313,47 +6113,47 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "3"" of type '" "int""'"); + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "3"" of type '" "int""'"); } arg3 = static_cast< int >(val3); ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "4"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "4"" of type '" "bool""'"); } arg4 = static_cast< bool >(val4); ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "5"" of type '" "int""'"); + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "5"" of type '" "int""'"); } arg5 = static_cast< int >(val5); ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "6"" of type '" "float""'"); + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "6"" of type '" "float""'"); } arg6 = static_cast< float >(val6); ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "7"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "7"" of type '" "bool""'"); } arg7 = static_cast< bool >(val7); ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); if (!SWIG_IsOK(ecode8)) { - SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "8"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "8"" of type '" "bool""'"); } arg8 = static_cast< bool >(val8); ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); if (!SWIG_IsOK(ecode9)) { - SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "9"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "9"" of type '" "bool""'"); } arg9 = static_cast< bool >(val9); ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); if (!SWIG_IsOK(ecode10)) { - SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "10"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "10"" of type '" "bool""'"); } arg10 = static_cast< bool >(val10); { try { - result = sentencepiece_SentencePieceProcessor__EncodeAsPiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + result = sentencepiece_SentencePieceProcessor__EncodeAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5361,14 +6161,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject } } { - PyObject *input_type = resultobj; resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyObject *obj = PyList_New(result[i].size()); - for (size_t j = 0; j < result[i].size(); ++j) { - PyList_SET_ITEM(obj, j, MakePyOutputString(result[i][j], input_type)); - } - PyList_SET_ITEM(resultobj, i, obj); + PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); } } { @@ -5383,7 +6178,7 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsImmutableProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; std::vector< absl::string_view > *arg2 = 0 ; @@ -5414,12 +6209,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch( bool val10 ; int ecode10 = 0 ; PyObject *swig_obj[10] ; - BytesArray result; + SwigValueWrapper< std::vector< sentencepiece::ImmutableSentencePieceText > > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsSerializedProtoBatch", 10, 10, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsImmutableProtoBatch", 10, 10, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -5445,47 +6240,47 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch( } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "3"" of type '" "int""'"); + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "3"" of type '" "int""'"); } arg3 = static_cast< int >(val3); ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "4"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "4"" of type '" "bool""'"); } arg4 = static_cast< bool >(val4); ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "5"" of type '" "int""'"); + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "5"" of type '" "int""'"); } arg5 = static_cast< int >(val5); ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "6"" of type '" "float""'"); + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "6"" of type '" "float""'"); } arg6 = static_cast< float >(val6); ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "7"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "7"" of type '" "bool""'"); } arg7 = static_cast< bool >(val7); ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); if (!SWIG_IsOK(ecode8)) { - SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "8"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "8"" of type '" "bool""'"); } arg8 = static_cast< bool >(val8); ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); if (!SWIG_IsOK(ecode9)) { - SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "9"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "9"" of type '" "bool""'"); } arg9 = static_cast< bool >(val9); ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); if (!SWIG_IsOK(ecode10)) { - SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "10"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "10"" of type '" "bool""'"); } arg10 = static_cast< bool >(val10); { try { - result = sentencepiece_SentencePieceProcessor__EncodeAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + result = sentencepiece_SentencePieceProcessor__EncodeAsImmutableProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5495,7 +6290,8 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch( { resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); + PyObject *obj = SWIG_NewPointerObj(new sentencepiece::ImmutableSentencePieceText((&result)->at(i)), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0); + PyList_SET_ITEM(resultobj, i, obj); } } { @@ -5750,6 +6546,121 @@ fail: } +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsAsImmutableProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< int > *arg2 = 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[2] ; + sentencepiece::ImmutableSentencePieceText result; + + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodeIdsAsImmutableProto", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodeIdsAsImmutableProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem(swig_obj[1], i); + if (PyInt_Check(o)) { + (*out)[i] = static_cast(PyInt_AsLong(o)); + } else { + PyErr_SetString(PyExc_TypeError,"list must contain integers"); + SWIG_fail; + } + } + } else { + PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } + arg2 = out; + } + { + try { + result = sentencepiece_SentencePieceProcessor__DecodeIdsAsImmutableProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< int > const &)*arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText(static_cast< const sentencepiece::ImmutableSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0 ); + { + delete arg2; + } + return resultobj; +fail: + { + delete arg2; + } + return NULL; +} + + +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsImmutableProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< absl::string_view > *arg2 = 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[2] ; + sentencepiece::ImmutableSentencePieceText result; + + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePiecesAsImmutableProto", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePiecesAsImmutableProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { + (*out)[i] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + arg2 = out; + } + { + try { + result = sentencepiece_SentencePieceProcessor__DecodePiecesAsImmutableProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText(static_cast< const sentencepiece::ImmutableSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0 ); + { + delete arg2; + } + return resultobj; +fail: + { + delete arg2; + } + return NULL; +} + + SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; @@ -6043,7 +6954,82 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto arg3 = static_cast< int >(val3); { try { - result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< absl::string_view > > const &)*arg2,arg3); + result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< absl::string_view > > const &)*arg2,arg3); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); + } + } + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< std::vector< absl::string_view > > *arg2 = 0 ; + int arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val3 ; + int ecode3 = 0 ; + PyObject *swig_obj[3] ; + SwigValueWrapper< std::vector< sentencepiece::ImmutableSentencePieceText > > result; + + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch", 3, 3, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + std::vector> *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector>(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem(swig_obj[1], i); + if (PyList_Check(o)) { + const size_t size2 = PyList_Size(o); + (*out)[i].resize(size2); + for (size_t j = 0; j < size2; ++j) { + const PyInputString ustring(PyList_GetItem(o, j)); + if (ustring.IsAvalable()) { + (*out)[i][j] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError,"list must contain integers"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } + } + } else { + PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } + arg2 = out; + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + { + try { + result = sentencepiece_SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< absl::string_view > > const &)*arg2,arg3); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -6053,7 +7039,8 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto { resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); + PyObject *obj = SWIG_NewPointerObj(new sentencepiece::ImmutableSentencePieceText((&result)->at(i)), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0); + PyList_SET_ITEM(resultobj, i, obj); } } return resultobj; @@ -6323,6 +7310,86 @@ fail: } +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsImmutableProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; + int arg3 ; + bool arg4 ; + bool arg5 ; + bool arg6 ; + bool arg7 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val3 ; + int ecode3 = 0 ; + bool val4 ; + int ecode4 = 0 ; + bool val5 ; + int ecode5 = 0 ; + bool val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; + PyObject *swig_obj[7] ; + sentencepiece::ImmutableNBestSentencePieceText result; + + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsImmutableProto", 7, 7, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + const PyInputString ustring(swig_obj[1]); + if (!ustring.IsAvalable()) { + PyErr_SetString(PyExc_TypeError, "not a string"); + SWIG_fail; + } + resultobj = ustring.input_type(); + arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "4"" of type '" "bool""'"); + } + arg4 = static_cast< bool >(val4); + ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "5"" of type '" "bool""'"); + } + arg5 = static_cast< bool >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + { + try { + result = sentencepiece_SentencePieceProcessor__NBestEncodeAsImmutableProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableNBestSentencePieceText(static_cast< const sentencepiece::ImmutableNBestSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, SWIG_POINTER_OWN | 0 ); + return resultobj; +fail: + return NULL; +} + + SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; @@ -6550,6 +7617,216 @@ fail: } +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; + int arg3 ; + float arg4 ; + bool arg5 ; + bool arg6 ; + bool arg7 ; + bool arg8 ; + bool arg9 ; + bool arg10 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val3 ; + int ecode3 = 0 ; + float val4 ; + int ecode4 = 0 ; + bool val5 ; + int ecode5 = 0 ; + bool val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; + bool val8 ; + int ecode8 = 0 ; + bool val9 ; + int ecode9 = 0 ; + bool val10 ; + int ecode10 = 0 ; + PyObject *swig_obj[10] ; + sentencepiece::util::bytes result; + + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto", 10, 10, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + const PyInputString ustring(swig_obj[1]); + if (!ustring.IsAvalable()) { + PyErr_SetString(PyExc_TypeError, "not a string"); + SWIG_fail; + } + resultobj = ustring.input_type(); + arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "4"" of type '" "float""'"); + } + arg4 = static_cast< float >(val4); + ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "5"" of type '" "bool""'"); + } + arg5 = static_cast< bool >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); + ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); + if (!SWIG_IsOK(ecode10)) { + SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "10"" of type '" "bool""'"); + } + arg10 = static_cast< bool >(val10); + { + try { + result = sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + { + resultobj = MakePyOutputBytes(result); + } + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; + int arg3 ; + float arg4 ; + bool arg5 ; + bool arg6 ; + bool arg7 ; + bool arg8 ; + bool arg9 ; + bool arg10 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val3 ; + int ecode3 = 0 ; + float val4 ; + int ecode4 = 0 ; + bool val5 ; + int ecode5 = 0 ; + bool val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; + bool val8 ; + int ecode8 = 0 ; + bool val9 ; + int ecode9 = 0 ; + bool val10 ; + int ecode10 = 0 ; + PyObject *swig_obj[10] ; + sentencepiece::ImmutableNBestSentencePieceText result; + + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto", 10, 10, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + const PyInputString ustring(swig_obj[1]); + if (!ustring.IsAvalable()) { + PyErr_SetString(PyExc_TypeError, "not a string"); + SWIG_fail; + } + resultobj = ustring.input_type(); + arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "4"" of type '" "float""'"); + } + arg4 = static_cast< float >(val4); + ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "5"" of type '" "bool""'"); + } + arg5 = static_cast< bool >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); + ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); + if (!SWIG_IsOK(ecode10)) { + SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "10"" of type '" "bool""'"); + } + arg10 = static_cast< bool >(val10); + { + try { + result = sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableNBestSentencePieceText(static_cast< const sentencepiece::ImmutableNBestSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, SWIG_POINTER_OWN | 0 ); + return resultobj; +fail: + return NULL; +} + + SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropy(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; @@ -7009,6 +8286,31 @@ SWIGINTERN PyObject *SentencePieceTrainer_swigregister(PyObject *SWIGUNUSEDPARM( static PyMethodDef SwigMethods[] = { { "SWIG_PyInstanceMethod_New", SWIG_PyInstanceMethod_New, METH_O, NULL}, + { "new_ImmutableSentencePieceText_ImmutableSentencePiece", _wrap_new_ImmutableSentencePieceText_ImmutableSentencePiece, METH_NOARGS, NULL}, + { "delete_ImmutableSentencePieceText_ImmutableSentencePiece", _wrap_delete_ImmutableSentencePieceText_ImmutableSentencePiece, METH_O, NULL}, + { "ImmutableSentencePieceText_ImmutableSentencePiece_piece", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_piece, METH_O, NULL}, + { "ImmutableSentencePieceText_ImmutableSentencePiece_surface", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_surface, METH_O, NULL}, + { "ImmutableSentencePieceText_ImmutableSentencePiece_id", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_id, METH_O, NULL}, + { "ImmutableSentencePieceText_ImmutableSentencePiece_begin", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_begin, METH_O, NULL}, + { "ImmutableSentencePieceText_ImmutableSentencePiece_end", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_end, METH_O, NULL}, + { "ImmutableSentencePieceText_ImmutableSentencePiece_swigregister", ImmutableSentencePieceText_ImmutableSentencePiece_swigregister, METH_O, NULL}, + { "ImmutableSentencePieceText_ImmutableSentencePiece_swiginit", ImmutableSentencePieceText_ImmutableSentencePiece_swiginit, METH_VARARGS, NULL}, + { "new_ImmutableSentencePieceText", _wrap_new_ImmutableSentencePieceText, METH_NOARGS, NULL}, + { "delete_ImmutableSentencePieceText", _wrap_delete_ImmutableSentencePieceText, METH_O, NULL}, + { "ImmutableSentencePieceText_pieces_size", _wrap_ImmutableSentencePieceText_pieces_size, METH_O, NULL}, + { "ImmutableSentencePieceText_text", _wrap_ImmutableSentencePieceText_text, METH_O, NULL}, + { "ImmutableSentencePieceText_score", _wrap_ImmutableSentencePieceText_score, METH_O, NULL}, + { "ImmutableSentencePieceText_SerializeAsString", _wrap_ImmutableSentencePieceText_SerializeAsString, METH_O, NULL}, + { "ImmutableSentencePieceText_pieces", _wrap_ImmutableSentencePieceText_pieces, METH_VARARGS, NULL}, + { "ImmutableSentencePieceText_swigregister", ImmutableSentencePieceText_swigregister, METH_O, NULL}, + { "ImmutableSentencePieceText_swiginit", ImmutableSentencePieceText_swiginit, METH_VARARGS, NULL}, + { "new_ImmutableNBestSentencePieceText", _wrap_new_ImmutableNBestSentencePieceText, METH_NOARGS, NULL}, + { "delete_ImmutableNBestSentencePieceText", _wrap_delete_ImmutableNBestSentencePieceText, METH_O, NULL}, + { "ImmutableNBestSentencePieceText_nbests_size", _wrap_ImmutableNBestSentencePieceText_nbests_size, METH_O, NULL}, + { "ImmutableNBestSentencePieceText_SerializeAsString", _wrap_ImmutableNBestSentencePieceText_SerializeAsString, METH_O, NULL}, + { "ImmutableNBestSentencePieceText_nbests", _wrap_ImmutableNBestSentencePieceText_nbests, METH_VARARGS, NULL}, + { "ImmutableNBestSentencePieceText_swigregister", ImmutableNBestSentencePieceText_swigregister, METH_O, NULL}, + { "ImmutableNBestSentencePieceText_swiginit", ImmutableNBestSentencePieceText_swiginit, METH_VARARGS, NULL}, { "new_SentencePieceProcessor", _wrap_new_SentencePieceProcessor, METH_NOARGS, NULL}, { "delete_SentencePieceProcessor", _wrap_delete_SentencePieceProcessor, METH_O, NULL}, { "SentencePieceProcessor_LoadFromSerializedProto", _wrap_SentencePieceProcessor_LoadFromSerializedProto, METH_VARARGS, NULL}, @@ -7017,8 +8319,6 @@ static PyMethodDef SwigMethods[] = { { "SentencePieceProcessor_SetVocabulary", _wrap_SentencePieceProcessor_SetVocabulary, METH_VARARGS, NULL}, { "SentencePieceProcessor_ResetVocabulary", _wrap_SentencePieceProcessor_ResetVocabulary, METH_O, NULL}, { "SentencePieceProcessor_LoadVocabulary", _wrap_SentencePieceProcessor_LoadVocabulary, METH_VARARGS, NULL}, - { "SentencePieceProcessor_SampleEncodeAndScoreAsPieces", _wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces, METH_VARARGS, NULL}, - { "SentencePieceProcessor_SampleEncodeAndScoreAsIds", _wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds, METH_VARARGS, NULL}, { "SentencePieceProcessor_CalculateEntropy", _wrap_SentencePieceProcessor_CalculateEntropy, METH_VARARGS, NULL}, { "SentencePieceProcessor_GetPieceSize", _wrap_SentencePieceProcessor_GetPieceSize, METH_O, NULL}, { "SentencePieceProcessor_PieceToId", _wrap_SentencePieceProcessor_PieceToId, METH_VARARGS, NULL}, @@ -7037,22 +8337,30 @@ static PyMethodDef SwigMethods[] = { { "SentencePieceProcessor__EncodeAsIds", _wrap_SentencePieceProcessor__EncodeAsIds, METH_VARARGS, NULL}, { "SentencePieceProcessor__EncodeAsPieces", _wrap_SentencePieceProcessor__EncodeAsPieces, METH_VARARGS, NULL}, { "SentencePieceProcessor__EncodeAsSerializedProto", _wrap_SentencePieceProcessor__EncodeAsSerializedProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsImmutableProto", _wrap_SentencePieceProcessor__EncodeAsImmutableProto, METH_VARARGS, NULL}, { "SentencePieceProcessor__EncodeAsIdsBatch", _wrap_SentencePieceProcessor__EncodeAsIdsBatch, METH_VARARGS, NULL}, { "SentencePieceProcessor__EncodeAsPiecesBatch", _wrap_SentencePieceProcessor__EncodeAsPiecesBatch, METH_VARARGS, NULL}, { "SentencePieceProcessor__EncodeAsSerializedProtoBatch", _wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsImmutableProtoBatch", _wrap_SentencePieceProcessor__EncodeAsImmutableProtoBatch, METH_VARARGS, NULL}, { "SentencePieceProcessor__DecodeIds", _wrap_SentencePieceProcessor__DecodeIds, METH_VARARGS, NULL}, { "SentencePieceProcessor__DecodePieces", _wrap_SentencePieceProcessor__DecodePieces, METH_VARARGS, NULL}, { "SentencePieceProcessor__DecodeIdsAsSerializedProto", _wrap_SentencePieceProcessor__DecodeIdsAsSerializedProto, METH_VARARGS, NULL}, { "SentencePieceProcessor__DecodePiecesAsSerializedProto", _wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodeIdsAsImmutableProto", _wrap_SentencePieceProcessor__DecodeIdsAsImmutableProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePiecesAsImmutableProto", _wrap_SentencePieceProcessor__DecodePiecesAsImmutableProto, METH_VARARGS, NULL}, { "SentencePieceProcessor__DecodeIdsBatch", _wrap_SentencePieceProcessor__DecodeIdsBatch, METH_VARARGS, NULL}, { "SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch", _wrap_SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch, METH_VARARGS, NULL}, { "SentencePieceProcessor__DecodePiecesBatch", _wrap_SentencePieceProcessor__DecodePiecesBatch, METH_VARARGS, NULL}, { "SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch", _wrap_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch", _wrap_SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch, METH_VARARGS, NULL}, { "SentencePieceProcessor__NBestEncodeAsIds", _wrap_SentencePieceProcessor__NBestEncodeAsIds, METH_VARARGS, NULL}, { "SentencePieceProcessor__NBestEncodeAsPieces", _wrap_SentencePieceProcessor__NBestEncodeAsPieces, METH_VARARGS, NULL}, { "SentencePieceProcessor__NBestEncodeAsSerializedProto", _wrap_SentencePieceProcessor__NBestEncodeAsSerializedProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__NBestEncodeAsImmutableProto", _wrap_SentencePieceProcessor__NBestEncodeAsImmutableProto, METH_VARARGS, NULL}, { "SentencePieceProcessor__SampleEncodeAndScoreAsIds", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds, METH_VARARGS, NULL}, { "SentencePieceProcessor__SampleEncodeAndScoreAsPieces", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces, METH_VARARGS, NULL}, + { "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto, METH_VARARGS, NULL}, { "SentencePieceProcessor__CalculateEntropy", _wrap_SentencePieceProcessor__CalculateEntropy, METH_VARARGS, NULL}, { "SentencePieceProcessor__CalculateEntropyBatch", _wrap_SentencePieceProcessor__CalculateEntropyBatch, METH_VARARGS, NULL}, { "SentencePieceProcessor_swigregister", SentencePieceProcessor_swigregister, METH_O, NULL}, @@ -7076,6 +8384,9 @@ static PyMethodDef SwigMethods_proxydocs[] = { static swig_type_info _swigt__p_char = {"_p_char", "char *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_float = {"_p_float", "float *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_sentencepiece__ImmutableNBestSentencePieceText = {"_p_sentencepiece__ImmutableNBestSentencePieceText", "sentencepiece::ImmutableNBestSentencePieceText *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_sentencepiece__ImmutableSentencePieceText = {"_p_sentencepiece__ImmutableSentencePieceText", "sentencepiece::ImmutableSentencePieceText *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece = {"_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece", "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_sentencepiece__SentenceIterator = {"_p_sentencepiece__SentenceIterator", "sentencepiece::SentenceIterator *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_sentencepiece__SentencePieceProcessor = {"_p_sentencepiece__SentencePieceProcessor", "sentencepiece::SentencePieceProcessor *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_sentencepiece__SentencePieceTrainer = {"_p_sentencepiece__SentencePieceTrainer", "sentencepiece::SentencePieceTrainer *", 0, 0, (void*)0, 0}; @@ -7089,6 +8400,9 @@ static swig_type_info _swigt__p_std__vectorT_std__vectorT_int_t_t = {"_p_std__ve static swig_type_info *swig_type_initial[] = { &_swigt__p_char, &_swigt__p_float, + &_swigt__p_sentencepiece__ImmutableNBestSentencePieceText, + &_swigt__p_sentencepiece__ImmutableSentencePieceText, + &_swigt__p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, &_swigt__p_sentencepiece__SentenceIterator, &_swigt__p_sentencepiece__SentencePieceProcessor, &_swigt__p_sentencepiece__SentencePieceTrainer, @@ -7102,6 +8416,9 @@ static swig_type_info *swig_type_initial[] = { static swig_cast_info _swigc__p_char[] = { {&_swigt__p_char, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_float[] = { {&_swigt__p_float, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_sentencepiece__ImmutableNBestSentencePieceText[] = { {&_swigt__p_sentencepiece__ImmutableNBestSentencePieceText, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_sentencepiece__ImmutableSentencePieceText[] = { {&_swigt__p_sentencepiece__ImmutableSentencePieceText, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece[] = { {&_swigt__p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_sentencepiece__SentenceIterator[] = { {&_swigt__p_sentencepiece__SentenceIterator, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_sentencepiece__SentencePieceProcessor[] = { {&_swigt__p_sentencepiece__SentencePieceProcessor, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_sentencepiece__SentencePieceTrainer[] = { {&_swigt__p_sentencepiece__SentencePieceTrainer, 0, 0, 0},{0, 0, 0, 0}}; @@ -7115,6 +8432,9 @@ static swig_cast_info _swigc__p_std__vectorT_std__vectorT_int_t_t[] = { {&_swig static swig_cast_info *swig_cast_initial[] = { _swigc__p_char, _swigc__p_float, + _swigc__p_sentencepiece__ImmutableNBestSentencePieceText, + _swigc__p_sentencepiece__ImmutableSentencePieceText, + _swigc__p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, _swigc__p_sentencepiece__SentenceIterator, _swigc__p_sentencepiece__SentencePieceProcessor, _swigc__p_sentencepiece__SentencePieceTrainer, diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py index 6c48bcd..2f2c84a 100755 --- a/python/test/sentencepiece_test.py +++ b/python/test/sentencepiece_test.py @@ -287,16 +287,44 @@ class TestSentencepieceProcessor(unittest.TestCase): ids2 = self.sp_.EncodeAsIds(text2) pieces = self.sp_.EncodeAsPieces(text) pieces2 = self.sp_.EncodeAsPieces(text2) - protos = self.sp_.EncodeAsSerializedProto(text) - proto2 = self.sp_.EncodeAsSerializedProto(text2) + sprotos = self.sp_.EncodeAsSerializedProto(text) + sproto2 = self.sp_.EncodeAsSerializedProto(text2) + iprotos = self.sp_.EncodeAsImmutableProto(text) + iprotos2 = self.sp_.EncodeAsImmutableProto(text2) self.assertEqual(sp.encode(text, out_type=int), ids) self.assertEqual(sp.encode(text, out_type=str), pieces) - self.assertEqual(sp.encode(text, out_type='proto'), protos) + self.assertEqual(sp.encode(text, out_type='serialized_proto'), sprotos) + self.assertEqual(sp.encode(text, out_type='immutable_proto'), iprotos) self.assertEqual(sp.encode([text], out_type=int), [ids]) self.assertEqual(sp.encode([text], out_type=str), [pieces]) - self.assertEqual(sp.encode([text], out_type='proto'), [protos]) + self.assertEqual(sp.encode([text], out_type='serialized_proto'), [sprotos]) + self.assertEqual(sp.encode([text], out_type='immutable_proto'), [iprotos]) + + self.assertEqual(len(iprotos), len(pieces)) + self.assertEqual(len(iprotos), len(ids)) + self.assertEqual(iprotos.text(), text) + + self.assertEqual(len(iprotos2), len(pieces2)) + self.assertEqual(len(iprotos2), len(ids2)) + self.assertEqual(iprotos2.text(), text2) + + for i in range(len(iprotos)): + self.assertEqual(ids[i], iprotos.pieces(i).id()) + self.assertEqual(pieces[i], iprotos.pieces(i).piece()) + + for i, piece in enumerate(iprotos): + self.assertEqual(ids[i], piece.id()) + self.assertEqual(pieces[i], piece.piece()) + + for i in range(len(iprotos2)): + self.assertEqual(ids2[i], iprotos2.pieces(i).id()) + self.assertEqual(pieces2[i], iprotos2.pieces(i).piece()) + + for i, piece in enumerate(iprotos2): + self.assertEqual(ids2[i], piece.id()) + self.assertEqual(pieces2[i], piece.piece()) detok_ids = self.sp_.DecodeIds(ids) detok_pieces = self.sp_.DecodePieces(pieces) @@ -464,19 +492,29 @@ class TestSentencepieceProcessor(unittest.TestCase): self.assertEqual(d1, d4) self.assertEqual(d1, d5) - r1 = sp.encode(texts, out_type='proto', num_threads=None) - r2 = sp.encode(texts, out_type='proto', num_threads=1) - r3 = sp.encode(texts, out_type='proto', num_threads=-1) - r4 = sp.encode(texts, out_type='proto', num_threads=8) - r5 = [sp.encode(s, out_type='proto') for s in texts] + r1 = sp.encode(texts, out_type='serialized_proto', num_threads=None) + r2 = sp.encode(texts, out_type='serialized_proto', num_threads=1) + r3 = sp.encode(texts, out_type='serialized_proto', num_threads=-1) + r4 = sp.encode(texts, out_type='serialized_proto', num_threads=8) + r5 = [sp.encode(s, out_type='serialized_proto') for s in texts] + self.assertEqual(r1, r2) + self.assertEqual(r1, r3) + self.assertEqual(r1, r4) + self.assertEqual(r1, r5) + + r1 = sp.encode(texts, out_type='immutable_proto', num_threads=None) + r2 = sp.encode(texts, out_type='immutable_proto', num_threads=1) + r3 = sp.encode(texts, out_type='immutable_proto', num_threads=-1) + r4 = sp.encode(texts, out_type='immutable_proto', num_threads=8) + r5 = [sp.encode(s, out_type='immutable_proto') for s in texts] self.assertEqual(r1, r2) self.assertEqual(r1, r3) self.assertEqual(r1, r4) self.assertEqual(r1, r5) - e1 = sp.calculate_entropy(texts, theta=1.0, num_threads=10) - e2 = sp.CalculateEntropy(texts, theta=1.0, num_threads=10) - e3 = [sp.calculate_entropy(s, theta=1.0) for s in texts] + e1 = sp.calculate_entropy(texts, alpha=1.0, num_threads=10) + e2 = sp.CalculateEntropy(texts, alpha=1.0, num_threads=10) + e3 = [sp.calculate_entropy(s, alpha=1.0) for s in texts] self.assertEqual(e1, e2) self.assertEqual(e1, e3) diff --git a/src/sentencepiece_processor.cc b/src/sentencepiece_processor.cc index 805e0f9..482a45b 100644 --- a/src/sentencepiece_processor.cc +++ b/src/sentencepiece_processor.cc @@ -54,65 +54,70 @@ std::vector ToPieceArray(const std::vector &v) { for (int i = 0; i < v.size(); ++i) out[i] = v[i]; return out; } + } // namespace -ImmutableSentencePieceText::ImmutableSentencePieceText() {} -ImmutableSentencePieceText::~ImmutableSentencePieceText() {} +ImmutableSentencePieceText::ImmutableSentencePieceText() + : spt_(&SentencePieceText::default_instance()) {} ImmutableSentencePieceText::ImmutableSentencePieceText( const SentencePieceText &spt) : spt_(&spt) {} -ImmutableSentencePieceText::ImmutableSentencePiece::ImmutableSentencePiece( - const SentencePieceText_SentencePiece &sp) +ImmutableSentencePieceText::~ImmutableSentencePieceText() {} + +ImmutableSentencePieceText_ImmutableSentencePiece:: + ImmutableSentencePieceText_ImmutableSentencePiece() + : sp_(&SentencePieceText_SentencePiece::default_instance()) {} + +ImmutableSentencePieceText_ImmutableSentencePiece:: + ImmutableSentencePieceText_ImmutableSentencePiece( + const SentencePieceText_SentencePiece &sp) : sp_(&sp) {} -const std::string &ImmutableSentencePieceText::ImmutableSentencePiece::piece() +const std::string &ImmutableSentencePieceText_ImmutableSentencePiece::piece() const { return sp_->piece(); } -const std::string &ImmutableSentencePieceText::ImmutableSentencePiece::surface() +const std::string &ImmutableSentencePieceText_ImmutableSentencePiece::surface() const { return sp_->surface(); } -uint32_t ImmutableSentencePieceText::ImmutableSentencePiece::id() const { +uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::id() const { return sp_->id(); } -uint32_t ImmutableSentencePieceText::ImmutableSentencePiece::begin() const { +uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::begin() const { return sp_->begin(); } -uint32_t ImmutableSentencePieceText::ImmutableSentencePiece::end() const { +uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::end() const { return sp_->end(); } -std::vector +std::vector ImmutableSentencePieceText::pieces() const { - std::vector pieces; - if (spt_ == nullptr) return pieces; - pieces.reserve(spt_->pieces_size()); + std::vector pieces( + spt_->pieces_size()); for (int i = 0; i < spt_->pieces_size(); ++i) - pieces[i] = ImmutableSentencePiece(spt_->pieces(i)); + pieces[i] = + ImmutableSentencePieceText_ImmutableSentencePiece(spt_->pieces(i)); return pieces; } size_t ImmutableSentencePieceText::pieces_size() const { - return spt_ ? spt_->pieces_size() : 0; + return spt_->pieces_size(); } -ImmutableSentencePieceText::ImmutableSentencePiece +ImmutableSentencePieceText_ImmutableSentencePiece ImmutableSentencePieceText::pieces(int index) const { - return ImmutableSentencePieceText::ImmutableSentencePiece( - spt_->pieces(index)); + return ImmutableSentencePieceText_ImmutableSentencePiece(spt_->pieces(index)); } const std::string &ImmutableSentencePieceText::text() const { - if (spt_) return spt_->text(); - static std::string *kEmptyString = new std::string(); - return *kEmptyString; + return spt_->text(); } float ImmutableSentencePieceText::score() const { @@ -127,8 +132,8 @@ SentencePieceText *ImmutableSentencePieceText::mutable_proto() { return rep_.get(); } -std::string ImmutableSentencePieceText::SerializeAsString() const { - return spt_ ? spt_->SerializeAsString() : ""; +util::bytes ImmutableSentencePieceText::SerializeAsString() const { + return spt_->SerializeAsString(); } ImmutableNBestSentencePieceText::ImmutableNBestSentencePieceText() {} @@ -145,9 +150,8 @@ ImmutableSentencePieceText ImmutableNBestSentencePieceText::nbests( std::vector ImmutableNBestSentencePieceText::nbests() const { - std::vector nbests; - if (rep_ == nullptr) return nbests; - nbests.reserve(rep_->nbests_size()); + if (rep_ == nullptr) return {}; + std::vector nbests(rep_->nbests_size()); for (int i = 0; i < rep_->nbests_size(); ++i) nbests[i] = ImmutableSentencePieceText(rep_->nbests(i)); return nbests; @@ -160,7 +164,7 @@ NBestSentencePieceText *ImmutableNBestSentencePieceText::mutable_proto() { return rep_.get(); } -std::string ImmutableNBestSentencePieceText::SerializeAsString() const { +util::bytes ImmutableNBestSentencePieceText::SerializeAsString() const { return rep_ ? rep_->SerializeAsString() : ""; } @@ -1044,8 +1048,35 @@ std::string SentencePieceProcessor::serialized_model_proto() const { // std::random_device. void SetRandomGeneratorSeed(unsigned int seed); -namespace io { +void ConvertToUnicodeSpans(SentencePieceText *spt) { + if (spt == nullptr) return; + + std::vector utf8_to_unicode(spt->text().size() + 1, 0); + absl::string_view str = spt->text(); + size_t prev = 0; + int ulen = 0; + while (!str.empty()) { + const size_t mblen = string_util::OneCharLen(str.data()); + for (int i = prev; i < prev + mblen; ++i) { + utf8_to_unicode[i] = ulen; + } + ++ulen; + prev += mblen; + str.remove_prefix(mblen); + } + utf8_to_unicode[prev] = ulen; + + auto clip = [&](int s) { + return std::min(std::max(0, s), utf8_to_unicode.size() - 1); + }; + for (auto &piece : *(spt->mutable_pieces())) { + piece.set_begin(utf8_to_unicode[clip(piece.begin())]); + piece.set_end(utf8_to_unicode[clip(piece.end())]); + } +} + +namespace io { util::Status LoadModelProto(absl::string_view filename, ModelProto *model_proto) { if (filename.empty()) { diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h index 8124c59..b7fae6a 100644 --- a/src/sentencepiece_processor.h +++ b/src/sentencepiece_processor.h @@ -157,35 +157,39 @@ class SentencePieceText_SentencePiece; // This wrapper only allows an immutable access to the proto and // hides the actual implementation of protobuf. // See sentencepiece.proto for the details of this class. +class ImmutableSentencePieceText_ImmutableSentencePiece { + public: + ImmutableSentencePieceText_ImmutableSentencePiece(); + ~ImmutableSentencePieceText_ImmutableSentencePiece() = default; + + const std::string &piece() const; + const std::string &surface() const; + uint32_t id() const; + uint32_t begin() const; + uint32_t end() const; + + friend class ImmutableSentencePieceText; + + private: + explicit ImmutableSentencePieceText_ImmutableSentencePiece( + const SentencePieceText_SentencePiece &sp); + const SentencePieceText_SentencePiece *sp_ = nullptr; +}; + class ImmutableSentencePieceText { public: ImmutableSentencePieceText(); virtual ~ImmutableSentencePieceText(); - class ImmutableSentencePiece { - public: - ~ImmutableSentencePiece() = default; - const std::string &piece() const; - const std::string &surface() const; - uint32_t id() const; - uint32_t begin() const; - uint32_t end() const; + std::vector pieces() const; - friend class ImmutableSentencePieceText; - - private: - ImmutableSentencePiece() = default; - explicit ImmutableSentencePiece(const SentencePieceText_SentencePiece &sp); - const SentencePieceText_SentencePiece *sp_ = nullptr; - }; - - std::vector pieces() const; size_t pieces_size() const; - ImmutableSentencePiece pieces(int index) const; + ImmutableSentencePieceText_ImmutableSentencePiece pieces(int index) const; + const std::string &text() const; float score() const; - std::string SerializeAsString() const; + util::bytes SerializeAsString() const; // Returns the actual mutable proto. // Do not use this outside of SentencePieceProcessor, as @@ -214,7 +218,7 @@ class ImmutableNBestSentencePieceText { size_t nbests_size() const; ImmutableSentencePieceText nbests(int index) const; - std::string SerializeAsString() const; + util::bytes SerializeAsString() const; // Returns the actual mutable proto. // Do not use this outside of SentencePieceProcessor, as @@ -398,7 +402,7 @@ class SentencePieceProcessor { float alpha, SentencePieceText *spt) const; virtual util::Status SampleEncodeAndScore( - absl::string_view input, int samples, float alpha, bool wor, + absl::string_view input, int num_samples, float alpha, bool wor, bool include_best, NBestSentencePieceText *samples_spt) const; // DEPRECATED: Remove this API and use std::vector @@ -534,11 +538,11 @@ class SentencePieceProcessor { } virtual util::bytes SampleEncodeAndScoreAsSerializedProto( - absl::string_view input, int samples, float alpha, bool wor, - bool include_best, int nbest_size) const { + absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best) const { DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncodeAndScore, ImmutableNBestSentencePieceText, input, - samples, alpha, wor, include_best); + num_samples, alpha, wor, include_best); } // TODO(taku): Remove this API and use std::vector @@ -579,11 +583,11 @@ class SentencePieceProcessor { } virtual ImmutableNBestSentencePieceText SampleEncodeAndScoreAsImmutableProto( - absl::string_view input, int samples, float alpha, bool wor, - bool include_best, int nbest_size) const { + absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best) const { DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncodeAndScore, ImmutableNBestSentencePieceText, input, - samples, alpha, wor, include_best); + num_samples, alpha, wor, include_best); } // TODO(taku): Remove this API and use std::vector @@ -703,6 +707,9 @@ class SentencePieceProcessor { // std::random_device. void SetRandomGeneratorSeed(unsigned int seed); +// Converts the utf8 byte spans into Unicode char span. +void ConvertToUnicodeSpans(SentencePieceText *spt); + #ifndef SWIG // IO related functions to absorb model formats. namespace io { diff --git a/src/sentencepiece_processor_test.cc b/src/sentencepiece_processor_test.cc index ed651f7..ff55aeb 100644 --- a/src/sentencepiece_processor_test.cc +++ b/src/sentencepiece_processor_test.cc @@ -1564,6 +1564,10 @@ TEST(SentencePieceProcessorTest, VocabularyTest) { TEST(SentencePieceProcessorTest, ImmutableSentencePieceTextTest) { ImmutableSentencePieceText spt; + EXPECT_TRUE(spt.text().empty()); + EXPECT_EQ(spt.score(), 0.0); + EXPECT_TRUE(spt.SerializeAsString().empty()); + auto *v = spt.mutable_proto(); v->set_text("hello world"); @@ -1586,52 +1590,123 @@ TEST(SentencePieceProcessorTest, ImmutableSentencePieceTextTest) { EXPECT_EQ(v->pieces(i).end(), spt.pieces(i).end()); } - int n = 0; - for (auto &p : spt.pieces()) { - EXPECT_EQ(v->pieces(n).surface(), p.surface()); - EXPECT_EQ(v->pieces(n).piece(), p.piece()); - EXPECT_EQ(v->pieces(n).id(), p.id()); - EXPECT_EQ(v->pieces(n).begin(), p.begin()); - EXPECT_EQ(v->pieces(n).end(), p.end()); - ++n; - } - - EXPECT_EQ(v->text(), spt.text()); - EXPECT_EQ(v->score(), spt.score()); - EXPECT_EQ(v->SerializeAsString(), spt.SerializeAsString()); + auto check_proto = [&v](const ImmutableSentencePieceText &s) { + int n = 0; + for (auto &p : s.pieces()) { + EXPECT_EQ(v->pieces(n).surface(), p.surface()); + EXPECT_EQ(v->pieces(n).piece(), p.piece()); + EXPECT_EQ(v->pieces(n).id(), p.id()); + EXPECT_EQ(v->pieces(n).begin(), p.begin()); + EXPECT_EQ(v->pieces(n).end(), p.end()); + ++n; + } + EXPECT_EQ(v->text(), s.text()); + EXPECT_EQ(v->score(), s.score()); + EXPECT_EQ(v->SerializeAsString(), s.SerializeAsString()); + }; // test copy. - auto spt2 = spt; - EXPECT_EQ(spt2.pieces_size(), spt.pieces_size()); - for (int i = 0; i < spt.pieces_size(); ++i) { - EXPECT_EQ(spt2.pieces(i).surface(), spt.pieces(i).surface()); - EXPECT_EQ(spt2.pieces(i).piece(), spt.pieces(i).piece()); - EXPECT_EQ(spt2.pieces(i).id(), spt.pieces(i).id()); - EXPECT_EQ(spt2.pieces(i).begin(), spt.pieces(i).begin()); - EXPECT_EQ(spt2.pieces(i).end(), spt.pieces(i).end()); - } + const auto spt2 = spt; + check_proto(spt2); + + // test assign. + const ImmutableSentencePieceText spt3(spt); + check_proto(spt3); + + // default piece. + const ImmutableSentencePieceText_ImmutableSentencePiece piece; + EXPECT_TRUE(piece.surface().empty()); + EXPECT_TRUE(piece.piece().empty()); + EXPECT_EQ(piece.begin(), 0); + EXPECT_EQ(piece.end(), 0); + EXPECT_EQ(piece.id(), 0); } TEST(SentencePieceProcessorTest, ImmutableNBestSentencePieceTextTest) { ImmutableNBestSentencePieceText spt; + EXPECT_EQ(spt.nbests_size(), 0); + EXPECT_TRUE(spt.SerializeAsString().empty()); + auto *v = spt.mutable_proto(); + for (int i = 0; i < 10; ++i) { auto *p = v->add_nbests(); p->set_text(absl::StrCat("text_", i)); p->set_score(2.0 * i); } - EXPECT_EQ(v->nbests_size(), spt.nbests_size()); - for (int i = 0; i < v->nbests_size(); ++i) { - EXPECT_EQ(v->nbests(i).text(), spt.nbests(i).text()); - EXPECT_EQ(v->nbests(i).score(), spt.nbests(i).score()); - } - EXPECT_EQ(v->SerializeAsString(), spt.SerializeAsString()); + auto check_proto = [&v](const ImmutableNBestSentencePieceText &s) { + EXPECT_EQ(v->nbests_size(), s.nbests_size()); + for (int i = 0; i < v->nbests_size(); ++i) { + EXPECT_EQ(v->nbests(i).text(), s.nbests(i).text()); + EXPECT_EQ(v->nbests(i).score(), s.nbests(i).score()); + } + EXPECT_EQ(v->SerializeAsString(), s.SerializeAsString()); + }; + + check_proto(spt); // test copy. - auto spt2 = spt; - EXPECT_EQ(spt2.nbests_size(), spt.nbests_size()); - EXPECT_EQ(spt2.SerializeAsString(), spt.SerializeAsString()); + const auto spt2 = spt; + check_proto(spt2); + + // test assign. + const ImmutableNBestSentencePieceText spt3(spt); + check_proto(spt3); +} + +TEST(SentencePieceProcessorTest, ConvertToUnicodeSpansTest) { + auto make_spt = [&](const std::vector &tokens) { + SentencePieceText spt; + int prev = 0; + std::string text; + for (const auto &tok : tokens) { + auto *piece = spt.add_pieces(); + piece->set_surface(tok); + piece->set_piece(tok); + piece->set_begin(prev); + piece->set_end(prev + tok.size()); + prev += tok.size(); + text += tok; + } + spt.set_text(text); + ConvertToUnicodeSpans(&spt); + return spt; + }; + + { + const auto spt = make_spt({"hello", "_world", "."}); + EXPECT_EQ(spt.pieces_size(), 3); + EXPECT_EQ(spt.pieces(0).begin(), 0); + EXPECT_EQ(spt.pieces(0).end(), 5); + EXPECT_EQ(spt.pieces(1).begin(), 5); + EXPECT_EQ(spt.pieces(1).end(), 11); + EXPECT_EQ(spt.pieces(2).begin(), 11); + EXPECT_EQ(spt.pieces(2).end(), 12); + } + + { + const auto spt = make_spt({"これは", "test", "です"}); + EXPECT_EQ(spt.pieces_size(), 3); + EXPECT_EQ(spt.pieces(0).begin(), 0); + EXPECT_EQ(spt.pieces(0).end(), 3); + EXPECT_EQ(spt.pieces(1).begin(), 3); + EXPECT_EQ(spt.pieces(1).end(), 7); + + EXPECT_EQ(spt.pieces(2).begin(), 7); + EXPECT_EQ(spt.pieces(2).end(), 9); + } + + { + const auto spt = make_spt({"いABは", "にほCD", "へと"}); + EXPECT_EQ(spt.pieces_size(), 3); + EXPECT_EQ(spt.pieces(0).begin(), 0); + EXPECT_EQ(spt.pieces(0).end(), 4); + EXPECT_EQ(spt.pieces(1).begin(), 4); + EXPECT_EQ(spt.pieces(1).end(), 8); + EXPECT_EQ(spt.pieces(2).begin(), 8); + EXPECT_EQ(spt.pieces(2).end(), 10); + } } } // namespace sentencepiece