From: Taku Kudo Date: Tue, 14 Jun 2022 16:29:55 +0000 (+0900) Subject: Uses absl::string_view as much as possible X-Git-Tag: archive/raspbian/0.1.97-3+rpi1^2~20 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=f620ff6921456347070df62336ae25e205c83719;p=sentencepiece.git Uses absl::string_view as much as possible Signed-off-by: Kentaro Hayashi Gbp-Pq: Name 0008-Uses-absl-string_view-as-much-as-possible.patch --- diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py index cba3b70..1543d32 100644 --- a/python/src/sentencepiece/__init__.py +++ b/python/src/sentencepiece/__init__.py @@ -93,8 +93,8 @@ class SentencePieceProcessor(object): def SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, include_best): return _sentencepiece.SentencePieceProcessor_SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, include_best) - def CalculateEntropy(self, text, theta): - return _sentencepiece.SentencePieceProcessor_CalculateEntropy(self, text, theta) + def CalculateEntropy(self, *args): + return _sentencepiece.SentencePieceProcessor_CalculateEntropy(self, *args) def GetPieceSize(self): return _sentencepiece.SentencePieceProcessor_GetPieceSize(self) diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i index 3a822bc..40373ce 100644 --- a/python/src/sentencepiece/sentencepiece.i +++ b/python/src/sentencepiece/sentencepiece.i @@ -37,6 +37,7 @@ class PyInputString { str_ = nullptr; } } + absl::string_view str() const { return absl::string_view(data(), size()); } const char* data() const { return str_; } Py_ssize_t size() const { return size_; } bool IsAvalable() const { return str_ != nullptr; } @@ -179,7 +180,7 @@ inline void CheckIds(const std::vector &ids, int num_pieces) { } } -inline void CheckIds(const std::vector &ids, int num_pieces) {} +inline void CheckIds(const std::vector &ids, int num_pieces) {} class ThreadPool { public: @@ -266,6 +267,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { %ignore sentencepiece::util::Status; %ignore sentencepiece::util::StatusCode; %ignore absl::string_view; +%ignore std::string_view; %ignore sentencepiece::SentencePieceText; %ignore sentencepiece::NormalizerSpec; %ignore sentencepiece::TrainerSpec; @@ -386,7 +388,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { return $self->DecodeIds(ids); } - std::string _DecodePieces(const std::vector &pieces) const { + std::string _DecodePieces(const std::vector &pieces) const { return $self->DecodePieces(pieces); } @@ -397,7 +399,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { } sentencepiece::util::bytes _DecodePiecesAsSerializedProto( - const std::vector &pieces) const { + const std::vector &pieces) const { CheckIds(pieces, $self->GetPieceSize()); return $self->DecodePiecesAsSerializedProto(pieces); } @@ -416,12 +418,12 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { } std::vector _DecodePiecesBatch( - const std::vector> &ins, int num_threads) const { + const std::vector> &ins, int num_threads) const { DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string); } BytesArray _DecodePiecesAsSerializedProtoBatch( - const std::vector> &ins, int num_threads) const { + const std::vector> &ins, int num_threads) const { DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string, sentencepiece::util::bytes); } @@ -1029,14 +1031,14 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { %typemap(out) std::vector { $result = PyList_New($1.size()); for (size_t i = 0; i < $1.size(); ++i) { - PyList_SetItem($result, i, PyInt_FromLong(static_cast($1[i]))); + PyList_SET_ITEM($result, i, PyInt_FromLong(static_cast($1[i]))); } } %typemap(out) std::vector { $result = PyList_New($1.size()); for (size_t i = 0; i < $1.size(); ++i) { - PyList_SetItem($result, i, PyFloat_FromDouble(static_cast($1[i]))); + PyList_SET_ITEM($result, i, PyFloat_FromDouble(static_cast($1[i]))); } } @@ -1045,9 +1047,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { for (size_t i = 0; i < $1.size(); ++i) { PyObject *obj = PyList_New($1[i].size()); for (size_t j = 0; j < $1[i].size(); ++j) { - PyList_SetItem(obj, j, PyInt_FromLong(static_cast($1[i][j]))); + PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast($1[i][j]))); } - PyList_SetItem($result, i, obj); + PyList_SET_ITEM($result, i, obj); } } @@ -1055,14 +1057,14 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { PyObject *input_type = resultobj; $result = PyList_New($1.size()); for (size_t i = 0; i < $1.size(); ++i) { - PyList_SetItem($result, i, MakePyOutputString($1[i], input_type)); + PyList_SET_ITEM($result, i, MakePyOutputString($1[i], input_type)); } } %typemap(out) BytesArray { $result = PyList_New($1.size()); for (size_t i = 0; i < $1.size(); ++i) { - PyList_SetItem($result, i, MakePyOutputBytes($1[i])); + PyList_SET_ITEM($result, i, MakePyOutputBytes($1[i])); } } @@ -1072,9 +1074,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { for (size_t i = 0; i < $1.size(); ++i) { PyObject *obj = PyList_New($1[i].size()); for (size_t j = 0; j < $1[i].size(); ++j) { - PyList_SetItem(obj, j, MakePyOutputString($1[i][j], input_type)); + PyList_SET_ITEM(obj, j, MakePyOutputString($1[i][j], input_type)); } - PyList_SetItem($result, i, obj); + PyList_SET_ITEM($result, i, obj); } } @@ -1118,51 +1120,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { SWIG_fail; } resultobj = ustring.input_type(); - $1 = absl::string_view(ustring.data(), ustring.size()); -} - -%typemap(in) const std::vector& { - std::vector *out = nullptr; - if (PyList_Check($input)) { - const size_t size = PyList_Size($input); - out = new std::vector(size); - for (size_t i = 0; i < size; ++i) { - const PyInputString ustring(PyList_GetItem($input, i)); - if (ustring.IsAvalable()) { - (*out)[i].assign(ustring.data(), ustring.size()); - } else { - PyErr_SetString(PyExc_TypeError, "list must contain strings"); - SWIG_fail; - } - resultobj = ustring.input_type(); - } - } else { - PyErr_SetString(PyExc_TypeError, "not a list"); - SWIG_fail; - } - $1 = out; -} - -%typemap(in) const std::vector& { - std::vector *out = nullptr; - if (PyList_Check($input)) { - const size_t size = PyList_Size($input); - out = new std::vector(size); - for (size_t i = 0; i < size; ++i) { - const PyInputString ustring(PyList_GetItem($input, i)); - if (ustring.IsAvalable()) { - (*out)[i] = absl::string_view(ustring.data(), ustring.size()); - } else { - PyErr_SetString(PyExc_TypeError, "list must contain strings"); - SWIG_fail; - } - resultobj = ustring.input_type(); - } - } else { - PyErr_SetString(PyExc_TypeError, "not a list"); - SWIG_fail; - } - $1 = out; + $1 = ustring.str(); } %typemap(in) const std::vector& { @@ -1173,7 +1131,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { for (size_t i = 0; i < size; ++i) { const PyInputString ustring(PyList_GetItem($input, i)); if (ustring.IsAvalable()) { - (*out)[i] = absl::string_view(ustring.data(), ustring.size()); + (*out)[i] = ustring.str(); } else { PyErr_SetString(PyExc_TypeError, "list must contain strings"); SWIG_fail; @@ -1208,11 +1166,11 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { $1 = out; } -%typemap(in) const std::vector>& { - std::vector> *out = nullptr; +%typemap(in) const std::vector>& { + std::vector> *out = nullptr; if (PyList_Check($input)) { const size_t size = PyList_Size($input); - out = new std::vector>(size); + out = new std::vector>(size); for (size_t i = 0; i < size; ++i) { PyObject *o = PyList_GetItem($input, i); if (PyList_Check(o)) { @@ -1221,7 +1179,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { for (size_t j = 0; j < size2; ++j) { const PyInputString ustring(PyList_GetItem(o, j)); if (ustring.IsAvalable()) { - (*out)[i][j].assign(ustring.data(), ustring.size()); + (*out)[i][j] = ustring.str(); } else { PyErr_SetString(PyExc_TypeError,"list must contain integers"); SWIG_fail; @@ -1302,9 +1260,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { for (size_t i = 0; i < $1.size(); ++i) { PyObject *obj = PyList_New($1[i].first.size()); for (size_t j = 0; j < $1[i].first.size(); ++j) { - PyList_SetItem(obj, j, MakePyOutputString($1[i].first[j], input_type)); + PyList_SET_ITEM(obj, j, MakePyOutputString($1[i].first[j], input_type)); } - PyList_SetItem($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast($1[i].second)))); + PyList_SET_ITEM($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast($1[i].second)))); } } @@ -1313,9 +1271,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { for (size_t i = 0; i < $1.size(); ++i) { PyObject *obj = PyList_New($1[i].first.size()); for (size_t j = 0; j < $1[i].first.size(); ++j) { - PyList_SetItem(obj, j, PyInt_FromLong(static_cast($1[i].first[j]))); + PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast($1[i].first[j]))); } - PyList_SetItem($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast($1[i].second)))); + PyList_SET_ITEM($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast($1[i].second)))); } } diff --git a/python/src/sentencepiece/sentencepiece_wrap.cxx b/python/src/sentencepiece/sentencepiece_wrap.cxx index 6df3880..36ce38c 100644 --- a/python/src/sentencepiece/sentencepiece_wrap.cxx +++ b/python/src/sentencepiece/sentencepiece_wrap.cxx @@ -2693,16 +2693,16 @@ SWIGINTERN PyObject *SWIG_PyStaticMethod_New(PyObject *SWIGUNUSEDPARM(self), PyO /* -------- TYPES TABLE (BEGIN) -------- */ #define SWIGTYPE_p_char swig_types[0] -#define SWIGTYPE_p_sentencepiece__SentenceIterator swig_types[1] -#define SWIGTYPE_p_sentencepiece__SentencePieceProcessor swig_types[2] -#define SWIGTYPE_p_sentencepiece__SentencePieceTrainer swig_types[3] -#define SWIGTYPE_p_std__string swig_types[4] -#define SWIGTYPE_p_std__unordered_mapT_std__string_std__string_t swig_types[5] -#define SWIGTYPE_p_std__vectorT_absl__string_view_t swig_types[6] -#define SWIGTYPE_p_std__vectorT_int_t swig_types[7] -#define SWIGTYPE_p_std__vectorT_std__string_t swig_types[8] -#define SWIGTYPE_p_std__vectorT_std__vectorT_int_t_t swig_types[9] -#define SWIGTYPE_p_std__vectorT_std__vectorT_std__string_t_t swig_types[10] +#define SWIGTYPE_p_float swig_types[1] +#define SWIGTYPE_p_sentencepiece__SentenceIterator swig_types[2] +#define SWIGTYPE_p_sentencepiece__SentencePieceProcessor swig_types[3] +#define SWIGTYPE_p_sentencepiece__SentencePieceTrainer swig_types[4] +#define SWIGTYPE_p_std__string swig_types[5] +#define SWIGTYPE_p_std__unordered_mapT_std__string_std__string_t swig_types[6] +#define SWIGTYPE_p_std__vectorT_absl__string_view_t swig_types[7] +#define SWIGTYPE_p_std__vectorT_int_t swig_types[8] +#define SWIGTYPE_p_std__vectorT_std__vectorT_absl__string_view_t_t swig_types[9] +#define SWIGTYPE_p_std__vectorT_std__vectorT_int_t_t swig_types[10] static swig_type_info *swig_types[12]; static swig_module_info swig_module = {swig_types, 11, 0, 0, 0, 0}; #define SWIG_TypeQuery(name) SWIG_TypeQueryModule(&swig_module, &swig_module, name) @@ -2843,6 +2843,7 @@ class PyInputString { str_ = nullptr; } } + absl::string_view str() const { return absl::string_view(data(), size()); } const char* data() const { return str_; } Py_ssize_t size() const { return size_; } bool IsAvalable() const { return str_ != nullptr; } @@ -2985,7 +2986,7 @@ inline void CheckIds(const std::vector &ids, int num_pieces) { } } -inline void CheckIds(const std::vector &ids, int num_pieces) {} +inline void CheckIds(const std::vector &ids, int num_pieces) {} class ThreadPool { public: @@ -3473,14 +3474,14 @@ SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodeIds(sentencep CheckIds(ids, self->GetPieceSize()); return self->DecodeIds(ids); } -SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodePieces(sentencepiece::SentencePieceProcessor const *self,std::vector< std::string > const &pieces){ +SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodePieces(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &pieces){ return self->DecodePieces(pieces); } SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__DecodeIdsAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ CheckIds(ids, self->GetPieceSize()); return self->DecodeIdsAsSerializedProto(ids); } -SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,std::vector< std::string > const &pieces){ +SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &pieces){ CheckIds(pieces, self->GetPieceSize()); return self->DecodePiecesAsSerializedProto(pieces); } @@ -3491,10 +3492,10 @@ SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodeIdsAsSerialize DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsSerializedProto, int, sentencepiece::util::bytes); } -SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodePiecesBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< std::string > > const &ins,int num_threads){ +SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodePiecesBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< absl::string_view > > const &ins,int num_threads){ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string); } -SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< std::string > > const &ins,int num_threads){ +SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< absl::string_view > > const &ins,int num_threads){ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string, sentencepiece::util::bytes); } @@ -3718,7 +3719,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadFromSerializedProto(PyObje SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } { try { @@ -3763,7 +3764,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetEncodeExtraOptions(PyObject SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } { try { @@ -3808,7 +3809,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetDecodeExtraOptions(PyObject SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } { try { @@ -3834,7 +3835,7 @@ fail: SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetVocabulary(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - std::vector< std::string > *arg2 = 0 ; + std::vector< absl::string_view > *arg2 = 0 ; void *argp1 = 0 ; int res1 = 0 ; PyObject *swig_obj[2] ; @@ -3847,14 +3848,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetVocabulary(PyObject *SWIGUN } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { - std::vector *out = nullptr; + std::vector *out = nullptr; if (PyList_Check(swig_obj[1])) { const size_t size = PyList_Size(swig_obj[1]); - out = new std::vector(size); + out = new std::vector(size); for (size_t i = 0; i < size; ++i) { const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); if (ustring.IsAvalable()) { - (*out)[i].assign(ustring.data(), ustring.size()); + (*out)[i] = ustring.str(); } else { PyErr_SetString(PyExc_TypeError, "list must contain strings"); SWIG_fail; @@ -3869,7 +3870,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetVocabulary(PyObject *SWIGUN } { try { - result = (arg1)->SetVocabulary((std::vector< std::string > const &)*arg2); + result = (arg1)->SetVocabulary((std::vector< absl::string_view > const &)*arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -3955,7 +3956,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadVocabulary(PyObject *SWIGU SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -3983,6 +3984,66 @@ fail: } +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy__SWIG_0(PyObject *SWIGUNUSEDPARM(self), Py_ssize_t nobjs, PyObject **swig_obj) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; + float arg3 ; + float *arg4 = (float *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + float val3 ; + int ecode3 = 0 ; + void *argp4 = 0 ; + int res4 = 0 ; + sentencepiece::util::Status result; + + if ((nobjs < 4) || (nobjs > 4)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + const PyInputString ustring(swig_obj[1]); + if (!ustring.IsAvalable()) { + PyErr_SetString(PyExc_TypeError, "not a string"); + SWIG_fail; + } + resultobj = ustring.input_type(); + arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "3"" of type '" "float""'"); + } + arg3 = static_cast< float >(val3); + res4 = SWIG_ConvertPtr(swig_obj[3], &argp4,SWIGTYPE_p_float, 0 | 0 ); + if (!SWIG_IsOK(res4)) { + SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "4"" of type '" "float *""'"); + } + arg4 = reinterpret_cast< float * >(argp4); + { + try { + result = ((sentencepiece::SentencePieceProcessor const *)arg1)->CalculateEntropy(arg2,arg3,arg4); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + { + if (!(&result)->ok()) { + SWIG_exception(ToSwigError((&result)->code()), (&result)->ToString().c_str()); + } + resultobj = SWIG_From_bool((&result)->ok()); + } + return resultobj; +fail: + return NULL; +} + + SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; @@ -4017,7 +4078,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(P SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -4054,9 +4115,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(P for (size_t i = 0; i < (&result)->size(); ++i) { PyObject *obj = PyList_New(result[i].first.size()); for (size_t j = 0; j < result[i].first.size(); ++j) { - PyList_SetItem(obj, j, MakePyOutputString(result[i].first[j], input_type)); + PyList_SET_ITEM(obj, j, MakePyOutputString(result[i].first[j], input_type)); } - PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); + PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); } } return resultobj; @@ -4099,7 +4160,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyOb SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -4135,9 +4196,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyOb for (size_t i = 0; i < (&result)->size(); ++i) { PyObject *obj = PyList_New(result[i].first.size()); for (size_t j = 0; j < result[i].first.size(); ++j) { - PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); + PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); } - PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); + PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); } } return resultobj; @@ -4146,7 +4207,7 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy__SWIG_1(PyObject *SWIGUNUSEDPARM(self), Py_ssize_t nobjs, PyObject **swig_obj) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; absl::string_view arg2 ; @@ -4155,10 +4216,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *SWI int res1 = 0 ; float val3 ; int ecode3 = 0 ; - PyObject *swig_obj[3] ; float result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_CalculateEntropy", 3, 3, swig_obj)) SWIG_fail; + if ((nobjs < 3) || (nobjs > 3)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); @@ -4171,7 +4231,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *SWI SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -4194,6 +4254,67 @@ fail: } +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *self, PyObject *args) { + Py_ssize_t argc; + PyObject *argv[5] = { + 0 + }; + + if (!(argc = SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_CalculateEntropy", 0, 4, argv))) SWIG_fail; + --argc; + if (argc == 3) { + int _v; + void *vptr = 0; + int res = SWIG_ConvertPtr(argv[0], &vptr, SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0); + _v = SWIG_CheckState(res); + if (_v) { + int res = SWIG_AsCharPtrAndSize(argv[1], 0, NULL, 0); + _v = SWIG_CheckState(res); + if (_v) { + { + int res = SWIG_AsVal_float(argv[2], NULL); + _v = SWIG_CheckState(res); + } + if (_v) { + return _wrap_SentencePieceProcessor_CalculateEntropy__SWIG_1(self, argc, argv); + } + } + } + } + if (argc == 4) { + int _v; + void *vptr = 0; + int res = SWIG_ConvertPtr(argv[0], &vptr, SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0); + _v = SWIG_CheckState(res); + if (_v) { + int res = SWIG_AsCharPtrAndSize(argv[1], 0, NULL, 0); + _v = SWIG_CheckState(res); + if (_v) { + { + int res = SWIG_AsVal_float(argv[2], NULL); + _v = SWIG_CheckState(res); + } + if (_v) { + void *vptr = 0; + int res = SWIG_ConvertPtr(argv[3], &vptr, SWIGTYPE_p_float, 0); + _v = SWIG_CheckState(res); + if (_v) { + return _wrap_SentencePieceProcessor_CalculateEntropy__SWIG_0(self, argc, argv); + } + } + } + } + } + +fail: + SWIG_Python_RaiseOrModifyTypeError("Wrong number or type of arguments for overloaded function 'SentencePieceProcessor_CalculateEntropy'.\n" + " Possible C/C++ prototypes are:\n" + " sentencepiece::SentencePieceProcessor::CalculateEntropy(absl::string_view,float,float *) const\n" + " sentencepiece::SentencePieceProcessor::CalculateEntropy(absl::string_view,float) const\n"); + return 0; +} + + SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetPieceSize(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; @@ -4247,7 +4368,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_PieceToId(PyObject *SWIGUNUSED SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } { try { @@ -4675,7 +4796,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadFromFile(PyObject *SWIGUNU SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } { try { @@ -4741,7 +4862,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIds(PyObject *SWIGUNU SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -4790,7 +4911,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIds(PyObject *SWIGUNU { resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, PyInt_FromLong(static_cast(result[i]))); + PyList_SET_ITEM(resultobj, i, PyInt_FromLong(static_cast(result[i]))); } } return resultobj; @@ -4842,7 +4963,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIG SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -4892,7 +5013,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIG PyObject *input_type = resultobj; resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); + PyList_SET_ITEM(resultobj, i, MakePyOutputString(result[i], input_type)); } } return resultobj; @@ -4944,7 +5065,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProto(PyObj SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -5046,7 +5167,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SW for (size_t i = 0; i < size; ++i) { const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); if (ustring.IsAvalable()) { - (*out)[i] = absl::string_view(ustring.data(), ustring.size()); + (*out)[i] = ustring.str(); } else { PyErr_SetString(PyExc_TypeError, "list must contain strings"); SWIG_fail; @@ -5113,9 +5234,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SW for (size_t i = 0; i < (&result)->size(); ++i) { PyObject *obj = PyList_New(result[i].size()); for (size_t j = 0; j < result[i].size(); ++j) { - PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); + PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i][j]))); } - PyList_SetItem(resultobj, i, obj); + PyList_SET_ITEM(resultobj, i, obj); } } { @@ -5177,7 +5298,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject for (size_t i = 0; i < size; ++i) { const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); if (ustring.IsAvalable()) { - (*out)[i] = absl::string_view(ustring.data(), ustring.size()); + (*out)[i] = ustring.str(); } else { PyErr_SetString(PyExc_TypeError, "list must contain strings"); SWIG_fail; @@ -5245,9 +5366,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject for (size_t i = 0; i < (&result)->size(); ++i) { PyObject *obj = PyList_New(result[i].size()); for (size_t j = 0; j < result[i].size(); ++j) { - PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); + PyList_SET_ITEM(obj, j, MakePyOutputString(result[i][j], input_type)); } - PyList_SetItem(resultobj, i, obj); + PyList_SET_ITEM(resultobj, i, obj); } } { @@ -5309,7 +5430,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch( for (size_t i = 0; i < size; ++i) { const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); if (ustring.IsAvalable()) { - (*out)[i] = absl::string_view(ustring.data(), ustring.size()); + (*out)[i] = ustring.str(); } else { PyErr_SetString(PyExc_TypeError, "list must contain strings"); SWIG_fail; @@ -5374,7 +5495,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch( { resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); + PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); } } { @@ -5452,7 +5573,7 @@ fail: SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - std::vector< std::string > *arg2 = 0 ; + std::vector< absl::string_view > *arg2 = 0 ; void *argp1 = 0 ; int res1 = 0 ; PyObject *swig_obj[2] ; @@ -5465,14 +5586,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePieces(PyObject *SWIGUN } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { - std::vector *out = nullptr; + std::vector *out = nullptr; if (PyList_Check(swig_obj[1])) { const size_t size = PyList_Size(swig_obj[1]); - out = new std::vector(size); + out = new std::vector(size); for (size_t i = 0; i < size; ++i) { const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); if (ustring.IsAvalable()) { - (*out)[i].assign(ustring.data(), ustring.size()); + (*out)[i] = ustring.str(); } else { PyErr_SetString(PyExc_TypeError, "list must contain strings"); SWIG_fail; @@ -5487,7 +5608,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePieces(PyObject *SWIGUN } { try { - result = sentencepiece_SentencePieceProcessor__DecodePieces((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::string > const &)*arg2); + result = sentencepiece_SentencePieceProcessor__DecodePieces((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5572,7 +5693,7 @@ fail: SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - std::vector< std::string > *arg2 = 0 ; + std::vector< absl::string_view > *arg2 = 0 ; void *argp1 = 0 ; int res1 = 0 ; PyObject *swig_obj[2] ; @@ -5585,14 +5706,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { - std::vector *out = nullptr; + std::vector *out = nullptr; if (PyList_Check(swig_obj[1])) { const size_t size = PyList_Size(swig_obj[1]); - out = new std::vector(size); + out = new std::vector(size); for (size_t i = 0; i < size; ++i) { const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); if (ustring.IsAvalable()) { - (*out)[i].assign(ustring.data(), ustring.size()); + (*out)[i] = ustring.str(); } else { PyErr_SetString(PyExc_TypeError, "list must contain strings"); SWIG_fail; @@ -5607,7 +5728,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto } { try { - result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::string > const &)*arg2); + result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5695,7 +5816,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsBatch(PyObject *SWIG PyObject *input_type = resultobj; resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); + PyList_SET_ITEM(resultobj, i, MakePyOutputString(result[i], input_type)); } } { @@ -5775,7 +5896,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsAsSerializedProtoBat { resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); + PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); } } { @@ -5793,7 +5914,7 @@ fail: SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - std::vector< std::vector< std::string > > *arg2 = 0 ; + std::vector< std::vector< absl::string_view > > *arg2 = 0 ; int arg3 ; void *argp1 = 0 ; int res1 = 0 ; @@ -5809,10 +5930,10 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *S } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { - std::vector> *out = nullptr; + std::vector> *out = nullptr; if (PyList_Check(swig_obj[1])) { const size_t size = PyList_Size(swig_obj[1]); - out = new std::vector>(size); + out = new std::vector>(size); for (size_t i = 0; i < size; ++i) { PyObject *o = PyList_GetItem(swig_obj[1], i); if (PyList_Check(o)) { @@ -5821,7 +5942,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *S for (size_t j = 0; j < size2; ++j) { const PyInputString ustring(PyList_GetItem(o, j)); if (ustring.IsAvalable()) { - (*out)[i][j].assign(ustring.data(), ustring.size()); + (*out)[i][j] = ustring.str(); } else { PyErr_SetString(PyExc_TypeError,"list must contain integers"); SWIG_fail; @@ -5846,7 +5967,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *S arg3 = static_cast< int >(val3); { try { - result = sentencepiece_SentencePieceProcessor__DecodePiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< std::string > > const &)*arg2,arg3); + result = sentencepiece_SentencePieceProcessor__DecodePiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< absl::string_view > > const &)*arg2,arg3); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5857,17 +5978,11 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *S PyObject *input_type = resultobj; resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); + PyList_SET_ITEM(resultobj, i, MakePyOutputString(result[i], input_type)); } } - { - delete arg2; - } return resultobj; fail: - { - delete arg2; - } return NULL; } @@ -5875,7 +5990,7 @@ fail: SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - std::vector< std::vector< std::string > > *arg2 = 0 ; + std::vector< std::vector< absl::string_view > > *arg2 = 0 ; int arg3 ; void *argp1 = 0 ; int res1 = 0 ; @@ -5891,10 +6006,10 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { - std::vector> *out = nullptr; + std::vector> *out = nullptr; if (PyList_Check(swig_obj[1])) { const size_t size = PyList_Size(swig_obj[1]); - out = new std::vector>(size); + out = new std::vector>(size); for (size_t i = 0; i < size; ++i) { PyObject *o = PyList_GetItem(swig_obj[1], i); if (PyList_Check(o)) { @@ -5903,7 +6018,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto for (size_t j = 0; j < size2; ++j) { const PyInputString ustring(PyList_GetItem(o, j)); if (ustring.IsAvalable()) { - (*out)[i][j].assign(ustring.data(), ustring.size()); + (*out)[i][j] = ustring.str(); } else { PyErr_SetString(PyExc_TypeError,"list must contain integers"); SWIG_fail; @@ -5928,7 +6043,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto arg3 = static_cast< int >(val3); { try { - result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< std::string > > const &)*arg2,arg3); + result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< absl::string_view > > const &)*arg2,arg3); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5938,17 +6053,11 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto { resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); + PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); } } - { - delete arg2; - } return resultobj; fail: - { - delete arg2; - } return NULL; } @@ -5990,7 +6099,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -6031,9 +6140,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW for (size_t i = 0; i < (&result)->size(); ++i) { PyObject *obj = PyList_New(result[i].size()); for (size_t j = 0; j < result[i].size(); ++j) { - PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); + PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i][j]))); } - PyList_SetItem(resultobj, i, obj); + PyList_SET_ITEM(resultobj, i, obj); } } return resultobj; @@ -6079,7 +6188,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -6121,9 +6230,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject for (size_t i = 0; i < (&result)->size(); ++i) { PyObject *obj = PyList_New(result[i].size()); for (size_t j = 0; j < result[i].size(); ++j) { - PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); + PyList_SET_ITEM(obj, j, MakePyOutputString(result[i][j], input_type)); } - PyList_SetItem(resultobj, i, obj); + PyList_SET_ITEM(resultobj, i, obj); } } return resultobj; @@ -6169,7 +6278,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsSerializedProto( SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -6260,7 +6369,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyO SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -6316,9 +6425,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyO for (size_t i = 0; i < (&result)->size(); ++i) { PyObject *obj = PyList_New(result[i].first.size()); for (size_t j = 0; j < result[i].first.size(); ++j) { - PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); + PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); } - PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); + PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); } } return resultobj; @@ -6373,7 +6482,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces( SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -6430,9 +6539,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces( for (size_t i = 0; i < (&result)->size(); ++i) { PyObject *obj = PyList_New(result[i].first.size()); for (size_t j = 0; j < result[i].first.size(); ++j) { - PyList_SetItem(obj, j, MakePyOutputString(result[i].first[j], input_type)); + PyList_SET_ITEM(obj, j, MakePyOutputString(result[i].first[j], input_type)); } - PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); + PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); } } return resultobj; @@ -6466,7 +6575,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropy(PyObject *SW SWIG_fail; } resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = ustring.str(); } ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { @@ -6518,7 +6627,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropyBatch(PyObjec for (size_t i = 0; i < size; ++i) { const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); if (ustring.IsAvalable()) { - (*out)[i] = absl::string_view(ustring.data(), ustring.size()); + (*out)[i] = ustring.str(); } else { PyErr_SetString(PyExc_TypeError, "list must contain strings"); SWIG_fail; @@ -6553,7 +6662,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropyBatch(PyObjec { resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, PyFloat_FromDouble(static_cast(result[i]))); + PyList_SET_ITEM(resultobj, i, PyFloat_FromDouble(static_cast(result[i]))); } } { @@ -6623,7 +6732,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceTrainer__TrainFromString(PyObject *SWIGU SWIG_fail; } resultobj = ustring.input_type(); - arg1 = absl::string_view(ustring.data(), ustring.size()); + arg1 = ustring.str(); } { try { @@ -6966,6 +7075,7 @@ static PyMethodDef SwigMethods_proxydocs[] = { /* -------- TYPE CONVERSION AND EQUIVALENCE RULES (BEGIN) -------- */ static swig_type_info _swigt__p_char = {"_p_char", "char *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_float = {"_p_float", "float *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_sentencepiece__SentenceIterator = {"_p_sentencepiece__SentenceIterator", "sentencepiece::SentenceIterator *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_sentencepiece__SentencePieceProcessor = {"_p_sentencepiece__SentencePieceProcessor", "sentencepiece::SentencePieceProcessor *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_sentencepiece__SentencePieceTrainer = {"_p_sentencepiece__SentencePieceTrainer", "sentencepiece::SentencePieceTrainer *", 0, 0, (void*)0, 0}; @@ -6973,12 +7083,12 @@ static swig_type_info _swigt__p_std__string = {"_p_std__string", "sentencepiece: static swig_type_info _swigt__p_std__unordered_mapT_std__string_std__string_t = {"_p_std__unordered_mapT_std__string_std__string_t", "std::unordered_map< std::string,std::string > *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_std__vectorT_absl__string_view_t = {"_p_std__vectorT_absl__string_view_t", "std::vector< absl::string_view > *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_std__vectorT_int_t = {"_p_std__vectorT_int_t", "std::vector< int > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_std__string_t = {"_p_std__vectorT_std__string_t", "std::vector< std::string > *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_std__vectorT_std__vectorT_absl__string_view_t_t = {"_p_std__vectorT_std__vectorT_absl__string_view_t_t", "std::vector< std::vector< absl::string_view > > *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_std__vectorT_std__vectorT_int_t_t = {"_p_std__vectorT_std__vectorT_int_t_t", "std::vector< std::vector< int > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_std__vectorT_std__string_t_t = {"_p_std__vectorT_std__vectorT_std__string_t_t", "std::vector< std::vector< std::string > > *", 0, 0, (void*)0, 0}; static swig_type_info *swig_type_initial[] = { &_swigt__p_char, + &_swigt__p_float, &_swigt__p_sentencepiece__SentenceIterator, &_swigt__p_sentencepiece__SentencePieceProcessor, &_swigt__p_sentencepiece__SentencePieceTrainer, @@ -6986,12 +7096,12 @@ static swig_type_info *swig_type_initial[] = { &_swigt__p_std__unordered_mapT_std__string_std__string_t, &_swigt__p_std__vectorT_absl__string_view_t, &_swigt__p_std__vectorT_int_t, - &_swigt__p_std__vectorT_std__string_t, + &_swigt__p_std__vectorT_std__vectorT_absl__string_view_t_t, &_swigt__p_std__vectorT_std__vectorT_int_t_t, - &_swigt__p_std__vectorT_std__vectorT_std__string_t_t, }; static swig_cast_info _swigc__p_char[] = { {&_swigt__p_char, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_float[] = { {&_swigt__p_float, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_sentencepiece__SentenceIterator[] = { {&_swigt__p_sentencepiece__SentenceIterator, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_sentencepiece__SentencePieceProcessor[] = { {&_swigt__p_sentencepiece__SentencePieceProcessor, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_sentencepiece__SentencePieceTrainer[] = { {&_swigt__p_sentencepiece__SentencePieceTrainer, 0, 0, 0},{0, 0, 0, 0}}; @@ -6999,12 +7109,12 @@ static swig_cast_info _swigc__p_std__string[] = { {&_swigt__p_std__string, 0, 0 static swig_cast_info _swigc__p_std__unordered_mapT_std__string_std__string_t[] = { {&_swigt__p_std__unordered_mapT_std__string_std__string_t, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_std__vectorT_absl__string_view_t[] = { {&_swigt__p_std__vectorT_absl__string_view_t, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_std__vectorT_int_t[] = { {&_swigt__p_std__vectorT_int_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_std__string_t[] = { {&_swigt__p_std__vectorT_std__string_t, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_std__vectorT_std__vectorT_absl__string_view_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_absl__string_view_t_t, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_std__vectorT_std__vectorT_int_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_int_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_std__vectorT_std__string_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_std__string_t_t, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info *swig_cast_initial[] = { _swigc__p_char, + _swigc__p_float, _swigc__p_sentencepiece__SentenceIterator, _swigc__p_sentencepiece__SentencePieceProcessor, _swigc__p_sentencepiece__SentencePieceTrainer, @@ -7012,9 +7122,8 @@ static swig_cast_info *swig_cast_initial[] = { _swigc__p_std__unordered_mapT_std__string_std__string_t, _swigc__p_std__vectorT_absl__string_view_t, _swigc__p_std__vectorT_int_t, - _swigc__p_std__vectorT_std__string_t, + _swigc__p_std__vectorT_std__vectorT_absl__string_view_t_t, _swigc__p_std__vectorT_std__vectorT_int_t_t, - _swigc__p_std__vectorT_std__vectorT_std__string_t_t, }; diff --git a/src/builder.cc b/src/builder.cc index 0fc7f24..822f6fc 100644 --- a/src/builder.cc +++ b/src/builder.cc @@ -272,7 +272,7 @@ util::Status Builder::DecompileCharsMap(absl::string_view blob, } // static -util::Status Builder::GetPrecompiledCharsMap(const std::string &name, +util::Status Builder::GetPrecompiledCharsMap(absl::string_view name, std::string *output) { CHECK_OR_RETURN(output); diff --git a/src/builder.h b/src/builder.h index 95c5168..094da72 100644 --- a/src/builder.h +++ b/src/builder.h @@ -51,7 +51,7 @@ class Builder { CharsMap *chars_map); // Returns a pre-compiled binary index with `name`. - static util::Status GetPrecompiledCharsMap(const std::string &name, + static util::Status GetPrecompiledCharsMap(absl::string_view name, std::string *output); // Makes a normalization mapping based on NFKC. diff --git a/src/common.h b/src/common.h index 6ec4c09..ab07d85 100644 --- a/src/common.h +++ b/src/common.h @@ -71,8 +71,7 @@ char (&ArraySizeHelper(const T (&array)[N]))[N]; namespace sentencepiece { #ifdef OS_WIN namespace win32 { -std::wstring Utf8ToWide(const std::string &input); -std::string WideToUtf8(const std::wstring &input); +std::wstring Utf8ToWide(const absl::string_view input); } // namespace win32 #endif diff --git a/src/error.cc b/src/error.cc index a226d98..10faa2d 100644 --- a/src/error.cc +++ b/src/error.cc @@ -61,15 +61,10 @@ struct Status::Rep { std::string error_message; }; -Status::Status(StatusCode code, const char* error_message) : rep_(new Rep) { - rep_->code = code; - rep_->error_message = error_message; -} - -Status::Status(StatusCode code, const std::string& error_message) +Status::Status(StatusCode code, absl::string_view error_message) : rep_(new Rep) { rep_->code = code; - rep_->error_message = error_message; + rep_->error_message = std::string(error_message); } Status::Status(const Status& s) diff --git a/src/sentencepiece_processor.cc b/src/sentencepiece_processor.cc index 4d697be..331fc90 100644 --- a/src/sentencepiece_processor.cc +++ b/src/sentencepiece_processor.cc @@ -48,6 +48,12 @@ const char kDefaultUnknownSymbol[] = " \xE2\x81\x87 "; // REPLACEMENT CHARACTER (U+FFFD) in UTF-8. const char kReplacementCharacter[] = "\xef\xbf\xbd"; + +std::vector ToPieceArray(const std::vector &v) { + std::vector out(v.size()); + for (int i = 0; i < v.size(); ++i) out[i] = v[i]; + return out; +} } // namespace SentencePieceProcessor::SentencePieceProcessor() {} @@ -146,7 +152,7 @@ util::Status SentencePieceProcessor::status() const { } util::Status SentencePieceProcessor::SetVocabulary( - const std::vector &valid_vocab) { + const std::vector &valid_vocab) { RETURN_IF_ERROR(status()); // TODO(taku): supports vocabulary constraint in BPE model. @@ -154,7 +160,8 @@ util::Status SentencePieceProcessor::SetVocabulary( CHECK_OR_RETURN(type == TrainerSpec::UNIGRAM || type == TrainerSpec::BPE) << "Vocabulary constraint is only enabled in subword units."; - const std::set vocab(valid_vocab.begin(), valid_vocab.end()); + const std::set vocab(valid_vocab.begin(), + valid_vocab.end()); for (int i = 0; i < model_proto_->pieces_size(); ++i) { auto *piece = model_proto_->mutable_pieces(i); @@ -207,7 +214,7 @@ util::Status SentencePieceProcessor::LoadVocabulary(absl::string_view filename, } } - return SetVocabulary(vocab); + return SetVocabulary(ToPieceArray(vocab)); } #define CHECK_OR_RETURN_STATUS_STL(container) \ @@ -250,6 +257,12 @@ util::Status SentencePieceProcessor::Encode(absl::string_view input, util::Status SentencePieceProcessor::Decode( const std::vector &pieces, std::string *detokenized) const { + return Decode(ToPieceArray(pieces), detokenized); +} + +util::Status SentencePieceProcessor::Decode( + const std::vector &pieces, + std::string *detokenized) const { CHECK_OR_RETURN_STATUS_STL(detokenized); SentencePieceText spt; @@ -593,6 +606,12 @@ util::Status SentencePieceProcessor::CalculateEntropy(absl::string_view input, util::Status SentencePieceProcessor::Decode( const std::vector &pieces, SentencePieceText *spt) const { + return Decode(ToPieceArray(pieces), spt); +} + +util::Status SentencePieceProcessor::Decode( + const std::vector &pieces, + SentencePieceText *spt) const { CHECK_OR_RETURN_STATUS_PROTO(spt); const char *unk_surface = kDefaultUnknownSymbol; @@ -637,9 +656,9 @@ util::Status SentencePieceProcessor::Decode( has_bos_ws); }; - for (const std::string &w : pieces) { + for (absl::string_view w : pieces) { auto *sp = spt->add_pieces(); - sp->set_piece(w); + sp->mutable_piece()->assign(w.data(), w.size()); sp->set_id(PieceToId(w)); } @@ -779,6 +798,13 @@ std::string SentencePieceProcessor::DecodePiecesAsSerializedProto( return spt.SerializeAsString(); } +std::string SentencePieceProcessor::DecodePiecesAsSerializedProto( + const std::vector &pieces) const { + SentencePieceText spt; + if (!Decode(pieces, &spt).ok()) return ""; + return spt.SerializeAsString(); +} + std::string SentencePieceProcessor::DecodeIdsAsSerializedProto( const std::vector &ids) const { SentencePieceText spt; diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h index 9d38214..8c72656 100644 --- a/src/sentencepiece_processor.h +++ b/src/sentencepiece_processor.h @@ -22,9 +22,11 @@ #include #include +#ifndef SWIG namespace absl { using std::string_view; } +#endif // SWIG namespace sentencepiece { @@ -58,8 +60,7 @@ class Status { public: Status(); ~Status(); - Status(StatusCode code, const char *error_message); - Status(StatusCode code, const std::string &error_message); + Status(StatusCode code, absl::string_view error_message); Status(const Status &s); void operator=(const Status &s); bool operator==(const Status &s) const; @@ -204,7 +205,7 @@ class SentencePieceProcessor { // Restricts the vocabulary set. // The input sentences are encoded into the tokens in `valid_vocab`. virtual util::Status SetVocabulary( - const std::vector &valid_vocab); + const std::vector &valid_vocab); // Reverts the vocabulary restriction. virtual util::Status ResetVocabulary(); @@ -230,6 +231,10 @@ class SentencePieceProcessor { virtual util::Status Decode(const std::vector &pieces, std::string *detokenized) const; + // Given a sequence of pieces, decodes it into a detokenized output. + virtual util::Status Decode(const std::vector &pieces, + std::string *detokenized) const; + // Given a sequence of ids, decodes it into a detokenized output. virtual util::Status Decode(const std::vector &ids, std::string *detokenized) const; @@ -320,16 +325,19 @@ class SentencePieceProcessor { absl::string_view input, int samples, float theta, bool wor, bool include_best, NBestSentencePieceText *samples_spt) const; -#ifndef SWIG // Calculate entropy of possible tokenisations virtual util::Status CalculateEntropy(absl::string_view input, float theta, float *entropy) const; -#endif // Given a sequence of pieces, decodes it into SentencePieceText. + // TODO(taku): Remove this API and use std::vector virtual util::Status Decode(const std::vector &pieces, SentencePieceText *spt) const; + // Given a sequence of pieces, decodes it into SentencePieceText. + virtual util::Status Decode(const std::vector &pieces, + SentencePieceText *spt) const; + // Given a sequence of ids, decodes it into SentencePieceText. virtual util::Status Decode(const std::vector &ids, SentencePieceText *spt) const; @@ -401,11 +409,17 @@ class SentencePieceProcessor { theta, wor, include_best); } + // TODO(taku): Remove this API and use std::vector virtual std::string DecodePieces( const std::vector &pieces) const { DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces); } + virtual std::string DecodePieces( + const std::vector &pieces) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces); + } + virtual std::string DecodeIds(const std::vector &ids) const { DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, ids); } @@ -428,9 +442,13 @@ class SentencePieceProcessor { virtual util::bytes NBestEncodeAsSerializedProto(absl::string_view input, int nbest_size) const; + // TODO(taku): Remove this API and use std::vector virtual util::bytes DecodePiecesAsSerializedProto( const std::vector &pieces) const; + virtual util::bytes DecodePiecesAsSerializedProto( + const std::vector &pieces) const; + virtual util::bytes DecodeIdsAsSerializedProto( const std::vector &ids) const; diff --git a/src/sentencepiece_trainer.h b/src/sentencepiece_trainer.h index bb74ab9..b4af6f0 100644 --- a/src/sentencepiece_trainer.h +++ b/src/sentencepiece_trainer.h @@ -129,12 +129,12 @@ class SentencePieceTrainer { // with comma-separated values. `field_name` must not be a nested message. // The body of these functions are automatically generated with // data/gen_spec_parser.pl - static util::Status SetProtoField(const std::string &name, - const std::string &value, + static util::Status SetProtoField(absl::string_view name, + absl::string_view value, TrainerSpec *message); - static util::Status SetProtoField(const std::string &name, - const std::string &value, + static util::Status SetProtoField(absl::string_view name, + absl::string_view value, NormalizerSpec *message); // Populates model type from string representation, e.g., "bpe". diff --git a/src/spec_parser.h b/src/spec_parser.h index b5713fb..de8f72f 100644 --- a/src/spec_parser.h +++ b/src/spec_parser.h @@ -25,10 +25,10 @@ namespace sentencepiece { -#define PARSE_STRING(param_name) \ - if (name == #param_name) { \ - message->set_##param_name(value); \ - return util::OkStatus(); \ +#define PARSE_STRING(param_name) \ + if (name == #param_name) { \ + message->set_##param_name(std::string(value)); \ + return util::OkStatus(); \ } #define PARSE_REPEATED_STRING(param_name) \ @@ -189,8 +189,8 @@ inline std::string PrintProto(const NormalizerSpec &message, return os.str(); } -util::Status SentencePieceTrainer::SetProtoField(const std::string &name, - const std::string &value, +util::Status SentencePieceTrainer::SetProtoField(absl::string_view name, + absl::string_view value, TrainerSpec *message) { CHECK_OR_RETURN(message); @@ -249,8 +249,8 @@ util::Status SentencePieceTrainer::SetProtoField(const std::string &name, << "unknown field name \"" << name << "\" in TrainerSpec."; } -util::Status SentencePieceTrainer::SetProtoField(const std::string &name, - const std::string &value, +util::Status SentencePieceTrainer::SetProtoField(absl::string_view name, + absl::string_view value, NormalizerSpec *message) { CHECK_OR_RETURN(message); diff --git a/src/spm_encode_main.cc b/src/spm_encode_main.cc index 4d12a38..b0e508d 100644 --- a/src/spm_encode_main.cc +++ b/src/spm_encode_main.cc @@ -92,13 +92,13 @@ int main(int argc, char *argv[]) { absl::flat_hash_map vocab; sentencepiece::SentencePieceText spt; sentencepiece::NBestSentencePieceText nbest_spt; - std::function process; + std::function process; const int nbest_size = absl::GetFlag(FLAGS_nbest_size); const float alpha = absl::GetFlag(FLAGS_alpha); if (absl::GetFlag(FLAGS_generate_vocabulary)) { - process = [&](const std::string &line) { + process = [&](absl::string_view line) { CHECK_OK(sp.Encode(line, &spt)); for (const auto &piece : spt.pieces()) { if (!sp.IsUnknown(piece.id()) && !sp.IsControl(piece.id())) @@ -106,47 +106,47 @@ int main(int argc, char *argv[]) { } }; } else if (absl::GetFlag(FLAGS_output_format) == "piece") { - process = [&](const std::string &line) { + process = [&](absl::string_view line) { CHECK_OK(sp.Encode(line, &sps)); output->WriteLine(absl::StrJoin(sps, " ")); }; } else if (absl::GetFlag(FLAGS_output_format) == "id") { - process = [&](const std::string &line) { + process = [&](absl::string_view line) { CHECK_OK(sp.Encode(line, &ids)); output->WriteLine(absl::StrJoin(ids, " ")); }; } else if (absl::GetFlag(FLAGS_output_format) == "proto") { - process = [&](const std::string &line) { CHECK_OK(sp.Encode(line, &spt)); }; + process = [&](absl::string_view line) { CHECK_OK(sp.Encode(line, &spt)); }; } else if (absl::GetFlag(FLAGS_output_format) == "sample_piece") { - process = [&](const std::string &line) { + process = [&](absl::string_view line) { CHECK_OK(sp.SampleEncode(line, nbest_size, alpha, &sps)); output->WriteLine(absl::StrJoin(sps, " ")); }; } else if (absl::GetFlag(FLAGS_output_format) == "sample_id") { - process = [&](const std::string &line) { + process = [&](absl::string_view line) { CHECK_OK(sp.SampleEncode(line, nbest_size, alpha, &ids)); output->WriteLine(absl::StrJoin(ids, " ")); }; } else if (absl::GetFlag(FLAGS_output_format) == "sample_proto") { - process = [&](const std::string &line) { + process = [&](absl::string_view line) { CHECK_OK(sp.SampleEncode(line, nbest_size, alpha, &spt)); }; } else if (absl::GetFlag(FLAGS_output_format) == "nbest_piece") { - process = [&](const std::string &line) { + process = [&](absl::string_view line) { CHECK_OK(sp.NBestEncode(line, nbest_size, &nbest_sps)); for (const auto &result : nbest_sps) { output->WriteLine(absl::StrJoin(result, " ")); } }; } else if (absl::GetFlag(FLAGS_output_format) == "nbest_id") { - process = [&](const std::string &line) { + process = [&](absl::string_view line) { CHECK_OK(sp.NBestEncode(line, nbest_size, &nbest_ids)); for (const auto &result : nbest_ids) { output->WriteLine(absl::StrJoin(result, " ")); } }; } else if (absl::GetFlag(FLAGS_output_format) == "nbest_proto") { - process = [&](const std::string &line) { + process = [&](absl::string_view line) { CHECK_OK(sp.NBestEncode(line, nbest_size, &nbest_spt)); }; } else { diff --git a/src/util.cc b/src/util.cc index f99c73a..f54e8ba 100644 --- a/src/util.cc +++ b/src/util.cc @@ -244,15 +244,16 @@ std::vector StrSplitAsCSV(absl::string_view text) { #ifdef OS_WIN namespace win32 { -std::wstring Utf8ToWide(const std::string &input) { - int output_length = - ::MultiByteToWideChar(CP_UTF8, 0, input.c_str(), -1, nullptr, 0); +std::wstring Utf8ToWide(absl::string_view input) { + int output_length = ::MultiByteToWideChar( + CP_UTF8, 0, input.data(), static_cast(input.size()), nullptr, 0); output_length = output_length <= 0 ? 0 : output_length - 1; if (output_length == 0) { return L""; } std::unique_ptr input_wide(new wchar_t[output_length + 1]); - const int result = ::MultiByteToWideChar(CP_UTF8, 0, input.c_str(), -1, + const int result = ::MultiByteToWideChar(CP_UTF8, 0, input.data(), + static_cast(input.size()), input_wide.get(), output_length + 1); std::wstring output; if (result > 0) { @@ -260,24 +261,6 @@ std::wstring Utf8ToWide(const std::string &input) { } return output; } - -std::string WideToUtf8(const std::wstring &input) { - const int output_length = ::WideCharToMultiByte(CP_UTF8, 0, input.c_str(), -1, - nullptr, 0, nullptr, nullptr); - if (output_length == 0) { - return ""; - } - - std::unique_ptr input_encoded(new char[output_length + 1]); - const int result = - ::WideCharToMultiByte(CP_UTF8, 0, input.c_str(), -1, input_encoded.get(), - output_length + 1, nullptr, nullptr); - std::string output; - if (result > 0) { - output.assign(input_encoded.get()); - } - return output; -} } // namespace win32 #endif } // namespace sentencepiece