// CheckIds overload for inputs supplied as piece strings. The body is empty,
// so no validation is performed and both `ids` and `num_pieces` go unused.
inline void CheckIds(const std::vector<absl::string_view> &ids, int num_pieces) {}
// ConvertToUnicodeSpans(T *): free-function dispatcher keyed on the output type.
// The primary template has an empty body, so it may be called on any output
// type; the two explicit specializations below forward to the member
// ConvertToUnicodeSpans() of the immutable proto wrappers.
// `proto` is dereferenced without a null check in the specializations; the
// callers visible in this change pass the address of a local (`&out`).
+template <typename T>
+inline void ConvertToUnicodeSpans(T *proto) {}
+
// Specialization for a single ImmutableSentencePieceText.
+template <>
+inline void ConvertToUnicodeSpans(sentencepiece::ImmutableSentencePieceText *proto) {
+ proto->ConvertToUnicodeSpans();
+}
+
// Specialization for an ImmutableNBestSentencePieceText.
+template <>
+inline void ConvertToUnicodeSpans(sentencepiece::ImmutableNBestSentencePieceText *proto) {
+ proto->ConvertToUnicodeSpans();
+}
+
class ThreadPool {
public:
explicit ThreadPool(size_t request_size) :
self->FuncName(ins[i]); \
RewriteIds(*self, &out, add_bos, add_eos, reverse, \
emit_unk_piece); \
+ ConvertToUnicodeSpans(&out); \
outs[i] = std::move(out); \
} \
}); \
pool.Schedule([&, n]() { \
for (size_t i = n; i < ins.size(); i += num_threads) { \
CheckIds(ins[i], self->GetPieceSize()); \
- outs[i] = self->FuncName(ins[i]); \
+ auto out = self->FuncName(ins[i]); \
+ ConvertToUnicodeSpans(&out); \
+ outs[i] = std::move(out); \
} \
}); \
} \
auto proto = enable_sampling ?
$self->SampleEncodeAsImmutableProto(text, nbest_size, alpha) :
$self->EncodeAsImmutableProto(text);
+ proto.ConvertToUnicodeSpans();
RewriteIds(*$self, &proto, add_bos, add_eos, reverse, emit_unk_piece);
return proto;
}
// Decodes `ids` and returns the result as an ImmutableSentencePieceText.
// The body uses SWIG's `$self` placeholder for the wrapped processor.
//   1. CheckIds() is called with the ids and the value of GetPieceSize().
//   2. The ids are decoded with DecodeIdsAsImmutableProto().
//   3. ConvertToUnicodeSpans() is applied to the proto before it is returned
//      (presumably rewriting spans from UTF-8 byte offsets to Unicode
//      character offsets -- confirm against ConvertToUnicodeSpansInternal).
sentencepiece::ImmutableSentencePieceText _DecodeIdsAsImmutableProto(
const std::vector<int> &ids) const {
CheckIds(ids, $self->GetPieceSize());
- return $self->DecodeIdsAsImmutableProto(ids);
+ auto proto = $self->DecodeIdsAsImmutableProto(ids);
+ proto.ConvertToUnicodeSpans();
+ return proto;
}
// Decodes `pieces` and returns the result as an ImmutableSentencePieceText.
// The body uses SWIG's `$self` placeholder for the wrapped processor.
// CheckIds() is called first; the string_view overload of CheckIds visible in
// this file has an empty body, so that call performs no validation.
// The decoded proto then has ConvertToUnicodeSpans() applied before it is
// returned.
sentencepiece::ImmutableSentencePieceText _DecodePiecesAsImmutableProto(
const std::vector<absl::string_view> &pieces) const {
CheckIds(pieces, $self->GetPieceSize());
- return $self->DecodePiecesAsImmutableProto(pieces);
+ auto proto= $self->DecodePiecesAsImmutableProto(pieces);
+ proto.ConvertToUnicodeSpans();
+ return proto;
}
/////////////////////////////////////////////////////////////////////////////
bool emit_unk_piece) const {
RewriteIds(*$self, static_cast<sentencepiece::ImmutableSentencePieceText *>(nullptr),
add_bos, add_eos, reverse, emit_unk_piece);
- return $self->NBestEncodeAsImmutableProto(text, nbest_size);
+ auto proto = $self->NBestEncodeAsImmutableProto(text, nbest_size);
+ proto.ConvertToUnicodeSpans();
+ return proto;
}
bool emit_unk_piece) const {
RewriteIds(*$self, static_cast<sentencepiece::util::bytes *>(nullptr),
add_bos, add_eos, reverse, emit_unk_piece);
- return $self->SampleEncodeAndScoreAsImmutableProto(text, num_samples,
+ auto proto = $self->SampleEncodeAndScoreAsImmutableProto(text, num_samples,
alpha, wor, include_best);
+ proto.ConvertToUnicodeSpans();
+ return proto;
}
// CheckIds overload for inputs supplied as piece strings. The body is empty,
// so no validation is performed and both `ids` and `num_pieces` go unused.
inline void CheckIds(const std::vector<absl::string_view> &ids, int num_pieces) {}
// ConvertToUnicodeSpans(T *): free-function dispatcher keyed on the output type.
// The primary template has an empty body, so it may be called on any output
// type; the two explicit specializations below forward to the member
// ConvertToUnicodeSpans() of the immutable proto wrappers.
// `proto` is dereferenced without a null check in the specializations; the
// callers visible in this change pass the address of a local (`&out`).
+template <typename T>
+inline void ConvertToUnicodeSpans(T *proto) {}
+
// Specialization for a single ImmutableSentencePieceText.
+template <>
+inline void ConvertToUnicodeSpans(sentencepiece::ImmutableSentencePieceText *proto) {
+ proto->ConvertToUnicodeSpans();
+}
+
// Specialization for an ImmutableNBestSentencePieceText.
+template <>
+inline void ConvertToUnicodeSpans(sentencepiece::ImmutableNBestSentencePieceText *proto) {
+ proto->ConvertToUnicodeSpans();
+}
+
class ThreadPool {
public:
explicit ThreadPool(size_t request_size) :
self->FuncName(ins[i]); \
RewriteIds(*self, &out, add_bos, add_eos, reverse, \
emit_unk_piece); \
+ ConvertToUnicodeSpans(&out); \
outs[i] = std::move(out); \
} \
}); \
pool.Schedule([&, n]() { \
for (size_t i = n; i < ins.size(); i += num_threads) { \
CheckIds(ins[i], self->GetPieceSize()); \
- outs[i] = self->FuncName(ins[i]); \
+ auto out = self->FuncName(ins[i]); \
+ ConvertToUnicodeSpans(&out); \
+ outs[i] = std::move(out); \
} \
}); \
} \
auto proto = enable_sampling ?
self->SampleEncodeAsImmutableProto(text, nbest_size, alpha) :
self->EncodeAsImmutableProto(text);
+ proto.ConvertToUnicodeSpans();
RewriteIds(*self, &proto, add_bos, add_eos, reverse, emit_unk_piece);
return proto;
}
}
// Wrapper around SentencePieceProcessor::DecodeIdsAsImmutableProto.
//   1. CheckIds() is called with the ids and the value of GetPieceSize().
//   2. The ids are decoded through `self`.
//   3. ConvertToUnicodeSpans() is applied to the proto before it is returned
//      (presumably rewriting spans from UTF-8 byte offsets to Unicode
//      character offsets -- confirm against ConvertToUnicodeSpansInternal).
SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_SentencePieceProcessor__DecodeIdsAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){
CheckIds(ids, self->GetPieceSize());
- return self->DecodeIdsAsImmutableProto(ids);
+ auto proto = self->DecodeIdsAsImmutableProto(ids);
+ proto.ConvertToUnicodeSpans();
+ return proto;
}
// Wrapper around SentencePieceProcessor::DecodePiecesAsImmutableProto.
// CheckIds() is called first; the string_view overload of CheckIds visible in
// this file has an empty body, so that call performs no validation.
// The decoded proto then has ConvertToUnicodeSpans() applied before it is
// returned.
SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_SentencePieceProcessor__DecodePiecesAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &pieces){
CheckIds(pieces, self->GetPieceSize());
- return self->DecodePiecesAsImmutableProto(pieces);
+ auto proto= self->DecodePiecesAsImmutableProto(pieces);
+ proto.ConvertToUnicodeSpans();
+ return proto;
}
SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodeIdsBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< int > > const &ins,int num_threads){
DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string);
// Wrapper around SentencePieceProcessor::NBestEncodeAsImmutableProto.
// RewriteIds() is invoked with a null ImmutableSentencePieceText pointer, so
// presumably only the add_bos/add_eos/reverse/emit_unk_piece flags matter to
// that call (e.g. to reject unsupported options) -- confirm against RewriteIds.
// The n-best result then has ConvertToUnicodeSpans() applied before it is
// returned.
SWIGINTERN sentencepiece::ImmutableNBestSentencePieceText sentencepiece_SentencePieceProcessor__NBestEncodeAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){
RewriteIds(*self, static_cast<sentencepiece::ImmutableSentencePieceText *>(nullptr),
add_bos, add_eos, reverse, emit_unk_piece);
- return self->NBestEncodeAsImmutableProto(text, nbest_size);
+ auto proto = self->NBestEncodeAsImmutableProto(text, nbest_size);
+ proto.ConvertToUnicodeSpans();
+ return proto;
}
SWIGINTERN std::vector< std::pair< std::vector< int >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){
auto idss = self->SampleEncodeAndScoreAsIds(text, num_samples,
// Wrapper around SentencePieceProcessor::SampleEncodeAndScoreAsImmutableProto.
// RewriteIds() is invoked with a null pointer, so presumably only the
// add_bos/add_eos/reverse/emit_unk_piece flags matter to that call -- confirm
// against RewriteIds.
// NOTE(review): the null pointer here is typed `sentencepiece::util::bytes *`,
// whereas the NBestEncodeAsImmutableProto wrapper passes a null
// `ImmutableSentencePieceText *` -- confirm which RewriteIds overload is
// intended for an immutable-proto result.
// The sampled result then has ConvertToUnicodeSpans() applied before it is
// returned.
SWIGINTERN sentencepiece::ImmutableNBestSentencePieceText sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){
RewriteIds(*self, static_cast<sentencepiece::util::bytes *>(nullptr),
add_bos, add_eos, reverse, emit_unk_piece);
- return self->SampleEncodeAndScoreAsImmutableProto(text, num_samples,
+ auto proto = self->SampleEncodeAndScoreAsImmutableProto(text, num_samples,
alpha, wor, include_best);
+ proto.ConvertToUnicodeSpans();
+ return proto;
}
// Thin forwarder: calls SentencePieceProcessor::CalculateEntropy(text, alpha)
// on `self` and returns its float result unchanged.
SWIGINTERN float sentencepiece_SentencePieceProcessor__CalculateEntropy(sentencepiece::SentencePieceProcessor *self,absl::string_view text,float alpha){
return self->CalculateEntropy(text, alpha);
}
void ConvertToUnicodeSpansInternal(SentencePieceText *spt) {
- if (spt == nullptr) return;
+ if (spt == nullptr || spt->text().empty()) return;
std::vector<int> utf8_to_unicode(spt->text().size() + 1, 0);
absl::string_view str = spt->text();
size_t prev = 0;
int ulen = 0;
while (!str.empty()) {
- const size_t mblen = string_util::OneCharLen(str.data());
+ const size_t mblen = std::max<int>(1, string_util::OneCharLen(str.data()));
for (int i = prev; i < prev + mblen; ++i) {
utf8_to_unicode[i] = ulen;
}
// Virtual, const Decode overload taking a vector of integer ids and a
// SentencePieceText pointer (presumably the output destination -- confirm
// against the implementation). The outcome is reported as a util::Status.
virtual util::Status Decode(const std::vector<int> &ids,
SentencePieceText *spt) const;
-
-#ifndef SWIGPYTHON
-
-#define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \
- OutType output; \
- const auto status = FuncName(__VA_ARGS__, &output); \
- return output;
-
-#define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...) \
- OutType output; \
- const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \
- return output.SerializeAsString();
-
-#define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...) \
- OutType output; \
- const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \
- return output;
-
// SPP_SWIG_CHECK_AND_THROW expects a local named `status` at the expansion
// site. When SWIG is defined it throws `status` if it is not ok; otherwise it
// expands to an if-statement with an empty body, so a failed status is tested
// but not acted upon.
+#ifdef SWIG
+#define SPP_SWIG_CHECK_AND_THROW \
+ if (!status.ok()) throw status;
#else
+#define SPP_SWIG_CHECK_AND_THROW \
+ if (!status.ok()) { \
+ }
+#endif // SWIG
// Function-body helper: declares `OutType output`, calls
// FuncName(<args>, &output), runs SPP_SWIG_CHECK_AND_THROW on the returned
// status, and returns `output` by value.
#define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \
OutType output; \
const auto status = FuncName(__VA_ARGS__, &output); \
- if (!status.ok()) throw status; \
+ SPP_SWIG_CHECK_AND_THROW; \
return output;
// Function-body helper: declares `OutType output`, calls
// FuncName(<args>, output.mutable_proto()), runs SPP_SWIG_CHECK_AND_THROW on
// the returned status, and returns output.SerializeAsString().
#define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...) \
OutType output; \
const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \
- if (!status.ok()) throw status; \
+ SPP_SWIG_CHECK_AND_THROW; \
return output.SerializeAsString();
// Function-body helper: declares `OutType output`, calls
// FuncName(<args>, output.mutable_proto()), runs SPP_SWIG_CHECK_AND_THROW on
// the returned status, and returns `output` by value.
// This change removes the output.ConvertToUnicodeSpans() call from the macro;
// the SWIG wrapper functions touched by the same change call
// ConvertToUnicodeSpans() on the returned proto themselves.
#define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...) \
OutType output; \
const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \
- if (!status.ok()) throw status; \
- output.ConvertToUnicodeSpans(); \
+ SPP_SWIG_CHECK_AND_THROW; \
return output;
-#endif // SWIGPYTHON
-
//////////////////////////////////////////////////////////////
// Handy methods that return the result directly.
// These functions ignore internal errors.