From 6993b50efd100477c0a0e2e3d931b2a408579a2b Mon Sep 17 00:00:00 2001 From: Kentaro Hayashi Date: Mon, 21 Nov 2022 13:43:46 +0000 Subject: [PATCH] Import sentencepiece_0.1.97-3.debian.tar.xz [dgit import tarball sentencepiece 0.1.97-3 sentencepiece_0.1.97-3.debian.tar.xz] --- README.Debian | 38 + changelog | 182 + control | 61 + copyright | 150 + gbp.conf | 3 + libsentencepiece-dev.install | 3 + libsentencepiece0.install | 1 + patches/0001-update-python-wrapper.patch | 5810 +++++++++++++++++ ...ove-debug-symbols-from-wheel-package.patch | 23 + ...ter-to-be-used-in-user_defined_symbo.patch | 71 + ...t-to-use-tab-as-user-defined-symbols.patch | 45 + patches/0005-Uses-C-17-by-default.patch | 915 +++ ...std-atomic-to-define-global-variable.patch | 73 + patches/0007-Fix-a-typo.patch | 30 + ...absl-string_view-as-much-as-possible.patch | 1595 +++++ patches/0009-Fixed-build-break.patch | 21 + ...0-Added-ImmutableSentencePiece-class.patch | 1648 +++++ patches/0011-add-verbose-option.patch | 172 + ...leSentencePieceText-from-python-modu.patch | 4346 ++++++++++++ patches/0013-Adds-more-unittests.patch | 1146 ++++ patches/0014-Adds-SWIGPYTHON-flag.patch | 44 + .../0015-remove-unused-ifdef-SWIG-macro.patch | 137 + patches/0016-Fixed-test-failure.patch | 281 + ...017-Uses-property-in-immutable-proto.patch | 878 +++ ...tect-the-number-of-CPUs-in-batch-pro.patch | 252 + ...pport-slice-in-pieces-nbests-objects.patch | 78 + patches/0020-Updated-the-document.patch | 546 ++ ...021-Fixed-errors-in-example-notebook.patch | 158 + patches/0022-Fix-dead-links.patch | 36 + ...brary-function-to-uninitialize-globa.patch | 176 + ...of-concatinating-paths-for-pkg-confi.patch | 153 + patches/disable-static-library.patch | 44 + patches/header-dependencies.patch | 27 + patches/series | 27 + patches/support-python-module-in-place.patch | 58 + python3-sentencepiece.install | 1 + rules | 41 + salsa-ci.yml | 7 + sentencepiece.docs | 1 + sentencepiece.install | 1 + sentencepiece.xml | 291 + 
source/format | 1 + watch | 4 + 43 files changed, 19575 insertions(+) create mode 100644 README.Debian create mode 100644 changelog create mode 100644 control create mode 100644 copyright create mode 100644 gbp.conf create mode 100644 libsentencepiece-dev.install create mode 100644 libsentencepiece0.install create mode 100644 patches/0001-update-python-wrapper.patch create mode 100644 patches/0002-remove-debug-symbols-from-wheel-package.patch create mode 100644 patches/0003-allow-tab-character-to-be-used-in-user_defined_symbo.patch create mode 100644 patches/0004-add-test-to-use-tab-as-user-defined-symbols.patch create mode 100644 patches/0005-Uses-C-17-by-default.patch create mode 100644 patches/0006-Uses-std-atomic-to-define-global-variable.patch create mode 100644 patches/0007-Fix-a-typo.patch create mode 100644 patches/0008-Uses-absl-string_view-as-much-as-possible.patch create mode 100644 patches/0009-Fixed-build-break.patch create mode 100644 patches/0010-Added-ImmutableSentencePiece-class.patch create mode 100644 patches/0011-add-verbose-option.patch create mode 100644 patches/0012-Supports-ImmutableSentencePieceText-from-python-modu.patch create mode 100644 patches/0013-Adds-more-unittests.patch create mode 100644 patches/0014-Adds-SWIGPYTHON-flag.patch create mode 100644 patches/0015-remove-unused-ifdef-SWIG-macro.patch create mode 100644 patches/0016-Fixed-test-failure.patch create mode 100644 patches/0017-Uses-property-in-immutable-proto.patch create mode 100644 patches/0018-automatically-detect-the-number-of-CPUs-in-batch-pro.patch create mode 100644 patches/0019-support-slice-in-pieces-nbests-objects.patch create mode 100644 patches/0020-Updated-the-document.patch create mode 100644 patches/0021-Fixed-errors-in-example-notebook.patch create mode 100644 patches/0022-Fix-dead-links.patch create mode 100644 patches/0023-added-ShutdownLibrary-function-to-uninitialize-globa.patch create mode 100644 
patches/0024-Fixed-the-issue-of-concatinating-paths-for-pkg-confi.patch create mode 100644 patches/disable-static-library.patch create mode 100644 patches/header-dependencies.patch create mode 100644 patches/series create mode 100644 patches/support-python-module-in-place.patch create mode 100644 python3-sentencepiece.install create mode 100755 rules create mode 100644 salsa-ci.yml create mode 100644 sentencepiece.docs create mode 100644 sentencepiece.install create mode 100644 sentencepiece.xml create mode 100644 source/format create mode 100644 watch diff --git a/README.Debian b/README.Debian new file mode 100644 index 0000000..147aaa5 --- /dev/null +++ b/README.Debian @@ -0,0 +1,38 @@ +# sentencepiece for Debian + +The upstream of sentencepiece 0.1.97 was initially released around June 6, 2022, +but it was withdrawn and re-released under the same version on Aug 7, 2022. + +Thus, some commits were not included in 0.1.97-1. + +To fix up this issue, commits since 5e5adf2f851a1514ccc435aae11ee830c438321b +were applied as the following patch files. + +NOTE: Drop these patch files when a newer version is released. 
+ +0001-update-python-wrapper.patch +0002-remove-debug-symbols-from-wheel-package.patch +0003-allow-tab-character-to-be-used-in-user_defined_symbo.patch +0004-add-test-to-use-tab-as-user-defined-symbols.patch +0005-Uses-C-17-by-default.patch +0006-Uses-std-atomic-to-define-global-variable.patch +0007-Fix-a-typo.patch +0008-Uses-absl-string_view-as-much-as-possible.patch +0009-Fixed-build-break.patch +0010-Added-ImmutableSentencePiece-class.patch +0011-add-verbose-option.patch +0012-Supports-ImmutableSentencePieceText-from-python-modu.patch +0013-Adds-more-unittests.patch +0014-Adds-SWIGPYTHON-flag.patch +0015-remove-unused-ifdef-SWIG-macro.patch +0016-Fixed-test-failure.patch +0017-Uses-property-in-immutable-proto.patch +0018-automatically-detect-the-number-of-CPUs-in-batch-pro.patch +0019-support-slice-in-pieces-nbests-objects.patch +0020-Updated-the-document.patch +0021-Fixed-errors-in-example-notebook.patch +0022-Fix-dead-links.patch +0023-added-ShutdownLibrary-function-to-uninitialize-globa.patch +0024-Fixed-the-issue-of-concatinating-paths-for-pkg-confi.patch + + diff --git a/changelog b/changelog new file mode 100644 index 0000000..ca65a4d --- /dev/null +++ b/changelog @@ -0,0 +1,182 @@ +sentencepiece (0.1.97-3) unstable; urgency=medium + + * debian/patches/0001-update-python-wrapper.patch + debian/patches/0002-remove-debug-symbols-from-wheel-package.patch + debian/patches/0003-allow-tab-character-to-be-used-in-user_defined_symbo.patch + debian/patches/0004-add-test-to-use-tab-as-user-defined-symbols.patch + debian/patches/0005-Uses-C-17-by-default.patch + debian/patches/0006-Uses-std-atomic-to-define-global-variable.patch + debian/patches/0007-Fix-a-typo.patch + debian/patches/0008-Uses-absl-string_view-as-much-as-possible.patch + debian/patches/0009-Fixed-build-break.patch + debian/patches/0010-Added-ImmutableSentencePiece-class.patch + debian/patches/0011-add-verbose-option.patch + 
debian/patches/0012-Supports-ImmutableSentencePieceText-from-python-modu.patch + debian/patches/0013-Adds-more-unittests.patch + debian/patches/0014-Adds-SWIGPYTHON-flag.patch + debian/patches/0015-remove-unused-ifdef-SWIG-macro.patch + debian/patches/0016-Fixed-test-failure.patch + debian/patches/0017-Uses-property-in-immutable-proto.patch + debian/patches/0018-automatically-detect-the-number-of-CPUs-in-batch-pro.patch + debian/patches/0019-support-slice-in-pieces-nbests-objects.patch + debian/patches/0020-Updated-the-document.patch + debian/patches/0021-Fixed-errors-in-example-notebook.patch + debian/patches/0022-Fix-dead-links.patch + debian/patches/0023-added-ShutdownLibrary-function-to-uninitialize-globa.patch + debian/patches/0024-Fixed-the-issue-of-concatinating-paths-for-pkg-confi.patch + - Add missing patch files for 0.1.97. + * debian/README.Debian + - Add explanation of debian/patches. + + -- Kentaro Hayashi Mon, 21 Nov 2022 22:43:46 +0900 + +sentencepiece (0.1.97-2) unstable; urgency=medium + + * Team upload + + [ Steve Langasek ] + * debian/patches/header-dependencies.patch: include necessary headers + to ensure IS_BIG_ENDIAN is defined, see #1017360. + + -- Graham Inggs Sun, 18 Sep 2022 05:30:57 +0000 + +sentencepiece (0.1.97-1) unstable; urgency=medium + + * New upstream version 0.1.97 + * debian/copyright + - Update maintainer E-mail address + * debian/control + - Bump Standards-Version to 4.6.1. No other changes are required. + * debian/patches/support-python-module-in-place.patch + - Refresh path to build python module. + + -- Kentaro Hayashi Tue, 14 Jun 2022 20:19:58 +0900 + +sentencepiece (0.1.96-1) unstable; urgency=medium + + * New upstream version 0.1.96 + * debian/control + - Bump standard-version to 4.5.1. No changes are required. 
+ + -- Kentaro Hayashi Wed, 18 Aug 2021 20:52:46 +0900 + +sentencepiece (0.1.95-1) unstable; urgency=medium + + * New upstream version 0.1.95 + * debian/patches/support-python-module-in-place.patch + - Fix undefined symbol when importing python module (Closes: #979040) + + -- Kentaro Hayashi Thu, 11 Feb 2021 17:36:23 +0900 + +sentencepiece (0.1.94-2) unstable; urgency=medium + + * Fix FTBFS on armel/mipsel (Closes: #977235) + + -- Kentaro Hayashi Wed, 16 Dec 2020 21:18:15 +0900 + +sentencepiece (0.1.94-1) unstable; urgency=medium + + * New upstream version 0.1.94 + * debian/patches/support-python-module-in-place.patch + - Refresh path to build python module. + * debian/patches/fix-ftbfs-ports.patch + debian/patches/mutiarch-support.patch + - Remove needless patch because these patch was merged + to google/sentencepiece. + + -- Kentaro Hayashi Wed, 28 Oct 2020 21:02:07 +0900 + +sentencepiece (0.1.93-1) unstable; urgency=medium + + * New upstream version 0.1.93 + * debian/source/lintian-overrides + - Remove needless override. 
+ + -- Kentaro Hayashi Thu, 15 Oct 2020 21:32:05 +0900 + +sentencepiece (0.1.92-3) unstable; urgency=medium + + * debian/patches/fix-ftbfs-ports.patch + - Fix FTBFS on powerpc + + -- Kentaro Hayashi Sat, 03 Oct 2020 20:48:27 +0900 + +sentencepiece (0.1.92-2) unstable; urgency=medium + + * debian/patches/0002-Change-in-order-to-build-Python-modules-in-place.patch + - Fix FTBFS on hurd-i386 + * debian/patches/0004-Fix-FTBFS-on-armel-and-mipsel.patch + - Fix missing dependency to atomic library (powerpc,m68k,sh4) + + -- Kentaro Hayashi Sat, 26 Sep 2020 20:27:17 +0900 + +sentencepiece (0.1.92-1) unstable; urgency=medium + + * New upstream version 0.1.92 + + -- Kentaro Hayashi Fri, 19 Jun 2020 19:38:49 +0900 + +sentencepiece (0.1.91-1) unstable; urgency=medium + + * New upstream version 0.1.91 + + -- Kentaro Hayashi Fri, 22 May 2020 15:17:42 +0900 + +sentencepiece (0.1.90-3) unstable; urgency=medium + + * debian/patches/0004-Fix-FTBFS-on-armel-and-mipsel.patch + - Refresh patch to fix FTBFS. + + -- Kentaro Hayashi Sun, 17 May 2020 09:02:23 +0900 + +sentencepiece (0.1.90-2) unstable; urgency=medium + + * debian/patches/0004-Fix-FTBFS-on-armel-and-mipsel.patch + - Add patch to fix FTBFS on mipsel and armel + + -- Kentaro Hayashi Sat, 16 May 2020 16:16:45 +0900 + +sentencepiece (0.1.90-1) unstable; urgency=medium + + * New upstream version 0.1.90 + * debian/control + - Update Uploaders: + - Bump standard-version to 4.5.0 + - Bump compat version to 13. + * debian/source/lintian-overrides + - Fix false positive source-is-missing + * debian/patches/0003-Disable-static-library-explicitly.patch + - Disable to build static library + + -- Kentaro Hayashi Wed, 13 May 2020 19:09:34 +0900 + +sentencepiece (0.1.84-1) unstable; urgency=medium + + * New upstream version 0.1.84 (Closes: #939860) + + [ TSUCHIYA Masatoshi ] + * Initial packaging tasks. + * Remove pipeline configurations for BitBucket. 
+ + [ Kentaro Hayashi ] + * debian/gbp.conf + - Add basic configuration about debian-branch + * debian/watch + - Add missing watch file to detect a new release + * debian/control + - Update deprecated Priority: to optional + - Add Vcs-* fields + - Fix W: sentencepiece: description-synopsis-starts-with-article + - Bump standard version to 4.4.1 + - Update Vcs-* under science-team + - Bump up compatibility level + - Drop python2 support + * debian/copyright + - Use https:// + - Update copyright about third party modules + * debian/rules + - Enable hardening + * debian/salsa-ci.yml + - Add Salsa CI configuration + + -- Kentaro Hayashi Thu, 17 Oct 2019 13:33:34 +0900 diff --git a/control b/control new file mode 100644 index 0000000..69e3062 --- /dev/null +++ b/control @@ -0,0 +1,61 @@ +Source: sentencepiece +Section: science +Priority: optional +Maintainer: Debian Science Maintainers +Uploaders: + TSUCHIYA Masatoshi , + Kentaro Hayashi +Build-Depends: + debhelper-compat (= 13), + protobuf-compiler, + libprotobuf-dev, + dh-python, + python3-all-dev, + quilt, + cmake, + python3-setuptools +Standards-Version: 4.6.1 +Homepage: https://github.com/google/sentencepiece +Vcs-Browser: https://salsa.debian.org/science-team/sentencepiece +Vcs-Git: https://salsa.debian.org/science-team/sentencepiece.git +Rules-Requires-Root: no + +Package: sentencepiece +Architecture: any +Depends: ${shlibs:Depends}, ${misc:Depends} +Description: Unsupervised text tokenizer and detokenizer + SentencePiece is an unsupervised text tokenizer/detokenizer mainly + designed for Neural Network-based text generation systems where the + vocabulary size is predetermined prior to the neural model training. 
+ +Package: libsentencepiece0 +Section: libs +Architecture: any +Depends: ${shlibs:Depends}, ${misc:Depends} +Description: Library files of SentencePiece + SentencePiece is an unsupervised text tokenizer/detokenizer mainly + designed for Neural Network-based text generation systems where the + vocabulary size is predetermined prior to the neural model training. + +Package: libsentencepiece-dev +Section: libdevel +Architecture: any +Depends: libsentencepiece0 (= ${binary:Version}), ${misc:Depends} +Description: Header files of SentencePiece + SentencePiece is an unsupervised text tokenizer/detokenizer mainly + designed for Neural Network-based text generation systems where the + vocabulary size is predetermined prior to the neural model training. + +Package: python3-sentencepiece +Section: python +Architecture: any +Depends: + ${shlibs:Depends}, + ${misc:Depends}, + ${python3:Depends} +Description: SentencePiece binding for Python3 + SentencePiece is an unsupervised text tokenizer/detokenizer mainly + designed for Neural Network-based text generation systems where the + vocabulary size is predetermined prior to the neural model training. + . + python3-sentencepiece is its binding for Python3. diff --git a/copyright b/copyright new file mode 100644 index 0000000..17b9239 --- /dev/null +++ b/copyright @@ -0,0 +1,150 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: sentencepiece +Source: https://github.com/google/sentencepiece + +Files: * +Copyright: 2017 Taku Kudo +License: Apache-2.0 + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + . + http://www.apache.org/licenses/LICENSE-2.0 + . + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied. 
See the License for the specific language governing + permissions and limitations under the License. + +Files: debian/* +Copyright: + 2016 TSUCHIYA Masatoshi + 2019-2022 Kentaro Hayashi +License: GPL-2+ + This package is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + . + This package is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + . + You should have received a copy of the GNU General Public License + along with this program. If not, see + . + On Debian systems, the complete text of the GNU General + Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". + +Files: third_party/esaxx/* +Copyright: 2010 Daisuke Okanohara +License: MIT + +Files: third_party/darts_clone/* +Copyright: 2008-2011, Susumu Yata +License: BSD-3-clause + +Files: third_party/protobuf-lite/* +Copyright: 2008 Google Inc. +License: BSD-3-clause + +Files: data/Scripts.txt +Copyright: 1991-2016 Unicode, Inc. +License: Unicode + COPYRIGHT AND PERMISSION NOTICE + . + Copyright © 1991-2016 Unicode, Inc. All rights reserved. + Distributed under the Terms of Use in https://www.unicode.org/copyright.html. + . 
+ Permission is hereby granted, free of charge, to any person obtaining + a copy of the Unicode data files and any associated documentation + (the "Data Files") or Unicode software and any associated documentation + (the "Software") to deal in the Data Files or Software + without restriction, including without limitation the rights to use, + copy, modify, merge, publish, distribute, and/or sell copies of + the Data Files or Software, and to permit persons to whom the Data Files + or Software are furnished to do so, provided that either + (a) this copyright and permission notice appear with all copies + of the Data Files or Software, or + (b) this copyright and permission notice appear in associated + Documentation. + . + THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF + ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS + NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL + DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + PERFORMANCE OF THE DATA FILES OR SOFTWARE. + . + Except as contained in this notice, the name of a copyright holder + shall not be used in advertising or otherwise to promote the sale, + use or other dealings in these Data Files or Software without prior + written authorization of the copyright holder. + +Files: data/botchan.txt +Copyright: Kin-nosuke Natsume +License: public-domain + Written by Kin-nosuke Natsume and put into the public domain. + It was translated by Yasotaro Morri and published by Project Gutenberg. 
+ +Files: data/wagahaiwa_nekodearu.txt +Copyright: Kin-nosuke Natsume +License: public-domain + Written by Kin-nosuke Natsume and put into the public domain. + It was digitized by Aozora Bunko collaborators and published by Aozora Bunko. + +License: MIT + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, + copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following + conditions: + . + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + +License: BSD-3-clause + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + . + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. 
+ - Neither the name of the nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + . + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/gbp.conf b/gbp.conf new file mode 100644 index 0000000..7c93e18 --- /dev/null +++ b/gbp.conf @@ -0,0 +1,3 @@ +[DEFAULT] +debian-branch = master + diff --git a/libsentencepiece-dev.install b/libsentencepiece-dev.install new file mode 100644 index 0000000..b363748 --- /dev/null +++ b/libsentencepiece-dev.install @@ -0,0 +1,3 @@ +usr/lib/*/lib*.so +usr/lib/*/pkgconfig/* +usr/include/* diff --git a/libsentencepiece0.install b/libsentencepiece0.install new file mode 100644 index 0000000..3ddde58 --- /dev/null +++ b/libsentencepiece0.install @@ -0,0 +1 @@ +usr/lib/*/lib*.so.* diff --git a/patches/0001-update-python-wrapper.patch b/patches/0001-update-python-wrapper.patch new file mode 100644 index 0000000..07f2254 --- /dev/null +++ b/patches/0001-update-python-wrapper.patch @@ -0,0 +1,5810 @@ +From: Taku Kudo +Date: Wed, 8 Jun 2022 02:22:21 +0900 +Subject: update python wrapper. 
+ +Signed-off-by: Kentaro Hayashi +--- + python/make_py_wheel.sh | 73 - + python/make_py_wheel_mac.sh | 89 - + python/once.h | 157 -- + python/src/sentencepiece/__init__.py | 293 ++- + python/src/sentencepiece/sentencepiece.i | 648 +++++- + python/src/sentencepiece/sentencepiece_wrap.cxx | 2383 +++++++++++++++-------- + python/test/sentencepiece_test.py | 424 ++-- + 7 files changed, 2575 insertions(+), 1492 deletions(-) + delete mode 100755 python/make_py_wheel.sh + delete mode 100755 python/make_py_wheel_mac.sh + delete mode 100644 python/once.h + +diff --git a/python/make_py_wheel.sh b/python/make_py_wheel.sh +deleted file mode 100755 +index 2e123ce..0000000 +--- a/python/make_py_wheel.sh ++++ /dev/null +@@ -1,73 +0,0 @@ +-#!/bin/bash +-# Copyright 2018 Google Inc. +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License.! +-set -e # exit immediately on error +-set -x # display all commands +- +-CMAKE_VERSION=3.12.0 +- +-run_docker() { +- cd `dirname $0` +- docker pull $1 +- docker run --rm -ti --name py_sentencepiece \ +- -v `pwd`/../:/sentencepiece -w /sentencepiece/python \ +- -td $1 /bin/bash +- docker exec py_sentencepiece bash -c "./make_py_wheel.sh native $2" +- docker stop py_sentencepiece +-} +- +-build() { +- TRG=$1 +- rm -fr build +- mkdir -p build +- cd build +- +- # Install sentencepiece +- cmake ../.. -DSPM_ENABLE_SHARED=OFF +- make -j4 +- make install +- cd .. 
+- +- for i in /opt/python/* +- do +- export LD_LIBRARY_PATH=/usr/local/lib:/usr/lib +- $i/bin/python setup.py clean +- $i/bin/python setup.py bdist +- strip build/*/*/*.so +- $i/bin/python setup.py bdist_wheel +- $i/bin/python setup.py test +- rm -fr build +- rm -fr *.so +- done +- +- cd dist +- for i in *${TRG}.whl +- do +- auditwheel repair $i +- done +- +- mv -f wheelhouse/*${TRG}.whl . +- +- cd .. +- rm -fr build +-} +- +-if [ "$1" = "native" ]; then +- build $2 +-elif [ "$#" -eq 1 ]; then +- run_docker quay.io/pypa/manylinux2014_${1} ${1} +-else +- run_docker quay.io/pypa/manylinux2014_i686 i686 +- run_docker quay.io/pypa/manylinux2014_x86_64 x86_64 +-fi +diff --git a/python/make_py_wheel_mac.sh b/python/make_py_wheel_mac.sh +deleted file mode 100755 +index bed7366..0000000 +--- a/python/make_py_wheel_mac.sh ++++ /dev/null +@@ -1,89 +0,0 @@ +-#!/bin/bash +-# Copyright 2018 Google Inc. +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License.! 
+- +-set -e # exit immediately on error +-set -x # display all commands +- +-build_python() { +- VERSION=$1 +- URL=$2 +- INSTALL_PATH="/Library/Frameworks/Python.framework/Versions/${VERSION}/bin" +- CURRENT_PATH=${PATH} +- +- curl -L -o python.pkg ${URL} +- sudo installer -pkg python.pkg -target / +- +- if [ -f "${INSTALL_PATH}/python3" ]; then +- ln -s ${INSTALL_PATH}/python3 ${INSTALL_PATH}/python +- ln -s ${INSTALL_PATH}/python3-config ${INSTALL_PATH}/python-config +- ln -s ${INSTALL_PATH}/pip3 ${INSTALL_PATH}/pip +- fi +- +- export PATH="${INSTALL_PATH}:${CURRENT_PATH}" +- ls -l ${INSTALL_PATH} +- which python +- which pip +- python --version +- curl -L -o get-pip.py https://bootstrap.pypa.io/pip/3.6/get-pip.py +- sudo python ./get-pip.py --no-setuptools --no-wheel --ignore-installed +- pip install --upgrade setuptools +- pip install wheel +- pip install delocate +- python setup.py clean +- python setup.py bdist_wheel --plat-name=macosx_10_6_x86_64 +- python setup.py test +- delocate-listdeps dist/*.whl +- delocate-wheel -w dist/delocated_wheel dist/*.whl +- export PATH="${CURRENT_PATH}" +- +- ls -l dist/delocated_wheel +- rm -fr build +- rm -fr *.so +- rm -fr dist/*.whl +- rm -fr python.pkg +-} +- +-build() { +- cd python +- rm -fr build +- mkdir -p build +- cd build +- +- # Install sentencepiece +- cmake ../.. -DSPM_ENABLE_SHARED=OFF -DSPM_NO_THREADLOCAL=ON +- make -j4 VERBOSE=1 +- make install +- cd .. 
+- +- mkdir -p dist/delocated_wheel +- +-# build_python 2.7 https://www.python.org/ftp/python/2.7.15/python-2.7.15-macosx10.6.pkg +-# latest pip doesn't support Py3.4 +- # build_python 3.4 https://www.python.org/ftp/python/3.4.4/python-3.4.4-macosx10.6.pkg +- curl -L -O https://bootstrap.pypa.io/pip/3.5/get-pip.py +- build_python 3.5 https://www.python.org/ftp/python/3.5.4/python-3.5.4-macosx10.6.pkg +- +- curl -L -O https://bootstrap.pypa.io/get-pip.py +- build_python 3.6 https://www.python.org/ftp/python/3.6.6/python-3.6.6-macosx10.6.pkg +- build_python 3.7 https://www.python.org/ftp/python/3.7.9/python-3.7.9-macosx10.9.pkg +- build_python 3.8 https://www.python.org/ftp/python/3.8.6/python-3.8.6-macosx10.9.pkg +- build_python 3.9 https://www.python.org/ftp/python/3.9.0/python-3.9.0-macosx10.9.pkg +- +- cd .. +- +- rm -fr build +-} +- +-build +diff --git a/python/once.h b/python/once.h +deleted file mode 100644 +index fc7553a..0000000 +--- a/python/once.h ++++ /dev/null +@@ -1,157 +0,0 @@ +-// Protocol Buffers - Google's data interchange format +-// Copyright 2008 Google Inc. All rights reserved. +-// https://developers.google.com/protocol-buffers/ +-// +-// Redistribution and use in source and binary forms, with or without +-// modification, are permitted provided that the following conditions are +-// met: +-// +-// * Redistributions of source code must retain the above copyright +-// notice, this list of conditions and the following disclaimer. +-// * Redistributions in binary form must reproduce the above +-// copyright notice, this list of conditions and the following disclaimer +-// in the documentation and/or other materials provided with the +-// distribution. +-// * Neither the name of Google Inc. nor the names of its +-// contributors may be used to endorse or promote products derived from +-// this software without specific prior written permission. 
+-// +-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +- +-// Author: kenton@google.com (Kenton Varda) +-// +-// emulates google3/base/once.h +-// +-// This header is intended to be included only by internal .cc files and +-// generated .pb.cc files. Users should not use this directly. +-// +-// This is basically a portable version of pthread_once(). +-// +-// This header declares: +-// * A type called ProtobufOnceType. +-// * A macro GOOGLE_PROTOBUF_DECLARE_ONCE() which declares a variable of type +-// ProtobufOnceType. This is the only legal way to declare such a variable. +-// The macro may only be used at the global scope (you cannot create local or +-// class member variables of this type). +-// * A function GoogleOnceInit(ProtobufOnceType* once, void (*init_func)()). +-// This function, when invoked multiple times given the same ProtobufOnceType +-// object, will invoke init_func on the first call only, and will make sure +-// none of the calls return before that first call to init_func has finished. +-// * The user can provide a parameter which GoogleOnceInit() forwards to the +-// user-provided function when it is called. 
Usage example: +-// int a = 10; +-// GoogleOnceInit(&my_once, &MyFunctionExpectingIntArgument, &a); +-// * This implementation guarantees that ProtobufOnceType is a POD (i.e. no +-// static initializer generated). +-// +-// This implements a way to perform lazy initialization. It's more efficient +-// than using mutexes as no lock is needed if initialization has already +-// happened. +-// +-// Example usage: +-// void Init(); +-// GOOGLE_PROTOBUF_DECLARE_ONCE(once_init); +-// +-// // Calls Init() exactly once. +-// void InitOnce() { +-// GoogleOnceInit(&once_init, &Init); +-// } +-// +-// Note that if GoogleOnceInit() is called before main() has begun, it must +-// only be called by the thread that will eventually call main() -- that is, +-// the thread that performs dynamic initialization. In general this is a safe +-// assumption since people don't usually construct threads before main() starts, +-// but it is technically not guaranteed. Unfortunately, Win32 provides no way +-// whatsoever to statically-initialize its synchronization primitives, so our +-// only choice is to assume that dynamic initialization is single-threaded. +- +-#ifndef GOOGLE_PROTOBUF_STUBS_ONCE_H__ +-#define GOOGLE_PROTOBUF_STUBS_ONCE_H__ +- +-#include +-#include +-#include +-#include +- +-namespace google { +-namespace protobuf { +-namespace internal { +- +-using once_flag = std::atomic; +- +-template +-void my_call_once(once_flag& once, Callable&& fn, Args&&... args) { +- enum CallOnceState { +- ONCE_INIT = 0, +- ONCE_RUNNING = 1, +- ONCE_DONE = 2, +- }; +- +- int expected_state = ONCE_INIT; +- if (once.compare_exchange_strong(expected_state, ONCE_RUNNING)) { +- fn(std::forward(args)...); +- once.store(ONCE_DONE); +- return; +- } +- +- if (expected_state == ONCE_DONE) { +- return; +- } +- +- while (once.load() == ONCE_RUNNING) { +- sched_yield(); +- } +-} +- +-template +-void call_once(Args&&... 
args) { +- my_call_once(std::forward(args)...); +-} +-} // namespace internal +- +-// TODO(gerbens) remove this once third_party is fully extracted +-using ProtobufOnceType = internal::once_flag; +- +-inline void GoogleOnceInit(ProtobufOnceType* once, void (*init_func)()) { +- internal::my_call_once(*once, init_func); +-} +- +-template +-inline void GoogleOnceInitArg(ProtobufOnceType* once, void (*init_func)(Arg*), +- Arg* arg) { +- internal::my_call_once(*once, init_func, arg); +-} +- +-class GoogleOnceDynamic { +- public: +- // If this->Init() has not been called before by any thread, +- // execute (*func_with_arg)(arg) then return. +- // Otherwise, wait until that prior invocation has finished +- // executing its function, then return. +- template +- void Init(void (*func_with_arg)(T*), T* arg) { +- GoogleOnceInitArg(&this->state_, func_with_arg, arg); +- } +- +- private: +- ProtobufOnceType state_; +-}; +- +-#define GOOGLE_PROTOBUF_ONCE_TYPE ::google::protobuf::ProtobufOnceType +-#define GOOGLE_PROTOBUF_DECLARE_ONCE(NAME) \ +- ::google::protobuf::ProtobufOnceType NAME +- +-} // namespace protobuf +-} // namespace google +- +-#endif // GOOGLE_PROTOBUF_STUBS_ONCE_H__ +diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py +index fdb5976..cba3b70 100644 +--- a/python/src/sentencepiece/__init__.py ++++ b/python/src/sentencepiece/__init__.py +@@ -87,48 +87,15 @@ class SentencePieceProcessor(object): + def LoadVocabulary(self, filename, threshold): + return _sentencepiece.SentencePieceProcessor_LoadVocabulary(self, filename, threshold) + +- def EncodeAsPieces(self, input): +- return _sentencepiece.SentencePieceProcessor_EncodeAsPieces(self, input) +- +- def EncodeAsIds(self, input): +- return _sentencepiece.SentencePieceProcessor_EncodeAsIds(self, input) +- +- def NBestEncodeAsPieces(self, input, nbest_size): +- return _sentencepiece.SentencePieceProcessor_NBestEncodeAsPieces(self, input, nbest_size) +- +- def NBestEncodeAsIds(self, 
input, nbest_size): +- return _sentencepiece.SentencePieceProcessor_NBestEncodeAsIds(self, input, nbest_size) +- +- def SampleEncodeAsPieces(self, input, nbest_size, alpha): +- return _sentencepiece.SentencePieceProcessor_SampleEncodeAsPieces(self, input, nbest_size, alpha) +- +- def SampleEncodeAsIds(self, input, nbest_size, alpha): +- return _sentencepiece.SentencePieceProcessor_SampleEncodeAsIds(self, input, nbest_size, alpha) +- + def SampleEncodeAndScoreAsPieces(self, input, num_samples, theta, wor, include_best): + return _sentencepiece.SentencePieceProcessor_SampleEncodeAndScoreAsPieces(self, input, num_samples, theta, wor, include_best) + + def SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, include_best): + return _sentencepiece.SentencePieceProcessor_SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, include_best) + +- def DecodePieces(self, pieces): +- return _sentencepiece.SentencePieceProcessor_DecodePieces(self, pieces) +- + def CalculateEntropy(self, text, theta): + return _sentencepiece.SentencePieceProcessor_CalculateEntropy(self, text, theta) + +- def EncodeAsSerializedProto(self, input): +- return _sentencepiece.SentencePieceProcessor_EncodeAsSerializedProto(self, input) +- +- def SampleEncodeAsSerializedProto(self, input, nbest_size, alpha): +- return _sentencepiece.SentencePieceProcessor_SampleEncodeAsSerializedProto(self, input, nbest_size, alpha) +- +- def NBestEncodeAsSerializedProto(self, input, nbest_size): +- return _sentencepiece.SentencePieceProcessor_NBestEncodeAsSerializedProto(self, input, nbest_size) +- +- def DecodePiecesAsSerializedProto(self, pieces): +- return _sentencepiece.SentencePieceProcessor_DecodePiecesAsSerializedProto(self, pieces) +- + def GetPieceSize(self): + return _sentencepiece.SentencePieceProcessor_GetPieceSize(self) + +@@ -171,30 +138,69 @@ class SentencePieceProcessor(object): + def LoadFromFile(self, arg): + return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg) + +- 
def DecodeIdsWithCheck(self, ids): +- return _sentencepiece.SentencePieceProcessor_DecodeIdsWithCheck(self, ids) ++ def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ def _EncodeAsPieces(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__EncodeAsPieces(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ def _EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ def _EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ def _EncodeAsPiecesBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__EncodeAsPiecesBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ def _EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + +- def DecodeIdsAsSerializedProtoWithCheck(self, ids): +- return 
_sentencepiece.SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck(self, ids) ++ def _DecodeIds(self, ids): ++ return _sentencepiece.SentencePieceProcessor__DecodeIds(self, ids) + +- def _EncodeAsIds(self, text, enabele_sampling, nbest_size, alpha, add_bos, add_eos, reverse): +- return _sentencepiece.SentencePieceProcessor__EncodeAsIds(self, text, enabele_sampling, nbest_size, alpha, add_bos, add_eos, reverse) ++ def _DecodePieces(self, pieces): ++ return _sentencepiece.SentencePieceProcessor__DecodePieces(self, pieces) + +- def _EncodeAsPieces(self, text, enabele_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): +- return _sentencepiece.SentencePieceProcessor__EncodeAsPieces(self, text, enabele_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ def _DecodeIdsAsSerializedProto(self, ids): ++ return _sentencepiece.SentencePieceProcessor__DecodeIdsAsSerializedProto(self, ids) + +- def _NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse): +- return _sentencepiece.SentencePieceProcessor__NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse) ++ def _DecodePiecesAsSerializedProto(self, pieces): ++ return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProto(self, pieces) ++ ++ def _DecodeIdsBatch(self, ins, num_threads): ++ return _sentencepiece.SentencePieceProcessor__DecodeIdsBatch(self, ins, num_threads) ++ ++ def _DecodeIdsAsSerializedProtoBatch(self, ins, num_threads): ++ return _sentencepiece.SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch(self, ins, num_threads) ++ ++ def _DecodePiecesBatch(self, ins, num_threads): ++ return _sentencepiece.SentencePieceProcessor__DecodePiecesBatch(self, ins, num_threads) ++ ++ def _DecodePiecesAsSerializedProtoBatch(self, ins, num_threads): ++ return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(self, ins, num_threads) ++ ++ def _NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, 
emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + + def _NBestEncodeAsPieces(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__NBestEncodeAsPieces(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + +- def _SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse): +- return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse) ++ def _NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ def _SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + + def _SampleEncodeAndScoreAsPieces(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsPieces(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + ++ def _CalculateEntropy(self, text, theta): ++ return _sentencepiece.SentencePieceProcessor__CalculateEntropy(self, text, theta) ++ ++ def _CalculateEntropyBatch(self, ins, theta, num_threads): ++ return _sentencepiece.SentencePieceProcessor__CalculateEntropyBatch(self, ins, theta, num_threads) ++ + def Init(self, + model_file=None, + model_proto=None, +@@ -205,7 +211,8 @@ class SentencePieceProcessor(object): + emit_unk_piece=False, + enable_sampling=False, + nbest_size=-1, +- 
alpha=0.1): ++ alpha=0.1, ++ num_threads=1): + """Initialzie sentencepieceProcessor. + + Args: +@@ -225,6 +232,7 @@ class SentencePieceProcessor(object): + forward-filtering-and-backward-sampling algorithm. + alpha: Soothing parameter for unigram sampling, and dropout probability of + merge operations for BPE-dropout. ++ num_threads: number of threads in batch processing. + """ + + _sentencepiece_processor_init_native(self) +@@ -236,6 +244,7 @@ class SentencePieceProcessor(object): + self._enable_sampling = enable_sampling + self._nbest_size = nbest_size + self._alpha = alpha ++ self._num_threads = num_threads + if model_file or model_proto: + self.Load(model_file=model_file, model_proto=model_proto) + +@@ -249,7 +258,8 @@ class SentencePieceProcessor(object): + emit_unk_piece=None, + enable_sampling=None, + nbest_size=None, +- alpha=None): ++ alpha=None, ++ num_threads=None): + """Encode text input to segmented ids or tokens. + + Args: +@@ -268,6 +278,7 @@ class SentencePieceProcessor(object): + forward-filtering-and-backward-sampling algorithm. + alpha: Soothing parameter for unigram sampling, and merge probability for + BPE-dropout (probablity 'p' in BPE-dropout paper). ++ num_threads: the number of threads used in the batch processin (Default = 1). + """ + + if out_type is None: +@@ -286,6 +297,8 @@ class SentencePieceProcessor(object): + nbest_size = self._nbest_size + if alpha is None: + alpha = self._alpha ++ if num_threads is None: ++ num_threads = self._num_threads + + if enable_sampling == True and (nbest_size is None or nbest_size == 0 or + nbest_size == 1 or alpha is None): +@@ -296,18 +309,59 @@ class SentencePieceProcessor(object): + 'instead of nbest segmentations.' 
+ ) + +- def _encode(text): +- if out_type is int: +- return self._EncodeAsIds(text, enable_sampling, nbest_size, +- alpha, add_bos, add_eos, reverse) +- else: +- return self._EncodeAsPieces(text, enable_sampling, nbest_size, +- alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if num_threads is None or type(num_threads) is not int: ++ raise RuntimeError('num_threads must be int') + + if type(input) is list: +- return [_encode(n) for n in input] ++ if out_type is int: ++ return self._EncodeAsIdsBatch(input, num_threads, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type is str: ++ return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'proto': ++ return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ if out_type is int: ++ return self._EncodeAsIds(input, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type is str: ++ return self._EncodeAsPieces(input, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'proto': ++ return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ raise RuntimeError('unknown out_type={}'.format(out_type)) ++ return None + +- return _encode(input) ++ ++ def EncodeAsPieces(self, input, **kwargs): ++ return self.Encode(input=input, out_type=str, **kwargs) ++ ++ ++ def EncodeAsIds(self, input, **kwargs): ++ return self.Encode(input=input, out_type=int, **kwargs) ++ ++ ++ def EncodeAsSerializedProto(self, input, **kwargs): ++ return self.Encode(input=input, out_type='proto', **kwargs) ++ ++ ++ def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs): ++ return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, ++ 
out_type=str, enable_sampling=True, **kwargs) ++ ++ ++ def SampleEncodeAsIds(self, input, nbest_size=None, alpha=None,**kwargs): ++ return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, ++ out_type=int, enable_sampling=True, **kwargs) ++ ++ ++ def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs): ++ return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, ++ out_type='proto', enable_sampling=True, **kwargs) + + + def NBestEncode(self, +@@ -348,9 +402,14 @@ class SentencePieceProcessor(object): + + def _encode(text): + if out_type is int: +- return self._NBestEncodeAsIds(text, nbest_size, add_bos, add_eos, reverse) +- else: +- return self._NBestEncodeAsPieces(text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) ++ return self._NBestEncodeAsIds(text, nbest_size, ++ add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type is str: ++ return self._NBestEncodeAsPieces(text, nbest_size, ++ add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'proto': ++ return self._NBestEncodeAsSerializedProto(text, nbest_size, ++ add_bos, add_eos, reverse, emit_unk_piece) + + if type(input) is list: + return [_encode(n) for n in input] +@@ -358,6 +417,21 @@ class SentencePieceProcessor(object): + return _encode(input) + + ++ def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): ++ return self.NBestEncode(input=input, nbest_size=nbest_size, ++ out_type=str, **kwargs) ++ ++ ++ def NBestEncodeAsIds(self, input, nbest_size=None, **kwargs): ++ return self.NBestEncode(input=input, nbest_size=nbest_size, ++ out_type=int, **kwargs) ++ ++ ++ def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs): ++ return self.NBestEncode(input=input, nbest_size=nbest_size, ++ out_type='proto', **kwargs) ++ ++ + def SampleEncodeAndScore(self, + input, + out_type=None, +@@ -373,7 +447,7 @@ class SentencePieceProcessor(object): + + Args: + input: input string. accepsts list of string. +- out_type: output type. 
int or str. ++ out_type: output type. int or str or 'proto'. + add_bos: Add to the result (Default = false) + add_eos: Add to the result (Default = false) / is added after reversing (if enabled). + reverse: Reverses the tokenized sequence (Default = false) +@@ -413,7 +487,7 @@ class SentencePieceProcessor(object): + def _encode(text): + if out_type is int: + return self._SampleEncodeAndScoreAsIds(text, num_samples, theta, wor, include_best, +- add_bos, add_eos, reverse) ++ add_bos, add_eos, reverse, emit_unk_piece) + else: + return self._SampleEncodeAndScoreAsPieces(text, num_samples, theta, wor, include_best, + add_bos, add_eos, reverse, emit_unk_piece) +@@ -424,35 +498,90 @@ class SentencePieceProcessor(object): + return _encode(input) + + +- def Decode(self, input): +- """Decode processed id or token sequences.""" ++ def Decode(self, input, out_type=str, num_threads=None): ++ """Decode processed id or token sequences. ++ ++ Args: ++ out_type: output type. str or 'proto' (Default = str) ++ num_threads: the number of threads used in the batch processin (Default = 1). 
++ """ ++ ++ if num_threads is None: ++ num_threads = self._num_threads ++ ++ if num_threads is None or type(num_threads) is not int: ++ raise RuntimeError('num_threads must be int') + + if not input: +- return self.DecodeIds([]) +- elif type(input) is int: +- return self.DecodeIdsWithCheck([input]) +- elif type(input) is str: +- return self.DecodePieces([input]) ++ return '' ++ ++ if out_type is str: ++ if type(input) is int: ++ return self._DecodeIds([input]) ++ if type(input) is str: ++ return self._DecodePieces([input]) ++ ++ if type(input) is list: ++ if len(input) == 0 or type(input[0]) is int: ++ return self._DecodeIds(input) ++ if type(input[0]) is str: ++ return self._DecodePieces(input) ++ ++ if type(input[0]) is list: ++ if len(input[0]) == 0 or type(input[0][0]) is int: ++ return self._DecodeIdsBatch(input, num_threads) ++ if type(input[0][0]) is str: ++ return self._DecodePiecesBatch(input, num_threads) ++ ++ if out_type == 'proto': ++ if type(input) is int: ++ return self._DecodeIdsAsSerializedProto([input]) ++ if type(input) is str: ++ return self._DecodePiecesAsSerializedProto([input]) ++ ++ if type(input) is list: ++ if len(input) == 0 or type(input[0]) is int: ++ return self._DecodeIdsAsSerializedProto(input) ++ if type(input[0]) is str: ++ return self._DecodePiecesAsSerializedProto(input) ++ ++ if type(input[0]) is list: ++ if len(input[0]) == 0 or type(input[0][0]) is int: ++ return self._DecodeIdsAsSerializedProtoBatch(input, num_threads) ++ if type(input[0][0]) is str: ++ return self._DecodePiecesAsSerializedProtoBatch(input, num_threads) ++ ++ ++ raise RuntimeError('unknown output or input type') ++ return None + +- def _decode(input): +- if not input: +- return self.DecodeIds([]) +- if type(input[0]) is int: +- return self.DecodeIdsWithCheck(input) +- return self.DecodePieces(input) + +- if type(input[0]) is list: +- return [_decode(n) for n in input] ++ def DecodePieces(self, input, out_type=str, **kwargs): ++ return 
self.Decode(input=input, out_type=out_type, **kwargs) + +- return _decode(input) + ++ def DecodeIds(self, input, out_type=str, **kwargs): ++ return self.Decode(input=input, out_type=out_type, **kwargs) ++ ++ ++ def DecodePiecesAsSerializedProto(self, input, out_type='proto', **kwargs): ++ return self.Decode(input=input, out_type=out_type, **kwargs) + +- def Entropy(self, input, theta): +- """Calculate sentence entropy""" + ++ def DecodeIdsAsSerializedProto(self, input, out_type='proto', **kwargs): ++ return self.Decode(input=input, out_type=out_type, **kwargs) ++ ++ ++ def CalculateEntropy(self, input, theta, num_threads=None): ++ """Calculate sentence entropy""" + if type(input) is list: +- return [self.CalculateEntropy(n, theta) for n in input] +- return self.CalculateEntropy(input, theta) ++ if num_threads is None: ++ num_threads = self._num_threads ++ if num_threads is None or type(num_threads) is not int: ++ raise RuntimeError('num_threads must be int') ++ return self._CalculateEntropyBatch(input, theta, num_threads) ++ ++ return self._CalculateEntropy(input, theta) + + + def piece_size(self): +@@ -642,8 +771,6 @@ setattr(SentencePieceProcessor, '__init__', SentencePieceProcessor.Init) + + SentencePieceProcessor.Tokenize = SentencePieceProcessor.Encode + SentencePieceProcessor.Detokenize = SentencePieceProcessor.Decode +-SentencePieceProcessor.DecodeIds = SentencePieceProcessor.DecodeIdsWithCheck +-SentencePieceProcessor.DecodeIdsAsSerializedProto = SentencePieceProcessor.DecodeIdsAsSerializedProtoWithCheck + + for m in [ + 'PieceToId', 'IdToPiece', 'GetScore', 'IsUnknown', 'IsControl', 'IsUnused', +diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i +index 21bb7cf..3a822bc 100644 +--- a/python/src/sentencepiece/sentencepiece.i ++++ b/python/src/sentencepiece/sentencepiece.i +@@ -2,9 +2,13 @@ + %include exception.i + + %{ ++#include + #include ++#include + #include + #include ++#include ++#include + #include + 
#include + +@@ -12,6 +16,8 @@ namespace { + PyObject* kUnicodeInput = reinterpret_cast(0x1); + PyObject* kByteInput = reinterpret_cast(0x2); + ++using BytesArray = std::vector; ++ + inline void ReleaseResultObject(PyObject *obj) { + if (obj != nullptr && obj != kUnicodeInput && obj != kByteInput) { + Py_XDECREF(obj); +@@ -54,7 +60,7 @@ PyObject* MakePyOutputString(const std::string& output, + return PyBytes_FromStringAndSize(output.data(), output.size()); + } + +-PyObject* MakePyOutputBytes(const std::string& output) { ++PyObject* MakePyOutputBytes(const sentencepiece::util::bytes& output) { + return PyBytes_FromStringAndSize(output.data(), output.size()); + } + +@@ -126,18 +132,18 @@ class PySentenceIterator : public sentencepiece::SentenceIterator { + sentencepiece::util::Status status_; + }; + +-void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, +- std::vector *ids, +- bool add_bos, bool add_eos, bool reverse) { ++inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, ++ std::vector *ids, ++ bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { + if (!add_bos && !add_eos && !reverse) return; + if (reverse) std::reverse(ids->begin(), ids->end()); + if (add_bos) ids->insert(ids->begin(), sp.bos_id()); + if (add_eos) ids->push_back(sp.eos_id()); + } + +-void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, +- std::vector *pieces, +- bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { ++inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, ++ std::vector *pieces, ++ bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { + if (!add_bos && !add_eos && !reverse && !emit_unk_piece) return; + if (reverse) std::reverse(pieces->begin(), pieces->end()); + if (add_bos) pieces->insert(pieces->begin(), sp.IdToPiece(sp.bos_id())); +@@ -152,6 +158,98 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + } + } + } ++ ++inline void RewriteIds(const 
sentencepiece::SentencePieceProcessor &sp, ++ sentencepiece::util::bytes *proto, ++ bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { ++ if (add_bos || add_eos || reverse || emit_unk_piece) { ++ throw sentencepiece::util::Status( ++ sentencepiece::util::StatusCode::kUnimplemented, ++ "add_bos, add_eos, reverse, and emit_unk_piece is not supported in AsSerialize API"); ++ } ++} ++ ++inline void CheckIds(const std::vector &ids, int num_pieces) { ++ for (int id : ids) { ++ if (id < 0 || id >= num_pieces) { ++ throw sentencepiece::util::Status( ++ sentencepiece::util::StatusCode::kOutOfRange, ++ "piece id is out of range."); ++ } ++ } ++} ++ ++inline void CheckIds(const std::vector &ids, int num_pieces) {} ++ ++class ThreadPool { ++ public: ++ explicit ThreadPool(size_t request_size) : ++ request_size_(request_size) {} ++ ++ virtual ~ThreadPool() { ++ for (auto &task : tasks_) { ++ task.join(); ++ } ++ } ++ ++ void Schedule(std::function closure) { ++ static constexpr size_t kMinThreadSize = 2; ++ if (request_size_ < kMinThreadSize) { ++ closure(); ++ } else { ++ tasks_.emplace_back(closure); ++ } ++ } ++ ++ private: ++ size_t request_size_ = 0; ++ std::vector tasks_; ++}; ++ ++template ++inline void InitNumThreads(const std::vector &ins, int *num_threads) { ++ *num_threads = std::max(1, ++ std::min({*num_threads, ++ static_cast(ins.size()), 256})); ++} ++ ++#define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ ++ std::vector outs(ins.size()); \ ++ InitNumThreads(ins, &num_threads); \ ++ { \ ++ ThreadPool pool(ins.size()); \ ++ for (int n = 0; n < num_threads; ++n) { \ ++ pool.Schedule([&, n]() { \ ++ for (size_t i = n; i < ins.size(); i += num_threads) { \ ++ auto out = enable_sampling ? 
\ ++ self->Sample##FuncName(ins[i], \ ++ nbest_size, alpha) : \ ++ self->FuncName(ins[i]); \ ++ RewriteIds(*self, &out, add_bos, add_eos, reverse, \ ++ emit_unk_piece); \ ++ outs[i] = std::move(out); \ ++ } \ ++ }); \ ++ } \ ++ } \ ++ return outs; ++ ++#define DEFINE_DECODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ ++ std::vector outs(ins.size()); \ ++ InitNumThreads(ins, &num_threads); \ ++ { \ ++ ThreadPool pool(ins.size()); \ ++ for (int n = 0; n < num_threads; ++n) { \ ++ pool.Schedule([&, n]() { \ ++ for (size_t i = n; i < ins.size(); i += num_threads) { \ ++ CheckIds(ins[i], self->GetPieceSize()); \ ++ outs[i] = self->FuncName(ins[i]); \ ++ } \ ++ }); \ ++ } \ ++ } \ ++ return outs; ++ + } // namespace + %} + +@@ -171,15 +269,28 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + %ignore sentencepiece::SentencePieceText; + %ignore sentencepiece::NormalizerSpec; + %ignore sentencepiece::TrainerSpec; +- + %ignore sentencepiece::SentencePieceProcessor::status; ++ + %ignore sentencepiece::SentencePieceProcessor::Encode; ++%ignore sentencepiece::SentencePieceProcessor::EncodeAsPieces; ++%ignore sentencepiece::SentencePieceProcessor::EncodeAsIds; ++%ignore sentencepiece::SentencePieceProcessor::EncodeAsSerializedProto; + %ignore sentencepiece::SentencePieceProcessor::SampleEncode; ++%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsIds; ++%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsPieces; ++%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsSerializedProto; + %ignore sentencepiece::SentencePieceProcessor::NBestEncode; ++%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsPieces; ++%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsIds; ++%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsSerializedProto; + %ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScore; ++ + %ignore sentencepiece::SentencePieceProcessor::Decode; + %ignore 
sentencepiece::SentencePieceProcessor::DecodeIds; ++%ignore sentencepiece::SentencePieceProcessor::DecodePieces; ++%ignore sentencepiece::SentencePieceProcessor::DecodePiecesAsSerializedProto; + %ignore sentencepiece::SentencePieceProcessor::DecodeIdsAsSerializedProto; ++ + %ignore sentencepiece::SentencePieceProcessor::model_proto; + %ignore sentencepiece::SentencePieceProcessor::Load; + %ignore sentencepiece::SentencePieceProcessor::LoadOrDie; +@@ -200,62 +311,131 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + return $self->Load(arg); + } + +- std::string DecodeIdsWithCheck( +- const std::vector &ids) const { +- const int num_pieces = $self->GetPieceSize(); +- for (int id : ids) { +- if (id < 0 || id >= num_pieces) { +- throw sentencepiece::util::Status( +- sentencepiece::util::StatusCode::kOutOfRange, +- "piece id is out of range."); +- } +- } +- return $self->DecodeIds(ids); +- } +- +- util::bytes DecodeIdsAsSerializedProtoWithCheck( +- const std::vector &ids) const { +- const int num_pieces = $self->GetPieceSize(); +- for (int id : ids) { +- if (id < 0 || id >= num_pieces) { +- throw sentencepiece::util::Status( +- sentencepiece::util::StatusCode::kOutOfRange, +- "piece id is out of range."); +- } +- } +- return $self->DecodeIdsAsSerializedProto(ids); +- } +- ++ ///////////////////////////////////////////////////////////////////////////// ++ // EncodeAs* (Single request) + std::vector _EncodeAsIds(absl::string_view text, +- bool enabele_sampling, ++ bool enable_sampling, + int nbest_size, float alpha, +- bool add_bos, bool add_eos, bool reverse) { +- auto ids = enabele_sampling ? ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { ++ auto ids = enable_sampling ? 
+ $self->SampleEncodeAsIds(text, nbest_size, alpha) : + $self->EncodeAsIds(text); +- RewriteIds(*$self, &ids, add_bos, add_eos, reverse); ++ RewriteIds(*$self, &ids, add_bos, add_eos, reverse, emit_unk_piece); + return ids; + } + + std::vector _EncodeAsPieces(absl::string_view text, +- bool enabele_sampling, ++ bool enable_sampling, + int nbest_size, float alpha, + bool add_bos, bool add_eos, bool reverse, +- bool emit_unk_piece) { +- auto pieces = enabele_sampling ? ++ bool emit_unk_piece) const { ++ auto pieces = enable_sampling ? + $self->SampleEncodeAsPieces(text, nbest_size, alpha) : + $self->EncodeAsPieces(text); +- RewritePieces(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); ++ RewriteIds(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); + return pieces; + } + ++ sentencepiece::util::bytes _EncodeAsSerializedProto(absl::string_view text, ++ bool enable_sampling, ++ int nbest_size, float alpha, ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { ++ auto proto = enable_sampling ? 
++ $self->SampleEncodeAsSerializedProto(text, nbest_size, alpha) : ++ $self->EncodeAsSerializedProto(text); ++ RewriteIds(*$self, &proto, add_bos, add_eos, reverse, emit_unk_piece); ++ return proto; ++ } ++ ++ ///////////////////////////////////////////////////////////////////////////// ++ // EncodeAs* (Batch request) ++ std::vector> _EncodeAsIdsBatch( ++ const std::vector &ins, int num_threads, ++ bool enable_sampling, int nbest_size, float alpha, ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { ++ DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsIds, ++ absl::string_view, std::vector); ++ } ++ ++ std::vector> _EncodeAsPiecesBatch( ++ const std::vector &ins, int num_threads, ++ bool enable_sampling, int nbest_size, float alpha, ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { ++ DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsPieces, ++ absl::string_view, std::vector); ++ } ++ ++ BytesArray _EncodeAsSerializedProtoBatch( ++ const std::vector &ins, int num_threads, ++ bool enable_sampling, int nbest_size, float alpha, ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { ++ DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsSerializedProto, ++ absl::string_view, ++ sentencepiece::util::bytes); ++ } ++ ++ ///////////////////////////////////////////////////////////////////////////// ++ // DecodeAs* (Single request) ++ std::string _DecodeIds(const std::vector &ids) const { ++ CheckIds(ids, $self->GetPieceSize()); ++ return $self->DecodeIds(ids); ++ } ++ ++ std::string _DecodePieces(const std::vector &pieces) const { ++ return $self->DecodePieces(pieces); ++ } ++ ++ sentencepiece::util::bytes _DecodeIdsAsSerializedProto( ++ const std::vector &ids) const { ++ CheckIds(ids, $self->GetPieceSize()); ++ return $self->DecodeIdsAsSerializedProto(ids); ++ } ++ ++ sentencepiece::util::bytes _DecodePiecesAsSerializedProto( ++ const std::vector &pieces) const { ++ CheckIds(pieces, $self->GetPieceSize()); ++ return 
$self->DecodePiecesAsSerializedProto(pieces); ++ } ++ ++ ///////////////////////////////////////////////////////////////////////////// ++ // DecodeAs* (Batch request) ++ std::vector _DecodeIdsBatch( ++ const std::vector> &ins, int num_threads) const { ++ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string); ++ } ++ ++ BytesArray _DecodeIdsAsSerializedProtoBatch( ++ const std::vector> &ins, int num_threads) const { ++ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsSerializedProto, int, ++ sentencepiece::util::bytes); ++ } ++ ++ std::vector _DecodePiecesBatch( ++ const std::vector> &ins, int num_threads) const { ++ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string); ++ } ++ ++ BytesArray _DecodePiecesAsSerializedProtoBatch( ++ const std::vector> &ins, int num_threads) const { ++ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string, ++ sentencepiece::util::bytes); ++ } ++ ++ //////////////////////////////////////////////////////////////////////////// ++ // NBestEncodeAs* (Single request) + std::vector> + _NBestEncodeAsIds(absl::string_view text, + int nbest_size, +- bool add_bos, bool add_eos, bool reverse) { ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { + auto idss = $self->NBestEncodeAsIds(text, nbest_size); + for (auto &ids : idss) { +- RewriteIds(*$self, &ids, add_bos, add_eos, reverse); ++ RewriteIds(*$self, &ids, add_bos, add_eos, reverse, emit_unk_piece); + } + return idss; + } +@@ -264,40 +444,74 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + _NBestEncodeAsPieces(absl::string_view text, + int nbest_size, + bool add_bos, bool add_eos, bool reverse, +- bool emit_unk_piece) { ++ bool emit_unk_piece) const { + auto piecess = $self->NBestEncodeAsPieces(text, nbest_size); + for (auto &pieces : piecess) { +- RewritePieces(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); ++ RewriteIds(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); + } + return 
piecess; + } + ++ sentencepiece::util::bytes _NBestEncodeAsSerializedProto(absl::string_view text, ++ int nbest_size, ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { ++ RewriteIds(*$self, static_cast(nullptr), ++ add_bos, add_eos, reverse, emit_unk_piece); ++ return $self->NBestEncodeAsSerializedProto(text, nbest_size); ++ } ++ ++ ///////////////////////////////////////////////////////////////////////////// ++ // SampleEncodeAndScoreAs* (Single request) + std::vector, float>> + _SampleEncodeAndScoreAsIds(absl::string_view text, + int num_samples, float theta, bool wor, + bool include_best, +- bool add_bos, bool add_eos, bool reverse) { ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { + auto idss = $self->SampleEncodeAndScoreAsIds(text, num_samples, + theta, wor, include_best); + for (auto &ids : idss) { +- RewriteIds(*$self, &ids.first, add_bos, add_eos, reverse); ++ RewriteIds(*$self, &ids.first, add_bos, add_eos, reverse, emit_unk_piece); + } + return idss; + } + +- std::vector, float>> ++ std::vector, float>> + _SampleEncodeAndScoreAsPieces(absl::string_view text, + int num_samples, float theta, bool wor, + bool include_best, + bool add_bos, bool add_eos, bool reverse, +- bool emit_unk_piece) { ++ bool emit_unk_piece) const { + auto piecess = $self->SampleEncodeAndScoreAsPieces(text, num_samples, + theta, wor, include_best); + for (auto &pieces : piecess) { +- RewritePieces(*$self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); ++ RewriteIds(*$self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); + } + return piecess; +- } ++ } ++ ++ // Calculate Entropy ++ float _CalculateEntropy(absl::string_view text, float theta) { ++ return $self->CalculateEntropy(text, theta); ++ } ++ ++ std::vector _CalculateEntropyBatch(const std::vector &ins, ++ float theta, int num_threads) { ++ std::vector outs(ins.size()); ++ InitNumThreads(ins, &num_threads); ++ { ++ ThreadPool pool(ins.size()); ++ for 
(int n = 0; n < num_threads; ++n) { ++ pool.Schedule([&, n]() { ++ for (size_t i = n; i < ins.size(); i += num_threads) { ++ outs[i] = self->CalculateEntropy(ins[i], theta); ++ } ++ }); ++ } ++ } ++ return outs; ++ } + + %pythoncode { + def Init(self, +@@ -310,7 +524,8 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + emit_unk_piece=False, + enable_sampling=False, + nbest_size=-1, +- alpha=0.1): ++ alpha=0.1, ++ num_threads=1): + """Initialzie sentencepieceProcessor. + + Args: +@@ -330,6 +545,7 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + forward-filtering-and-backward-sampling algorithm. + alpha: Soothing parameter for unigram sampling, and dropout probability of + merge operations for BPE-dropout. ++ num_threads: number of threads in batch processing. + """ + + _sentencepiece_processor_init_native(self) +@@ -341,6 +557,7 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + self._enable_sampling = enable_sampling + self._nbest_size = nbest_size + self._alpha = alpha ++ self._num_threads = num_threads + if model_file or model_proto: + self.Load(model_file=model_file, model_proto=model_proto) + +@@ -354,7 +571,8 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + emit_unk_piece=None, + enable_sampling=None, + nbest_size=None, +- alpha=None): ++ alpha=None, ++ num_threads=None): + """Encode text input to segmented ids or tokens. + + Args: +@@ -373,6 +591,7 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + forward-filtering-and-backward-sampling algorithm. + alpha: Soothing parameter for unigram sampling, and merge probability for + BPE-dropout (probablity 'p' in BPE-dropout paper). ++ num_threads: the number of threads used in the batch processin (Default = 1). 
+ """ + + if out_type is None: +@@ -391,6 +610,8 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + nbest_size = self._nbest_size + if alpha is None: + alpha = self._alpha ++ if num_threads is None: ++ num_threads = self._num_threads + + if enable_sampling == True and (nbest_size is None or nbest_size == 0 or + nbest_size == 1 or alpha is None): +@@ -401,18 +622,59 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + 'instead of nbest segmentations.' + ) + +- def _encode(text): +- if out_type is int: +- return self._EncodeAsIds(text, enable_sampling, nbest_size, +- alpha, add_bos, add_eos, reverse) +- else: +- return self._EncodeAsPieces(text, enable_sampling, nbest_size, +- alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if num_threads is None or type(num_threads) is not int: ++ raise RuntimeError('num_threads must be int') + + if type(input) is list: +- return [_encode(n) for n in input] ++ if out_type is int: ++ return self._EncodeAsIdsBatch(input, num_threads, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type is str: ++ return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'proto': ++ return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ if out_type is int: ++ return self._EncodeAsIds(input, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type is str: ++ return self._EncodeAsPieces(input, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'proto': ++ return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ raise RuntimeError('unknown out_type={}'.format(out_type)) ++ return None + +- return _encode(input) ++ ++ 
def EncodeAsPieces(self, input, **kwargs): ++ return self.Encode(input=input, out_type=str, **kwargs) ++ ++ ++ def EncodeAsIds(self, input, **kwargs): ++ return self.Encode(input=input, out_type=int, **kwargs) ++ ++ ++ def EncodeAsSerializedProto(self, input, **kwargs): ++ return self.Encode(input=input, out_type='proto', **kwargs) ++ ++ ++ def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs): ++ return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, ++ out_type=str, enable_sampling=True, **kwargs) ++ ++ ++ def SampleEncodeAsIds(self, input, nbest_size=None, alpha=None,**kwargs): ++ return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, ++ out_type=int, enable_sampling=True, **kwargs) ++ ++ ++ def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs): ++ return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, ++ out_type='proto', enable_sampling=True, **kwargs) + + + def NBestEncode(self, +@@ -453,9 +715,14 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + + def _encode(text): + if out_type is int: +- return self._NBestEncodeAsIds(text, nbest_size, add_bos, add_eos, reverse) +- else: +- return self._NBestEncodeAsPieces(text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) ++ return self._NBestEncodeAsIds(text, nbest_size, ++ add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type is str: ++ return self._NBestEncodeAsPieces(text, nbest_size, ++ add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'proto': ++ return self._NBestEncodeAsSerializedProto(text, nbest_size, ++ add_bos, add_eos, reverse, emit_unk_piece) + + if type(input) is list: + return [_encode(n) for n in input] +@@ -463,6 +730,21 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + return _encode(input) + + ++ def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): ++ return self.NBestEncode(input=input, nbest_size=nbest_size, ++ out_type=str, 
**kwargs) ++ ++ ++ def NBestEncodeAsIds(self, input, nbest_size=None, **kwargs): ++ return self.NBestEncode(input=input, nbest_size=nbest_size, ++ out_type=int, **kwargs) ++ ++ ++ def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs): ++ return self.NBestEncode(input=input, nbest_size=nbest_size, ++ out_type='proto', **kwargs) ++ ++ + def SampleEncodeAndScore(self, + input, + out_type=None, +@@ -478,7 +760,7 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + + Args: + input: input string. accepsts list of string. +- out_type: output type. int or str. ++ out_type: output type. int or str or 'proto'. + add_bos: Add to the result (Default = false) + add_eos: Add to the result (Default = false) / is added after reversing (if enabled). + reverse: Reverses the tokenized sequence (Default = false) +@@ -513,12 +795,12 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + + if include_best and not wor: + raise RuntimeError('When include_best is True, We must specify "wor = True".') +- ++ + + def _encode(text): + if out_type is int: + return self._SampleEncodeAndScoreAsIds(text, num_samples, theta, wor, include_best, +- add_bos, add_eos, reverse) ++ add_bos, add_eos, reverse, emit_unk_piece) + else: + return self._SampleEncodeAndScoreAsPieces(text, num_samples, theta, wor, include_best, + add_bos, add_eos, reverse, emit_unk_piece) +@@ -529,35 +811,90 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + return _encode(input) + + +- def Decode(self, input): +- """Decode processed id or token sequences.""" ++ def Decode(self, input, out_type=str, num_threads=None): ++ """Decode processed id or token sequences. ++ ++ Args: ++ out_type: output type. str or 'proto' (Default = str) ++ num_threads: the number of threads used in the batch processin (Default = 1). 
++ """ ++ ++ if num_threads is None: ++ num_threads = self._num_threads ++ ++ if num_threads is None or type(num_threads) is not int: ++ raise RuntimeError('num_threads must be int') + + if not input: +- return self.DecodeIds([]) +- elif type(input) is int: +- return self.DecodeIdsWithCheck([input]) +- elif type(input) is str: +- return self.DecodePieces([input]) ++ return '' ++ ++ if out_type is str: ++ if type(input) is int: ++ return self._DecodeIds([input]) ++ if type(input) is str: ++ return self._DecodePieces([input]) ++ ++ if type(input) is list: ++ if len(input) == 0 or type(input[0]) is int: ++ return self._DecodeIds(input) ++ if type(input[0]) is str: ++ return self._DecodePieces(input) ++ ++ if type(input[0]) is list: ++ if len(input[0]) == 0 or type(input[0][0]) is int: ++ return self._DecodeIdsBatch(input, num_threads) ++ if type(input[0][0]) is str: ++ return self._DecodePiecesBatch(input, num_threads) ++ ++ if out_type == 'proto': ++ if type(input) is int: ++ return self._DecodeIdsAsSerializedProto([input]) ++ if type(input) is str: ++ return self._DecodePiecesAsSerializedProto([input]) ++ ++ if type(input) is list: ++ if len(input) == 0 or type(input[0]) is int: ++ return self._DecodeIdsAsSerializedProto(input) ++ if type(input[0]) is str: ++ return self._DecodePiecesAsSerializedProto(input) ++ ++ if type(input[0]) is list: ++ if len(input[0]) == 0 or type(input[0][0]) is int: ++ return self._DecodeIdsAsSerializedProtoBatch(input, num_threads) ++ if type(input[0][0]) is str: ++ return self._DecodePiecesAsSerializedProtoBatch(input, num_threads) ++ ++ ++ raise RuntimeError('unknown output or input type') ++ return None + +- def _decode(input): +- if not input: +- return self.DecodeIds([]) +- if type(input[0]) is int: +- return self.DecodeIdsWithCheck(input) +- return self.DecodePieces(input) + +- if type(input[0]) is list: +- return [_decode(n) for n in input] ++ def DecodePieces(self, input, out_type=str, **kwargs): ++ return 
self.Decode(input=input, out_type=out_type, **kwargs) + +- return _decode(input) + ++ def DecodeIds(self, input, out_type=str, **kwargs): ++ return self.Decode(input=input, out_type=out_type, **kwargs) ++ ++ ++ def DecodePiecesAsSerializedProto(self, input, out_type='proto', **kwargs): ++ return self.Decode(input=input, out_type=out_type, **kwargs) + +- def Entropy(self, input, theta): +- """Calculate sentence entropy""" + ++ def DecodeIdsAsSerializedProto(self, input, out_type='proto', **kwargs): ++ return self.Decode(input=input, out_type=out_type, **kwargs) ++ ++ ++ def CalculateEntropy(self, input, theta, num_threads=None): ++ """Calculate sentence entropy""" + if type(input) is list: +- return [self.CalculateEntropy(n, theta) for n in input] +- return self.CalculateEntropy(input, theta) ++ if num_threads is None: ++ num_threads = self._num_threads ++ if num_threads is None or type(num_threads) is not int: ++ raise RuntimeError('num_threads must be int') ++ return self._CalculateEntropyBatch(input, theta, num_threads) ++ ++ return self._CalculateEntropy(input, theta) + + + def piece_size(self): +@@ -696,6 +1033,13 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + } + } + ++%typemap(out) std::vector { ++ $result = PyList_New($1.size()); ++ for (size_t i = 0; i < $1.size(); ++i) { ++ PyList_SetItem($result, i, PyFloat_FromDouble(static_cast($1[i]))); ++ } ++} ++ + %typemap(out) std::vector> { + $result = PyList_New($1.size()); + for (size_t i = 0; i < $1.size(); ++i) { +@@ -715,6 +1059,13 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + } + } + ++%typemap(out) BytesArray { ++ $result = PyList_New($1.size()); ++ for (size_t i = 0; i < $1.size(); ++i) { ++ PyList_SetItem($result, i, MakePyOutputBytes($1[i])); ++ } ++} ++ + %typemap(out) std::vector> { + PyObject *input_type = resultobj; + $result = PyList_New($1.size()); +@@ -778,7 +1129,51 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + for 
(size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem($input, i)); + if (ustring.IsAvalable()) { +- (*out)[i] = std::string(ustring.data(), ustring.size()); ++ (*out)[i].assign(ustring.data(), ustring.size()); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ $1 = out; ++} ++ ++%typemap(in) const std::vector& { ++ std::vector *out = nullptr; ++ if (PyList_Check($input)) { ++ const size_t size = PyList_Size($input); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ const PyInputString ustring(PyList_GetItem($input, i)); ++ if (ustring.IsAvalable()) { ++ (*out)[i] = absl::string_view(ustring.data(), ustring.size()); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ $1 = out; ++} ++ ++%typemap(in) const std::vector& { ++ std::vector *out = nullptr; ++ if (PyList_Check($input)) { ++ const size_t size = PyList_Size($input); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ const PyInputString ustring(PyList_GetItem($input, i)); ++ if (ustring.IsAvalable()) { ++ (*out)[i] = absl::string_view(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; +@@ -813,6 +1208,69 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + $1 = out; + } + ++%typemap(in) const std::vector>& { ++ std::vector> *out = nullptr; ++ if (PyList_Check($input)) { ++ const size_t size = PyList_Size($input); ++ out = new std::vector>(size); ++ for (size_t i = 0; i < size; ++i) { ++ PyObject *o = PyList_GetItem($input, i); ++ if (PyList_Check(o)) { ++ const size_t size2 = PyList_Size(o); ++ 
(*out)[i].resize(size2); ++ for (size_t j = 0; j < size2; ++j) { ++ const PyInputString ustring(PyList_GetItem(o, j)); ++ if (ustring.IsAvalable()) { ++ (*out)[i][j].assign(ustring.data(), ustring.size()); ++ } else { ++ PyErr_SetString(PyExc_TypeError,"list must contain integers"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); ++ SWIG_fail; ++ } ++ $1 = out; ++} ++ ++%typemap(in) const std::vector>& { ++ std::vector> *out = nullptr; ++ if (PyList_Check($input)) { ++ const size_t size = PyList_Size($input); ++ out = new std::vector>(size); ++ for (size_t i = 0; i < size; ++i) { ++ PyObject *o = PyList_GetItem($input, i); ++ if (PyList_Check(o)) { ++ const size_t size2 = PyList_Size(o); ++ (*out)[i].resize(size2); ++ for (size_t j = 0; j < size2; ++j) { ++ PyObject *o2 = PyList_GetItem(o, j); ++ if (PyInt_Check(o2)) { ++ (*out)[i][j] = static_cast(PyInt_AsLong(o2)); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); ++ SWIG_fail; ++ } ++ $1 = out; ++} ++ + %typemap(in) const std::unordered_map & { + std::unordered_map *out = nullptr; + if (PyDict_Check($input)) { +@@ -880,6 +1338,10 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + delete $1; + } + ++%typemap(freearg) const std::vector& { ++ delete $1; ++} ++ + %typemap(freearg) const std::vector>& { + delete $1; + } +@@ -888,6 +1350,10 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + delete $1; + } + ++%typemap(freearg) const std::vector& { ++ delete $1; ++} ++ + %typemap(freearg) const std::vector>& { + delete $1; + } +@@ -948,8 +1414,6 @@ setattr(SentencePieceProcessor, '__init__', SentencePieceProcessor.Init) 
+ + SentencePieceProcessor.Tokenize = SentencePieceProcessor.Encode + SentencePieceProcessor.Detokenize = SentencePieceProcessor.Decode +-SentencePieceProcessor.DecodeIds = SentencePieceProcessor.DecodeIdsWithCheck +-SentencePieceProcessor.DecodeIdsAsSerializedProto = SentencePieceProcessor.DecodeIdsAsSerializedProtoWithCheck + + for m in [ + 'PieceToId', 'IdToPiece', 'GetScore', 'IsUnknown', 'IsControl', 'IsUnused', +diff --git a/python/src/sentencepiece/sentencepiece_wrap.cxx b/python/src/sentencepiece/sentencepiece_wrap.cxx +index 36b3a0e..6df3880 100644 +--- a/python/src/sentencepiece/sentencepiece_wrap.cxx ++++ b/python/src/sentencepiece/sentencepiece_wrap.cxx +@@ -2698,10 +2698,13 @@ SWIGINTERN PyObject *SWIG_PyStaticMethod_New(PyObject *SWIGUNUSEDPARM(self), PyO + #define SWIGTYPE_p_sentencepiece__SentencePieceTrainer swig_types[3] + #define SWIGTYPE_p_std__string swig_types[4] + #define SWIGTYPE_p_std__unordered_mapT_std__string_std__string_t swig_types[5] +-#define SWIGTYPE_p_std__vectorT_int_t swig_types[6] +-#define SWIGTYPE_p_std__vectorT_std__string_t swig_types[7] +-static swig_type_info *swig_types[9]; +-static swig_module_info swig_module = {swig_types, 8, 0, 0, 0, 0}; ++#define SWIGTYPE_p_std__vectorT_absl__string_view_t swig_types[6] ++#define SWIGTYPE_p_std__vectorT_int_t swig_types[7] ++#define SWIGTYPE_p_std__vectorT_std__string_t swig_types[8] ++#define SWIGTYPE_p_std__vectorT_std__vectorT_int_t_t swig_types[9] ++#define SWIGTYPE_p_std__vectorT_std__vectorT_std__string_t_t swig_types[10] ++static swig_type_info *swig_types[12]; ++static swig_module_info swig_module = {swig_types, 11, 0, 0, 0, 0}; + #define SWIG_TypeQuery(name) SWIG_TypeQueryModule(&swig_module, &swig_module, name) + #define SWIG_MangledTypeQuery(name) SWIG_MangledTypeQueryModule(&swig_module, &swig_module, name) + +@@ -2805,9 +2808,13 @@ namespace swig { + } + + ++#include + #include ++#include + #include + #include ++#include ++#include + #include + #include + +@@ -2815,6 
+2822,8 @@ namespace { + PyObject* kUnicodeInput = reinterpret_cast(0x1); + PyObject* kByteInput = reinterpret_cast(0x2); + ++using BytesArray = std::vector; ++ + inline void ReleaseResultObject(PyObject *obj) { + if (obj != nullptr && obj != kUnicodeInput && obj != kByteInput) { + Py_XDECREF(obj); +@@ -2857,7 +2866,7 @@ PyObject* MakePyOutputString(const std::string& output, + return PyBytes_FromStringAndSize(output.data(), output.size()); + } + +-PyObject* MakePyOutputBytes(const std::string& output) { ++PyObject* MakePyOutputBytes(const sentencepiece::util::bytes& output) { + return PyBytes_FromStringAndSize(output.data(), output.size()); + } + +@@ -2929,18 +2938,18 @@ class PySentenceIterator : public sentencepiece::SentenceIterator { + sentencepiece::util::Status status_; + }; + +-void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, +- std::vector *ids, +- bool add_bos, bool add_eos, bool reverse) { ++inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, ++ std::vector *ids, ++ bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { + if (!add_bos && !add_eos && !reverse) return; + if (reverse) std::reverse(ids->begin(), ids->end()); + if (add_bos) ids->insert(ids->begin(), sp.bos_id()); + if (add_eos) ids->push_back(sp.eos_id()); + } + +-void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, +- std::vector *pieces, +- bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { ++inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, ++ std::vector *pieces, ++ bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { + if (!add_bos && !add_eos && !reverse && !emit_unk_piece) return; + if (reverse) std::reverse(pieces->begin(), pieces->end()); + if (add_bos) pieces->insert(pieces->begin(), sp.IdToPiece(sp.bos_id())); +@@ -2955,6 +2964,98 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, + } + } + } ++ ++inline void RewriteIds(const 
sentencepiece::SentencePieceProcessor &sp, ++ sentencepiece::util::bytes *proto, ++ bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { ++ if (add_bos || add_eos || reverse || emit_unk_piece) { ++ throw sentencepiece::util::Status( ++ sentencepiece::util::StatusCode::kUnimplemented, ++ "add_bos, add_eos, reverse, and emit_unk_piece is not supported in AsSerialize API"); ++ } ++} ++ ++inline void CheckIds(const std::vector &ids, int num_pieces) { ++ for (int id : ids) { ++ if (id < 0 || id >= num_pieces) { ++ throw sentencepiece::util::Status( ++ sentencepiece::util::StatusCode::kOutOfRange, ++ "piece id is out of range."); ++ } ++ } ++} ++ ++inline void CheckIds(const std::vector &ids, int num_pieces) {} ++ ++class ThreadPool { ++ public: ++ explicit ThreadPool(size_t request_size) : ++ request_size_(request_size) {} ++ ++ virtual ~ThreadPool() { ++ for (auto &task : tasks_) { ++ task.join(); ++ } ++ } ++ ++ void Schedule(std::function closure) { ++ static constexpr size_t kMinThreadSize = 2; ++ if (request_size_ < kMinThreadSize) { ++ closure(); ++ } else { ++ tasks_.emplace_back(closure); ++ } ++ } ++ ++ private: ++ size_t request_size_ = 0; ++ std::vector tasks_; ++}; ++ ++template ++inline void InitNumThreads(const std::vector &ins, int *num_threads) { ++ *num_threads = std::max(1, ++ std::min({*num_threads, ++ static_cast(ins.size()), 256})); ++} ++ ++#define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ ++ std::vector outs(ins.size()); \ ++ InitNumThreads(ins, &num_threads); \ ++ { \ ++ ThreadPool pool(ins.size()); \ ++ for (int n = 0; n < num_threads; ++n) { \ ++ pool.Schedule([&, n]() { \ ++ for (size_t i = n; i < ins.size(); i += num_threads) { \ ++ auto out = enable_sampling ? 
\ ++ self->Sample##FuncName(ins[i], \ ++ nbest_size, alpha) : \ ++ self->FuncName(ins[i]); \ ++ RewriteIds(*self, &out, add_bos, add_eos, reverse, \ ++ emit_unk_piece); \ ++ outs[i] = std::move(out); \ ++ } \ ++ }); \ ++ } \ ++ } \ ++ return outs; ++ ++#define DEFINE_DECODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ ++ std::vector outs(ins.size()); \ ++ InitNumThreads(ins, &num_threads); \ ++ { \ ++ ThreadPool pool(ins.size()); \ ++ for (int n = 0; n < num_threads; ++n) { \ ++ pool.Schedule([&, n]() { \ ++ for (size_t i = n; i < ins.size(); i += num_threads) { \ ++ CheckIds(ins[i], self->GetPieceSize()); \ ++ outs[i] = self->FuncName(ins[i]); \ ++ } \ ++ }); \ ++ } \ ++ } \ ++ return outs; ++ + } // namespace + + +@@ -3334,72 +3435,122 @@ SWIGINTERNINLINE PyObject* + SWIGINTERN sentencepiece::util::Status sentencepiece_SentencePieceProcessor_LoadFromFile(sentencepiece::SentencePieceProcessor *self,absl::string_view arg){ + return self->Load(arg); + } +-SWIGINTERN std::string sentencepiece_SentencePieceProcessor_DecodeIdsWithCheck(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ +- const int num_pieces = self->GetPieceSize(); +- for (int id : ids) { +- if (id < 0 || id >= num_pieces) { +- throw sentencepiece::util::Status( +- sentencepiece::util::StatusCode::kOutOfRange, +- "piece id is out of range."); +- } +- } +- return self->DecodeIds(ids); +- } +-SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ +- const int num_pieces = self->GetPieceSize(); +- for (int id : ids) { +- if (id < 0 || id >= num_pieces) { +- throw sentencepiece::util::Status( +- sentencepiece::util::StatusCode::kOutOfRange, +- "piece id is out of range."); +- } +- } +- return self->DecodeIdsAsSerializedProto(ids); +- } +-SWIGINTERN std::vector< int > 
sentencepiece_SentencePieceProcessor__EncodeAsIds(sentencepiece::SentencePieceProcessor *self,absl::string_view text,bool enabele_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse){ +- auto ids = enabele_sampling ? ++SWIGINTERN std::vector< int > sentencepiece_SentencePieceProcessor__EncodeAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ auto ids = enable_sampling ? + self->SampleEncodeAsIds(text, nbest_size, alpha) : + self->EncodeAsIds(text); +- RewriteIds(*self, &ids, add_bos, add_eos, reverse); ++ RewriteIds(*self, &ids, add_bos, add_eos, reverse, emit_unk_piece); + return ids; + } +-SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__EncodeAsPieces(sentencepiece::SentencePieceProcessor *self,absl::string_view text,bool enabele_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ +- auto pieces = enabele_sampling ? ++SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__EncodeAsPieces(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ auto pieces = enable_sampling ? 
+ self->SampleEncodeAsPieces(text, nbest_size, alpha) : + self->EncodeAsPieces(text); +- RewritePieces(*self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); ++ RewriteIds(*self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); + return pieces; + } +-SWIGINTERN std::vector< std::vector< int > > sentencepiece_SentencePieceProcessor__NBestEncodeAsIds(sentencepiece::SentencePieceProcessor *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse){ ++SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__EncodeAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ auto proto = enable_sampling ? ++ self->SampleEncodeAsSerializedProto(text, nbest_size, alpha) : ++ self->EncodeAsSerializedProto(text); ++ RewriteIds(*self, &proto, add_bos, add_eos, reverse, emit_unk_piece); ++ return proto; ++ } ++SWIGINTERN std::vector< std::vector< int > > sentencepiece_SentencePieceProcessor__EncodeAsIdsBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &ins,int num_threads,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsIds, ++ absl::string_view, std::vector); ++ } ++SWIGINTERN std::vector< std::vector< std::string > > sentencepiece_SentencePieceProcessor__EncodeAsPiecesBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &ins,int num_threads,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsPieces, ++ absl::string_view, std::vector); ++ } ++SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__EncodeAsSerializedProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< 
absl::string_view > const &ins,int num_threads,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsSerializedProto, ++ absl::string_view, ++ sentencepiece::util::bytes); ++ } ++SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodeIds(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ ++ CheckIds(ids, self->GetPieceSize()); ++ return self->DecodeIds(ids); ++ } ++SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodePieces(sentencepiece::SentencePieceProcessor const *self,std::vector< std::string > const &pieces){ ++ return self->DecodePieces(pieces); ++ } ++SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__DecodeIdsAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ ++ CheckIds(ids, self->GetPieceSize()); ++ return self->DecodeIdsAsSerializedProto(ids); ++ } ++SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,std::vector< std::string > const &pieces){ ++ CheckIds(pieces, self->GetPieceSize()); ++ return self->DecodePiecesAsSerializedProto(pieces); ++ } ++SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodeIdsBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< int > > const &ins,int num_threads){ ++ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string); ++ } ++SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< int > > const &ins,int num_threads){ ++ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsSerializedProto, int, ++ sentencepiece::util::bytes); ++ } ++SWIGINTERN std::vector< std::string > 
sentencepiece_SentencePieceProcessor__DecodePiecesBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< std::string > > const &ins,int num_threads){ ++ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string); ++ } ++SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< std::string > > const &ins,int num_threads){ ++ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string, ++ sentencepiece::util::bytes); ++ } ++SWIGINTERN std::vector< std::vector< int > > sentencepiece_SentencePieceProcessor__NBestEncodeAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto idss = self->NBestEncodeAsIds(text, nbest_size); + for (auto &ids : idss) { +- RewriteIds(*self, &ids, add_bos, add_eos, reverse); ++ RewriteIds(*self, &ids, add_bos, add_eos, reverse, emit_unk_piece); + } + return idss; + } +-SWIGINTERN std::vector< std::vector< std::string > > sentencepiece_SentencePieceProcessor__NBestEncodeAsPieces(sentencepiece::SentencePieceProcessor *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++SWIGINTERN std::vector< std::vector< std::string > > sentencepiece_SentencePieceProcessor__NBestEncodeAsPieces(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto piecess = self->NBestEncodeAsPieces(text, nbest_size); + for (auto &pieces : piecess) { +- RewritePieces(*self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); ++ RewriteIds(*self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); + } + return piecess; + } +-SWIGINTERN std::vector< std::pair< std::vector< int >,float > > 
sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(sentencepiece::SentencePieceProcessor *self,absl::string_view text,int num_samples,float theta,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse){ ++SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__NBestEncodeAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ RewriteIds(*self, static_cast(nullptr), ++ add_bos, add_eos, reverse, emit_unk_piece); ++ return self->NBestEncodeAsSerializedProto(text, nbest_size); ++ } ++SWIGINTERN std::vector< std::pair< std::vector< int >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float theta,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto idss = self->SampleEncodeAndScoreAsIds(text, num_samples, + theta, wor, include_best); + for (auto &ids : idss) { +- RewriteIds(*self, &ids.first, add_bos, add_eos, reverse); ++ RewriteIds(*self, &ids.first, add_bos, add_eos, reverse, emit_unk_piece); + } + return idss; + } +-SWIGINTERN std::vector< std::pair< std::vector< std::string >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces(sentencepiece::SentencePieceProcessor *self,absl::string_view text,int num_samples,float theta,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++SWIGINTERN std::vector< std::pair< std::vector< std::string >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float theta,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto piecess = self->SampleEncodeAndScoreAsPieces(text, num_samples, + theta, wor, include_best); + 
for (auto &pieces : piecess) { +- RewritePieces(*self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); ++ RewriteIds(*self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); + } + return piecess; + } ++SWIGINTERN float sentencepiece_SentencePieceProcessor__CalculateEntropy(sentencepiece::SentencePieceProcessor *self,absl::string_view text,float theta){ ++ return self->CalculateEntropy(text, theta); ++ } ++SWIGINTERN std::vector< float > sentencepiece_SentencePieceProcessor__CalculateEntropyBatch(sentencepiece::SentencePieceProcessor *self,std::vector< absl::string_view > const &ins,float theta,int num_threads){ ++ std::vector outs(ins.size()); ++ InitNumThreads(ins, &num_threads); ++ { ++ ThreadPool pool(ins.size()); ++ for (int n = 0; n < num_threads; ++n) { ++ pool.Schedule([&, n]() { ++ for (size_t i = n; i < ins.size(); i += num_threads) { ++ outs[i] = self->CalculateEntropy(ins[i], theta); ++ } ++ }); ++ } ++ } ++ return outs; ++ } + + SWIGINTERN int + SWIG_AsVal_unsigned_SS_long (PyObject *obj, unsigned long *val) +@@ -3703,7 +3854,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetVocabulary(PyObject *SWIGUN + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { +- (*out)[i] = std::string(ustring.data(), ustring.size()); ++ (*out)[i].assign(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; +@@ -3832,19 +3983,31 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; ++ int arg3 ; ++ float arg4 ; ++ bool arg5 ; ++ bool arg6 ; + void 
*argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[2] ; +- std::vector< std::string > result; ++ int val3 ; ++ int ecode3 = 0 ; ++ float val4 ; ++ int ecode4 = 0 ; ++ bool val5 ; ++ int ecode5 = 0 ; ++ bool val6 ; ++ int ecode6 = 0 ; ++ PyObject *swig_obj[6] ; ++ std::vector< std::pair< std::vector< std::string >,float > > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_EncodeAsPieces", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsPieces", 6, 6, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_EncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -3856,9 +4019,29 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsPieces(PyObject *SWIGU + resultobj = ustring.input_type(); + arg2 = absl::string_view(ustring.data(), ustring.size()); + } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "3"" of type '" "int""'"); ++ } ++ arg3 = static_cast< int >(val3); ++ ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "4"" of type '" "float""'"); ++ } ++ arg4 = static_cast< float >(val4); ++ ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ 
SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "5"" of type '" "bool""'"); ++ } ++ arg5 = static_cast< bool >(val5); ++ ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "6"" of type '" "bool""'"); ++ } ++ arg6 = static_cast< bool >(val6); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->EncodeAsPieces(arg2); ++ result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsPieces(arg2,arg3,arg4,arg5,arg6); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -3869,7 +4052,11 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsPieces(PyObject *SWIGU + PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); ++ PyObject *obj = PyList_New(result[i].first.size()); ++ for (size_t j = 0; j < result[i].first.size(); ++j) { ++ PyList_SetItem(obj, j, MakePyOutputString(result[i].first[j], input_type)); ++ } ++ PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); + } + } + return resultobj; +@@ -3878,19 +4065,31 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; ++ int arg3 ; ++ float arg4 ; ++ bool arg5 ; ++ bool arg6 ; + void *argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[2] ; +- 
std::vector< int > result; ++ int val3 ; ++ int ecode3 = 0 ; ++ float val4 ; ++ int ecode4 = 0 ; ++ bool val5 ; ++ int ecode5 = 0 ; ++ bool val6 ; ++ int ecode6 = 0 ; ++ PyObject *swig_obj[6] ; ++ std::vector< std::pair< std::vector< int >,float > > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_EncodeAsIds", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsIds", 6, 6, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_EncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -3902,9 +4101,29 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsIds(PyObject *SWIGUNUS + resultobj = ustring.input_type(); + arg2 = absl::string_view(ustring.data(), ustring.size()); + } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "3"" of type '" "int""'"); ++ } ++ arg3 = static_cast< int >(val3); ++ ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "4"" of type '" "float""'"); ++ } ++ arg4 = static_cast< float >(val4); ++ ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" 
"SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "5"" of type '" "bool""'"); ++ } ++ arg5 = static_cast< bool >(val5); ++ ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "6"" of type '" "bool""'"); ++ } ++ arg6 = static_cast< bool >(val6); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->EncodeAsIds(arg2); ++ result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsIds(arg2,arg3,arg4,arg5,arg6); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -3914,7 +4133,11 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsIds(PyObject *SWIGUNUS + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, PyInt_FromLong(static_cast(result[i]))); ++ PyObject *obj = PyList_New(result[i].first.size()); ++ for (size_t j = 0; j < result[i].first.size(); ++j) { ++ PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); ++ } ++ PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); + } + } + return resultobj; +@@ -3923,22 +4146,22 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_NBestEncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; +- int arg3 ; ++ float arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val3 ; ++ float val3 ; + int ecode3 = 0 ; + PyObject *swig_obj[3] ; +- std::vector< std::vector< std::string > > result; ++ float result; + +- if 
(!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_NBestEncodeAsPieces", 3, 3, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_CalculateEntropy", 3, 3, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_NBestEncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -3950,113 +4173,71 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_NBestEncodeAsPieces(PyObject * + resultobj = ustring.input_type(); + arg2 = absl::string_view(ustring.data(), ustring.size()); + } +- ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_NBestEncodeAsPieces" "', argument " "3"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "3"" of type '" "float""'"); + } +- arg3 = static_cast< int >(val3); ++ arg3 = static_cast< float >(val3); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->NBestEncodeAsPieces(arg2,arg3); ++ result = (float)((sentencepiece::SentencePieceProcessor const *)arg1)->CalculateEntropy(arg2,arg3); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- { +- PyObject *input_type = resultobj; +- resultobj = PyList_New((&result)->size()); +- for (size_t i = 
0; i < (&result)->size(); ++i) { +- PyObject *obj = PyList_New(result[i].size()); +- for (size_t j = 0; j < result[i].size(); ++j) { +- PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); +- } +- PyList_SetItem(resultobj, i, obj); +- } +- } ++ resultobj = SWIG_From_float(static_cast< float >(result)); + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_NBestEncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetPieceSize(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; +- int arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val3 ; +- int ecode3 = 0 ; +- PyObject *swig_obj[3] ; +- std::vector< std::vector< int > > result; ++ PyObject *swig_obj[1] ; ++ int result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_NBestEncodeAsIds", 3, 3, swig_obj)) SWIG_fail; ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_NBestEncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_GetPieceSize" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); +- } +- ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); +- if 
(!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_NBestEncodeAsIds" "', argument " "3"" of type '" "int""'"); +- } +- arg3 = static_cast< int >(val3); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->NBestEncodeAsIds(arg2,arg3); ++ result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->GetPieceSize(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- { +- resultobj = PyList_New((&result)->size()); +- for (size_t i = 0; i < (&result)->size(); ++i) { +- PyObject *obj = PyList_New(result[i].size()); +- for (size_t j = 0; j < result[i].size(); ++j) { +- PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); +- } +- PyList_SetItem(resultobj, i, obj); +- } +- } ++ resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_PieceToId(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; +- int arg3 ; +- float arg4 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val3 ; +- int ecode3 = 0 ; +- float val4 ; +- int ecode4 = 0 ; +- PyObject *swig_obj[4] ; +- std::vector< std::string > result; ++ PyObject *swig_obj[2] ; ++ int result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAsPieces", 4, 4, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_PieceToId", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_PieceToId" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -4068,81 +4249,47 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAsPieces(PyObject + resultobj = ustring.input_type(); + arg2 = absl::string_view(ustring.data(), ustring.size()); + } +- ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); +- if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAsPieces" "', argument " "3"" of type '" "int""'"); +- } +- arg3 = static_cast< int >(val3); +- ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); +- if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAsPieces" "', argument " "4"" of type '" "float""'"); +- } +- arg4 = static_cast< float >(val4); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAsPieces(arg2,arg3,arg4); ++ result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->PieceToId(arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- { +- PyObject *input_type = resultobj; +- resultobj = PyList_New((&result)->size()); +- for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); +- } +- } ++ resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAsIds(PyObject 
*SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IdToPiece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; +- int arg3 ; +- float arg4 ; ++ int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val3 ; +- int ecode3 = 0 ; +- float val4 ; +- int ecode4 = 0 ; +- PyObject *swig_obj[4] ; +- std::vector< int > result; ++ int val2 ; ++ int ecode2 = 0 ; ++ PyObject *swig_obj[2] ; ++ std::string *result = 0 ; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAsIds", 4, 4, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IdToPiece", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IdToPiece" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); +- } +- ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); +- if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAsIds" "', argument " "3"" of type '" "int""'"); +- } +- arg3 = static_cast< int >(val3); +- ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); +- if (!SWIG_IsOK(ecode4)) { +- 
SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAsIds" "', argument " "4"" of type '" "float""'"); ++ ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); ++ if (!SWIG_IsOK(ecode2)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IdToPiece" "', argument " "2"" of type '" "int""'"); + } +- arg4 = static_cast< float >(val4); ++ arg2 = static_cast< int >(val2); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAsIds(arg2,arg3,arg4); ++ result = (std::string *) &((sentencepiece::SentencePieceProcessor const *)arg1)->IdToPiece(arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -4150,10 +4297,8 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAsIds(PyObject *SW + } + } + { +- resultobj = PyList_New((&result)->size()); +- for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, PyInt_FromLong(static_cast(result[i]))); +- } ++ PyObject *input_type = resultobj; ++ resultobj = MakePyOutputString(*result, input_type); + } + return resultobj; + fail: +@@ -4161,489 +4306,290 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetScore(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; +- int arg3 ; +- float arg4 ; +- bool arg5 ; +- bool arg6 ; ++ int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val3 ; +- int ecode3 = 0 ; +- float val4 ; +- int ecode4 = 0 ; +- bool val5 ; +- int ecode5 = 0 ; +- bool val6 ; +- int ecode6 = 0 ; +- PyObject *swig_obj[6] ; +- std::vector< std::pair< std::vector< std::string >,float > > result; ++ int val2 ; ++ int ecode2 = 0 ; ++ PyObject 
*swig_obj[2] ; ++ float result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsPieces", 6, 6, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_GetScore", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_GetScore" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); +- } +- ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); +- if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "3"" of type '" "int""'"); +- } +- arg3 = static_cast< int >(val3); +- ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); +- if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "4"" of type '" "float""'"); +- } +- arg4 = static_cast< float >(val4); +- ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); +- if (!SWIG_IsOK(ecode5)) { +- SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "5"" of type '" "bool""'"); +- } +- arg5 = static_cast< bool >(val5); +- ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); +- if 
(!SWIG_IsOK(ecode6)) { +- SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "6"" of type '" "bool""'"); ++ ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); ++ if (!SWIG_IsOK(ecode2)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_GetScore" "', argument " "2"" of type '" "int""'"); + } +- arg6 = static_cast< bool >(val6); ++ arg2 = static_cast< int >(val2); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsPieces(arg2,arg3,arg4,arg5,arg6); ++ result = (float)((sentencepiece::SentencePieceProcessor const *)arg1)->GetScore(arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- { +- PyObject *input_type = resultobj; +- resultobj = PyList_New((&result)->size()); +- for (size_t i = 0; i < (&result)->size(); ++i) { +- PyObject *obj = PyList_New(result[i].first.size()); +- for (size_t j = 0; j < result[i].first.size(); ++j) { +- PyList_SetItem(obj, j, MakePyOutputString(result[i].first[j], input_type)); +- } +- PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); +- } +- } ++ resultobj = SWIG_From_float(static_cast< float >(result)); + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsUnknown(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; +- int arg3 ; +- float arg4 ; +- bool arg5 ; +- bool arg6 ; ++ int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val3 ; +- int ecode3 = 0 ; +- float val4 ; +- int ecode4 = 
0 ; +- bool val5 ; +- int ecode5 = 0 ; +- bool val6 ; +- int ecode6 = 0 ; +- PyObject *swig_obj[6] ; +- std::vector< std::pair< std::vector< int >,float > > result; ++ int val2 ; ++ int ecode2 = 0 ; ++ PyObject *swig_obj[2] ; ++ bool result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsIds", 6, 6, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsUnknown", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsUnknown" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); +- } +- ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); +- if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "3"" of type '" "int""'"); +- } +- arg3 = static_cast< int >(val3); +- ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); +- if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "4"" of type '" "float""'"); +- } +- arg4 = static_cast< float >(val4); +- ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); +- if (!SWIG_IsOK(ecode5)) { +- SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" 
"SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "5"" of type '" "bool""'"); +- } +- arg5 = static_cast< bool >(val5); +- ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); +- if (!SWIG_IsOK(ecode6)) { +- SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "6"" of type '" "bool""'"); ++ ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); ++ if (!SWIG_IsOK(ecode2)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsUnknown" "', argument " "2"" of type '" "int""'"); + } +- arg6 = static_cast< bool >(val6); ++ arg2 = static_cast< int >(val2); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsIds(arg2,arg3,arg4,arg5,arg6); ++ result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsUnknown(arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- { +- resultobj = PyList_New((&result)->size()); +- for (size_t i = 0; i < (&result)->size(); ++i) { +- PyObject *obj = PyList_New(result[i].first.size()); +- for (size_t j = 0; j < result[i].first.size(); ++j) { +- PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); +- } +- PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); +- } +- } ++ resultobj = SWIG_From_bool(static_cast< bool >(result)); + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodePieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsControl(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- std::vector< std::string > *arg2 = 0 ; ++ int arg2 ; + void *argp1 = 0 
; + int res1 = 0 ; ++ int val2 ; ++ int ecode2 = 0 ; + PyObject *swig_obj[2] ; +- std::string result; ++ bool result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_DecodePieces", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsControl", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_DecodePieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsControl" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- std::vector *out = nullptr; +- if (PyList_Check(swig_obj[1])) { +- const size_t size = PyList_Size(swig_obj[1]); +- out = new std::vector(size); +- for (size_t i = 0; i < size; ++i) { +- const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); +- if (ustring.IsAvalable()) { +- (*out)[i] = std::string(ustring.data(), ustring.size()); +- } else { +- PyErr_SetString(PyExc_TypeError, "list must contain strings"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- } +- } else { +- PyErr_SetString(PyExc_TypeError, "not a list"); +- SWIG_fail; +- } +- arg2 = out; +- } ++ ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); ++ if (!SWIG_IsOK(ecode2)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsControl" "', argument " "2"" of type '" "int""'"); ++ } ++ arg2 = static_cast< int >(val2); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->DecodePieces((std::vector< std::string > const &)*arg2); ++ result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsControl(arg2); + ReleaseResultObject(resultobj); + 
} + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- { +- PyObject *input_type = resultobj; +- resultobj = MakePyOutputString(result, input_type); +- } +- { +- delete arg2; +- } ++ resultobj = SWIG_From_bool(static_cast< bool >(result)); + return resultobj; + fail: +- { +- delete arg2; +- } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsUnused(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; +- float arg3 ; ++ int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; +- float val3 ; +- int ecode3 = 0 ; +- PyObject *swig_obj[3] ; +- float result; ++ int val2 ; ++ int ecode2 = 0 ; ++ PyObject *swig_obj[2] ; ++ bool result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_CalculateEntropy", 3, 3, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsUnused", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsUnused" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- arg2 = 
absl::string_view(ustring.data(), ustring.size()); +- } +- ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); +- if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "3"" of type '" "float""'"); ++ ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); ++ if (!SWIG_IsOK(ecode2)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsUnused" "', argument " "2"" of type '" "int""'"); + } +- arg3 = static_cast< float >(val3); ++ arg2 = static_cast< int >(val2); + { + try { +- result = (float)((sentencepiece::SentencePieceProcessor const *)arg1)->CalculateEntropy(arg2,arg3); ++ result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsUnused(arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- resultobj = SWIG_From_float(static_cast< float >(result)); ++ resultobj = SWIG_From_bool(static_cast< bool >(result)); + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsByte(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; ++ int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; ++ int val2 ; ++ int ecode2 = 0 ; + PyObject *swig_obj[2] ; +- sentencepiece::util::bytes result; ++ bool result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_EncodeAsSerializedProto", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsByte", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if 
(!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_EncodeAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsByte" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); +- } ++ ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); ++ if (!SWIG_IsOK(ecode2)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsByte" "', argument " "2"" of type '" "int""'"); ++ } ++ arg2 = static_cast< int >(val2); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->EncodeAsSerializedProto(arg2); ++ result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsByte(arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- { +- resultobj = MakePyOutputBytes(result); +- } ++ resultobj = SWIG_From_bool(static_cast< bool >(result)); + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_unk_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; +- int arg3 ; +- float arg4 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val3 ; +- int ecode3 = 0 ; 
+- float val4 ; +- int ecode4 = 0 ; +- PyObject *swig_obj[4] ; +- sentencepiece::util::bytes result; ++ PyObject *swig_obj[1] ; ++ int result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAsSerializedProto", 4, 4, swig_obj)) SWIG_fail; ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_unk_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); +- } +- ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); +- if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAsSerializedProto" "', argument " "3"" of type '" "int""'"); +- } +- arg3 = static_cast< int >(val3); +- ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); +- if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAsSerializedProto" "', argument " "4"" of type '" "float""'"); +- } +- arg4 = static_cast< float >(val4); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAsSerializedProto(arg2,arg3,arg4); ++ result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->unk_id(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status 
&status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- { +- resultobj = MakePyOutputBytes(result); +- } ++ resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_NBestEncodeAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_bos_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; +- int arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val3 ; +- int ecode3 = 0 ; +- PyObject *swig_obj[3] ; +- sentencepiece::util::bytes result; ++ PyObject *swig_obj[1] ; ++ int result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_NBestEncodeAsSerializedProto", 3, 3, swig_obj)) SWIG_fail; ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_NBestEncodeAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_bos_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); +- } +- ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); +- if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" 
"SentencePieceProcessor_NBestEncodeAsSerializedProto" "', argument " "3"" of type '" "int""'"); +- } +- arg3 = static_cast< int >(val3); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->NBestEncodeAsSerializedProto(arg2,arg3); ++ result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->bos_id(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- { +- resultobj = MakePyOutputBytes(result); +- } ++ resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodePiecesAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_eos_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- std::vector< std::string > *arg2 = 0 ; + void *argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[2] ; +- sentencepiece::util::bytes result; ++ PyObject *swig_obj[1] ; ++ int result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_DecodePiecesAsSerializedProto", 2, 2, swig_obj)) SWIG_fail; ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_DecodePiecesAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_eos_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- std::vector *out 
= nullptr; +- if (PyList_Check(swig_obj[1])) { +- const size_t size = PyList_Size(swig_obj[1]); +- out = new std::vector(size); +- for (size_t i = 0; i < size; ++i) { +- const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); +- if (ustring.IsAvalable()) { +- (*out)[i] = std::string(ustring.data(), ustring.size()); +- } else { +- PyErr_SetString(PyExc_TypeError, "list must contain strings"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- } +- } else { +- PyErr_SetString(PyExc_TypeError, "not a list"); +- SWIG_fail; +- } +- arg2 = out; +- } + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->DecodePiecesAsSerializedProto((std::vector< std::string > const &)*arg2); ++ result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->eos_id(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- { +- resultobj = MakePyOutputBytes(result); +- } +- { +- delete arg2; +- } ++ resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; + fail: +- { +- delete arg2; +- } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetPieceSize(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_pad_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + void *argp1 = 0 ; +@@ -4655,12 +4601,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetPieceSize(PyObject *SWIGUNU + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_GetPieceSize" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_pad_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + try { +- result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->GetPieceSize(); ++ result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->pad_id(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -4674,71 +4620,66 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_PieceToId(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_serialized_model_proto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[2] ; +- int result; ++ PyObject *swig_obj[1] ; ++ sentencepiece::util::bytes result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_PieceToId", 2, 2, swig_obj)) SWIG_fail; ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_PieceToId" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_serialized_model_proto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = 
ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); +- } + { + try { +- result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->PieceToId(arg2); ++ result = ((sentencepiece::SentencePieceProcessor const *)arg1)->serialized_model_proto(); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- resultobj = SWIG_From_int(static_cast< int >(result)); ++ { ++ resultobj = MakePyOutputBytes(result); ++ } + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IdToPiece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadFromFile(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- int arg2 ; ++ absl::string_view arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val2 ; +- int ecode2 = 0 ; + PyObject *swig_obj[2] ; +- std::string *result = 0 ; ++ sentencepiece::util::Status result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IdToPiece", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_LoadFromFile", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IdToPiece" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_LoadFromFile" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); +- if 
(!SWIG_IsOK(ecode2)) { +- SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IdToPiece" "', argument " "2"" of type '" "int""'"); +- } +- arg2 = static_cast< int >(val2); ++ { ++ const PyInputString ustring(swig_obj[1]); ++ if (!ustring.IsAvalable()) { ++ PyErr_SetString(PyExc_TypeError, "not a string"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ arg2 = absl::string_view(ustring.data(), ustring.size()); ++ } + { + try { +- result = (std::string *) &((sentencepiece::SentencePieceProcessor const *)arg1)->IdToPiece(arg2); ++ result = sentencepiece_SentencePieceProcessor_LoadFromFile(arg1,arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -4746,8 +4687,10 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IdToPiece(PyObject *SWIGUNUSED + } + } + { +- PyObject *input_type = resultobj; +- resultobj = MakePyOutputString(*result, input_type); ++ if (!(&result)->ok()) { ++ SWIG_exception(ToSwigError((&result)->code()), (&result)->ToString().c_str()); ++ } ++ resultobj = SWIG_From_bool((&result)->ok()); + } + return resultobj; + fail: +@@ -4755,338 +4698,916 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetScore(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- int arg2 ; ++ absl::string_view arg2 ; ++ bool arg3 ; ++ int arg4 ; ++ float arg5 ; ++ bool arg6 ; ++ bool arg7 ; ++ bool arg8 ; ++ bool arg9 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val2 ; +- int ecode2 = 0 ; +- PyObject *swig_obj[2] ; +- float result; ++ bool val3 ; ++ int ecode3 = 0 ; ++ int val4 ; ++ int ecode4 = 0 ; ++ float val5 ; ++ int ecode5 = 0 ; ++ bool val6 ; ++ int ecode6 = 0 ; ++ bool val7 ; ++ int ecode7 = 0 ; ++ bool val8 ; ++ int ecode8 = 0 
; ++ bool val9 ; ++ int ecode9 = 0 ; ++ PyObject *swig_obj[9] ; ++ std::vector< int > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_GetScore", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsIds", 9, 9, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_GetScore" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); +- if (!SWIG_IsOK(ecode2)) { +- SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_GetScore" "', argument " "2"" of type '" "int""'"); +- } +- arg2 = static_cast< int >(val2); + { +- try { +- result = (float)((sentencepiece::SentencePieceProcessor const *)arg1)->GetScore(arg2); +- ReleaseResultObject(resultobj); +- } +- catch (const sentencepiece::util::Status &status) { +- SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ const PyInputString ustring(swig_obj[1]); ++ if (!ustring.IsAvalable()) { ++ PyErr_SetString(PyExc_TypeError, "not a string"); ++ SWIG_fail; + } ++ resultobj = ustring.input_type(); ++ arg2 = absl::string_view(ustring.data(), ustring.size()); + } +- resultobj = SWIG_From_float(static_cast< float >(result)); +- return resultobj; +-fail: +- return NULL; +-} +- +- +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsUnknown(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), 
"in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "3"" of type '" "bool""'"); ++ } ++ arg3 = static_cast< bool >(val3); ++ ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "4"" of type '" "int""'"); ++ } ++ arg4 = static_cast< int >(val4); ++ ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "5"" of type '" "float""'"); ++ } ++ arg5 = static_cast< float >(val5); ++ ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "6"" of type '" "bool""'"); ++ } ++ arg6 = static_cast< bool >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); ++ ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); ++ if (!SWIG_IsOK(ecode8)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "8"" of type '" "bool""'"); ++ } ++ arg8 = static_cast< bool >(val8); ++ ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); ++ if (!SWIG_IsOK(ecode9)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "9"" of type '" "bool""'"); ++ } ++ arg9 = static_cast< bool >(val9); ++ { ++ try { ++ result = sentencepiece_SentencePieceProcessor__EncodeAsIds((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ 
SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ { ++ resultobj = PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyList_SetItem(resultobj, i, PyInt_FromLong(static_cast(result[i]))); ++ } ++ } ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- int arg2 ; ++ absl::string_view arg2 ; ++ bool arg3 ; ++ int arg4 ; ++ float arg5 ; ++ bool arg6 ; ++ bool arg7 ; ++ bool arg8 ; ++ bool arg9 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val2 ; +- int ecode2 = 0 ; +- PyObject *swig_obj[2] ; +- bool result; ++ bool val3 ; ++ int ecode3 = 0 ; ++ int val4 ; ++ int ecode4 = 0 ; ++ float val5 ; ++ int ecode5 = 0 ; ++ bool val6 ; ++ int ecode6 = 0 ; ++ bool val7 ; ++ int ecode7 = 0 ; ++ bool val8 ; ++ int ecode8 = 0 ; ++ bool val9 ; ++ int ecode9 = 0 ; ++ PyObject *swig_obj[9] ; ++ std::vector< std::string > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsUnknown", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsPieces", 9, 9, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsUnknown" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); +- if (!SWIG_IsOK(ecode2)) { +- 
SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsUnknown" "', argument " "2"" of type '" "int""'"); ++ { ++ const PyInputString ustring(swig_obj[1]); ++ if (!ustring.IsAvalable()) { ++ PyErr_SetString(PyExc_TypeError, "not a string"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ arg2 = absl::string_view(ustring.data(), ustring.size()); ++ } ++ ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "3"" of type '" "bool""'"); + } +- arg2 = static_cast< int >(val2); ++ arg3 = static_cast< bool >(val3); ++ ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "4"" of type '" "int""'"); ++ } ++ arg4 = static_cast< int >(val4); ++ ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "5"" of type '" "float""'"); ++ } ++ arg5 = static_cast< float >(val5); ++ ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "6"" of type '" "bool""'"); ++ } ++ arg6 = static_cast< bool >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); ++ ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); ++ if (!SWIG_IsOK(ecode8)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "8"" of type '" "bool""'"); ++ } ++ arg8 = static_cast< 
bool >(val8); ++ ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); ++ if (!SWIG_IsOK(ecode9)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "9"" of type '" "bool""'"); ++ } ++ arg9 = static_cast< bool >(val9); + { + try { +- result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsUnknown(arg2); ++ result = sentencepiece_SentencePieceProcessor__EncodeAsPieces((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- resultobj = SWIG_From_bool(static_cast< bool >(result)); ++ { ++ PyObject *input_type = resultobj; ++ resultobj = PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); ++ } ++ } + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsControl(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- int arg2 ; ++ absl::string_view arg2 ; ++ bool arg3 ; ++ int arg4 ; ++ float arg5 ; ++ bool arg6 ; ++ bool arg7 ; ++ bool arg8 ; ++ bool arg9 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val2 ; +- int ecode2 = 0 ; +- PyObject *swig_obj[2] ; +- bool result; ++ bool val3 ; ++ int ecode3 = 0 ; ++ int val4 ; ++ int ecode4 = 0 ; ++ float val5 ; ++ int ecode5 = 0 ; ++ bool val6 ; ++ int ecode6 = 0 ; ++ bool val7 ; ++ int ecode7 = 0 ; ++ bool val8 ; ++ int ecode8 = 0 ; ++ bool val9 ; ++ int ecode9 = 0 ; ++ PyObject *swig_obj[9] ; ++ sentencepiece::util::bytes result; + +- if 
(!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsControl", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsSerializedProto", 9, 9, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsControl" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); +- if (!SWIG_IsOK(ecode2)) { +- SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsControl" "', argument " "2"" of type '" "int""'"); ++ { ++ const PyInputString ustring(swig_obj[1]); ++ if (!ustring.IsAvalable()) { ++ PyErr_SetString(PyExc_TypeError, "not a string"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ arg2 = absl::string_view(ustring.data(), ustring.size()); ++ } ++ ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "3"" of type '" "bool""'"); + } +- arg2 = static_cast< int >(val2); ++ arg3 = static_cast< bool >(val3); ++ ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "4"" of type '" "int""'"); ++ } ++ arg4 = static_cast< int >(val4); ++ ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" 
"SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "5"" of type '" "float""'"); ++ } ++ arg5 = static_cast< float >(val5); ++ ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "6"" of type '" "bool""'"); ++ } ++ arg6 = static_cast< bool >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); ++ ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); ++ if (!SWIG_IsOK(ecode8)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "8"" of type '" "bool""'"); ++ } ++ arg8 = static_cast< bool >(val8); ++ ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); ++ if (!SWIG_IsOK(ecode9)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "9"" of type '" "bool""'"); ++ } ++ arg9 = static_cast< bool >(val9); + { + try { +- result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsControl(arg2); ++ result = sentencepiece_SentencePieceProcessor__EncodeAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- resultobj = SWIG_From_bool(static_cast< bool >(result)); ++ { ++ resultobj = MakePyOutputBytes(result); ++ } + return resultobj; + fail: + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsUnused(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject 
*_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- int arg2 ; ++ std::vector< absl::string_view > *arg2 = 0 ; ++ int arg3 ; ++ bool arg4 ; ++ int arg5 ; ++ float arg6 ; ++ bool arg7 ; ++ bool arg8 ; ++ bool arg9 ; ++ bool arg10 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val2 ; +- int ecode2 = 0 ; +- PyObject *swig_obj[2] ; +- bool result; ++ int val3 ; ++ int ecode3 = 0 ; ++ bool val4 ; ++ int ecode4 = 0 ; ++ int val5 ; ++ int ecode5 = 0 ; ++ float val6 ; ++ int ecode6 = 0 ; ++ bool val7 ; ++ int ecode7 = 0 ; ++ bool val8 ; ++ int ecode8 = 0 ; ++ bool val9 ; ++ int ecode9 = 0 ; ++ bool val10 ; ++ int ecode10 = 0 ; ++ PyObject *swig_obj[10] ; ++ std::vector< std::vector< int > > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsUnused", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsIdsBatch", 10, 10, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsUnused" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); +- if (!SWIG_IsOK(ecode2)) { +- SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsUnused" "', argument " "2"" of type '" "int""'"); ++ { ++ std::vector *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = 
new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); ++ if (ustring.IsAvalable()) { ++ (*out)[i] = absl::string_view(ustring.data(), ustring.size()); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "3"" of type '" "int""'"); + } +- arg2 = static_cast< int >(val2); ++ arg3 = static_cast< int >(val3); ++ ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "4"" of type '" "bool""'"); ++ } ++ arg4 = static_cast< bool >(val4); ++ ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "5"" of type '" "int""'"); ++ } ++ arg5 = static_cast< int >(val5); ++ ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "6"" of type '" "float""'"); ++ } ++ arg6 = static_cast< float >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); ++ ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); ++ if (!SWIG_IsOK(ecode8)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" 
"SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "8"" of type '" "bool""'"); ++ } ++ arg8 = static_cast< bool >(val8); ++ ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); ++ if (!SWIG_IsOK(ecode9)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "9"" of type '" "bool""'"); ++ } ++ arg9 = static_cast< bool >(val9); ++ ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); ++ if (!SWIG_IsOK(ecode10)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "10"" of type '" "bool""'"); ++ } ++ arg10 = static_cast< bool >(val10); + { + try { +- result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsUnused(arg2); ++ result = sentencepiece_SentencePieceProcessor__EncodeAsIdsBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- resultobj = SWIG_From_bool(static_cast< bool >(result)); ++ { ++ resultobj = PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyObject *obj = PyList_New(result[i].size()); ++ for (size_t j = 0; j < result[i].size(); ++j) { ++ PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); ++ } ++ PyList_SetItem(resultobj, i, obj); ++ } ++ } ++ { ++ delete arg2; ++ } + return resultobj; + fail: ++ { ++ delete arg2; ++ } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsByte(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- int 
arg2 ; ++ std::vector< absl::string_view > *arg2 = 0 ; ++ int arg3 ; ++ bool arg4 ; ++ int arg5 ; ++ float arg6 ; ++ bool arg7 ; ++ bool arg8 ; ++ bool arg9 ; ++ bool arg10 ; + void *argp1 = 0 ; + int res1 = 0 ; +- int val2 ; +- int ecode2 = 0 ; +- PyObject *swig_obj[2] ; +- bool result; ++ int val3 ; ++ int ecode3 = 0 ; ++ bool val4 ; ++ int ecode4 = 0 ; ++ int val5 ; ++ int ecode5 = 0 ; ++ float val6 ; ++ int ecode6 = 0 ; ++ bool val7 ; ++ int ecode7 = 0 ; ++ bool val8 ; ++ int ecode8 = 0 ; ++ bool val9 ; ++ int ecode9 = 0 ; ++ bool val10 ; ++ int ecode10 = 0 ; ++ PyObject *swig_obj[10] ; ++ std::vector< std::vector< std::string > > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsByte", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsPiecesBatch", 10, 10, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsByte" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); +- if (!SWIG_IsOK(ecode2)) { +- SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsByte" "', argument " "2"" of type '" "int""'"); ++ { ++ std::vector *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); ++ if (ustring.IsAvalable()) { ++ (*out)[i] = absl::string_view(ustring.data(), 
ustring.size()); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "3"" of type '" "int""'"); + } +- arg2 = static_cast< int >(val2); ++ arg3 = static_cast< int >(val3); ++ ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "4"" of type '" "bool""'"); ++ } ++ arg4 = static_cast< bool >(val4); ++ ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "5"" of type '" "int""'"); ++ } ++ arg5 = static_cast< int >(val5); ++ ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "6"" of type '" "float""'"); ++ } ++ arg6 = static_cast< float >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); ++ ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); ++ if (!SWIG_IsOK(ecode8)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "8"" of type '" "bool""'"); ++ } ++ arg8 = static_cast< bool >(val8); ++ ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); ++ if (!SWIG_IsOK(ecode9)) { ++ 
SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "9"" of type '" "bool""'"); ++ } ++ arg9 = static_cast< bool >(val9); ++ ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); ++ if (!SWIG_IsOK(ecode10)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "10"" of type '" "bool""'"); ++ } ++ arg10 = static_cast< bool >(val10); + { + try { +- result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsByte(arg2); ++ result = sentencepiece_SentencePieceProcessor__EncodeAsPiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- resultobj = SWIG_From_bool(static_cast< bool >(result)); ++ { ++ PyObject *input_type = resultobj; ++ resultobj = PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyObject *obj = PyList_New(result[i].size()); ++ for (size_t j = 0; j < result[i].size(); ++j) { ++ PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); ++ } ++ PyList_SetItem(resultobj, i, obj); ++ } ++ } ++ { ++ delete arg2; ++ } + return resultobj; + fail: ++ { ++ delete arg2; ++ } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_unk_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ std::vector< absl::string_view > *arg2 = 0 ; ++ int arg3 ; ++ bool arg4 ; ++ int arg5 ; ++ float arg6 ; ++ bool arg7 ; ++ bool arg8 ; ++ bool arg9 ; ++ bool 
arg10 ; + void *argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[1] ; +- int result; ++ int val3 ; ++ int ecode3 = 0 ; ++ bool val4 ; ++ int ecode4 = 0 ; ++ int val5 ; ++ int ecode5 = 0 ; ++ float val6 ; ++ int ecode6 = 0 ; ++ bool val7 ; ++ int ecode7 = 0 ; ++ bool val8 ; ++ int ecode8 = 0 ; ++ bool val9 ; ++ int ecode9 = 0 ; ++ bool val10 ; ++ int ecode10 = 0 ; ++ PyObject *swig_obj[10] ; ++ BytesArray result; + +- if (!args) SWIG_fail; +- swig_obj[0] = args; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsSerializedProtoBatch", 10, 10, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_unk_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); ++ if (ustring.IsAvalable()) { ++ (*out)[i] = absl::string_view(ustring.data(), ustring.size()); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "3"" of type '" 
"int""'"); ++ } ++ arg3 = static_cast< int >(val3); ++ ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "4"" of type '" "bool""'"); ++ } ++ arg4 = static_cast< bool >(val4); ++ ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "5"" of type '" "int""'"); ++ } ++ arg5 = static_cast< int >(val5); ++ ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "6"" of type '" "float""'"); ++ } ++ arg6 = static_cast< float >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); ++ ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); ++ if (!SWIG_IsOK(ecode8)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "8"" of type '" "bool""'"); ++ } ++ arg8 = static_cast< bool >(val8); ++ ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); ++ if (!SWIG_IsOK(ecode9)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "9"" of type '" "bool""'"); ++ } ++ arg9 = static_cast< bool >(val9); ++ ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); ++ if (!SWIG_IsOK(ecode10)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "10"" of type '" "bool""'"); ++ } ++ arg10 = static_cast< bool 
>(val10); + { + try { +- result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->unk_id(); ++ result = sentencepiece_SentencePieceProcessor__EncodeAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- resultobj = SWIG_From_int(static_cast< int >(result)); ++ { ++ resultobj = PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); ++ } ++ } ++ { ++ delete arg2; ++ } + return resultobj; + fail: ++ { ++ delete arg2; ++ } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_bos_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ std::vector< int > *arg2 = 0 ; + void *argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[1] ; +- int result; ++ PyObject *swig_obj[2] ; ++ std::string result; + +- if (!args) SWIG_fail; +- swig_obj[0] = args; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodeIds", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_bos_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodeIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = 
reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ PyObject *o = PyList_GetItem(swig_obj[1], i); ++ if (PyInt_Check(o)) { ++ (*out)[i] = static_cast(PyInt_AsLong(o)); ++ } else { ++ PyErr_SetString(PyExc_TypeError,"list must contain integers"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } + { + try { +- result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->bos_id(); ++ result = sentencepiece_SentencePieceProcessor__DecodeIds((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< int > const &)*arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- resultobj = SWIG_From_int(static_cast< int >(result)); ++ { ++ PyObject *input_type = resultobj; ++ resultobj = MakePyOutputString(result, input_type); ++ } ++ { ++ delete arg2; ++ } + return resultobj; + fail: ++ { ++ delete arg2; ++ } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_eos_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ std::vector< std::string > *arg2 = 0 ; + void *argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[1] ; +- int result; ++ PyObject *swig_obj[2] ; ++ std::string result; + +- if (!args) SWIG_fail; +- swig_obj[0] = args; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePieces", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], 
&argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_eos_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); ++ if (ustring.IsAvalable()) { ++ (*out)[i].assign(ustring.data(), ustring.size()); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; + } +- arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + try { +- result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->eos_id(); ++ result = sentencepiece_SentencePieceProcessor__DecodePieces((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::string > const &)*arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- resultobj = SWIG_From_int(static_cast< int >(result)); ++ { ++ PyObject *input_type = resultobj; ++ resultobj = MakePyOutputString(result, input_type); ++ } ++ { ++ delete arg2; ++ } + return resultobj; + fail: ++ { ++ delete arg2; ++ } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_pad_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN 
PyObject *_wrap_SentencePieceProcessor__DecodeIdsAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ std::vector< int > *arg2 = 0 ; + void *argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[1] ; +- int result; ++ PyObject *swig_obj[2] ; ++ sentencepiece::util::bytes result; + +- if (!args) SWIG_fail; +- swig_obj[0] = args; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodeIdsAsSerializedProto", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_pad_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodeIdsAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ PyObject *o = PyList_GetItem(swig_obj[1], i); ++ if (PyInt_Check(o)) { ++ (*out)[i] = static_cast(PyInt_AsLong(o)); ++ } else { ++ PyErr_SetString(PyExc_TypeError,"list must contain integers"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } + { + try { +- result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->pad_id(); ++ result = sentencepiece_SentencePieceProcessor__DecodeIdsAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< int > const &)*arg2); + ReleaseResultObject(resultobj); + } + catch (const 
sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } +- resultobj = SWIG_From_int(static_cast< int >(result)); ++ { ++ resultobj = MakePyOutputBytes(result); ++ } ++ { ++ delete arg2; ++ } + return resultobj; + fail: ++ { ++ delete arg2; ++ } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_serialized_model_proto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ std::vector< std::string > *arg2 = 0 ; + void *argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[1] ; ++ PyObject *swig_obj[2] ; + sentencepiece::util::bytes result; + +- if (!args) SWIG_fail; +- swig_obj[0] = args; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePiecesAsSerializedProto", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_serialized_model_proto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePiecesAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); ++ if (ustring.IsAvalable()) { ++ (*out)[i].assign(ustring.data(), ustring.size()); ++ } 
else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->serialized_model_proto(); ++ result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::string > const &)*arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5096,39 +5617,74 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_serialized_model_proto(PyObjec + { + resultobj = MakePyOutputBytes(result); + } ++ { ++ delete arg2; ++ } + return resultobj; + fail: ++ { ++ delete arg2; ++ } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadFromFile(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; ++ std::vector< std::vector< int > > *arg2 = 0 ; ++ int arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[2] ; +- sentencepiece::util::Status result; ++ int val3 ; ++ int ecode3 = 0 ; ++ PyObject *swig_obj[3] ; ++ std::vector< std::string > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_LoadFromFile", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodeIdsBatch", 3, 3, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_LoadFromFile" "', argument " "1"" of type '" 
"sentencepiece::SentencePieceProcessor *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodeIdsBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); ++ std::vector> *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector>(size); ++ for (size_t i = 0; i < size; ++i) { ++ PyObject *o = PyList_GetItem(swig_obj[1], i); ++ if (PyList_Check(o)) { ++ const size_t size2 = PyList_Size(o); ++ (*out)[i].resize(size2); ++ for (size_t j = 0; j < size2; ++j) { ++ PyObject *o2 = PyList_GetItem(o, j); ++ if (PyInt_Check(o2)) { ++ (*out)[i][j] = static_cast(PyInt_AsLong(o2)); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } +- resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = out; + } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__DecodeIdsBatch" "', argument " "3"" of type '" "int""'"); ++ } ++ arg3 = static_cast< int >(val3); + { + try { +- result = sentencepiece_SentencePieceProcessor_LoadFromFile(arg1,arg2); ++ result = sentencepiece_SentencePieceProcessor__DecodeIdsBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< int > > const &)*arg2,arg3); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5136,43 +5692,63 @@ SWIGINTERN PyObject 
*_wrap_SentencePieceProcessor_LoadFromFile(PyObject *SWIGUNU + } + } + { +- if (!(&result)->ok()) { +- SWIG_exception(ToSwigError((&result)->code()), (&result)->ToString().c_str()); ++ PyObject *input_type = resultobj; ++ resultobj = PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); + } +- resultobj = SWIG_From_bool((&result)->ok()); ++ } ++ { ++ delete arg2; + } + return resultobj; + fail: ++ { ++ delete arg2; ++ } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsWithCheck(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- std::vector< int > *arg2 = 0 ; ++ std::vector< std::vector< int > > *arg2 = 0 ; ++ int arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[2] ; +- std::string result; ++ int val3 ; ++ int ecode3 = 0 ; ++ PyObject *swig_obj[3] ; ++ BytesArray result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_DecodeIdsWithCheck", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch", 3, 3, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_DecodeIdsWithCheck" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< 
sentencepiece::SentencePieceProcessor * >(argp1); + { +- std::vector *out = nullptr; ++ std::vector> *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); +- out = new std::vector(size); ++ out = new std::vector>(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem(swig_obj[1], i); +- if (PyInt_Check(o)) { +- (*out)[i] = static_cast(PyInt_AsLong(o)); ++ if (PyList_Check(o)) { ++ const size_t size2 = PyList_Size(o); ++ (*out)[i].resize(size2); ++ for (size_t j = 0; j < size2; ++j) { ++ PyObject *o2 = PyList_GetItem(o, j); ++ if (PyInt_Check(o2)) { ++ (*out)[i][j] = static_cast(PyInt_AsLong(o2)); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ } + } else { +- PyErr_SetString(PyExc_TypeError,"list must contain integers"); ++ PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + } +@@ -5182,9 +5758,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsWithCheck(PyObject *S + } + arg2 = out; + } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch" "', argument " "3"" of type '" "int""'"); ++ } ++ arg3 = static_cast< int >(val3); + { + try { +- result = sentencepiece_SentencePieceProcessor_DecodeIdsWithCheck((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< int > const &)*arg2); ++ result = sentencepiece_SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< int > > const &)*arg2,arg3); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5192,8 +5773,10 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsWithCheck(PyObject *S + } + } + { +- PyObject *input_type = resultobj; +- resultobj = MakePyOutputString(result, input_type); ++ resultobj = 
PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); ++ } + } + { + delete arg2; +@@ -5207,32 +5790,46 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- std::vector< int > *arg2 = 0 ; ++ std::vector< std::vector< std::string > > *arg2 = 0 ; ++ int arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; +- PyObject *swig_obj[2] ; +- sentencepiece::util::bytes result; ++ int val3 ; ++ int ecode3 = 0 ; ++ PyObject *swig_obj[3] ; ++ std::vector< std::string > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePiecesBatch", 3, 3, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePiecesBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +- std::vector *out = nullptr; ++ std::vector> *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); +- out = new std::vector(size); ++ out = new std::vector>(size); + for (size_t i = 0; i < size; ++i) { + 
PyObject *o = PyList_GetItem(swig_obj[1], i); +- if (PyInt_Check(o)) { +- (*out)[i] = static_cast(PyInt_AsLong(o)); ++ if (PyList_Check(o)) { ++ const size_t size2 = PyList_Size(o); ++ (*out)[i].resize(size2); ++ for (size_t j = 0; j < size2; ++j) { ++ const PyInputString ustring(PyList_GetItem(o, j)); ++ if (ustring.IsAvalable()) { ++ (*out)[i][j].assign(ustring.data(), ustring.size()); ++ } else { ++ PyErr_SetString(PyExc_TypeError,"list must contain integers"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } + } else { +- PyErr_SetString(PyExc_TypeError,"list must contain integers"); ++ PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } + } +@@ -5242,9 +5839,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsAsSerializedProtoWith + } + arg2 = out; + } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__DecodePiecesBatch" "', argument " "3"" of type '" "int""'"); ++ } ++ arg3 = static_cast< int >(val3); + { + try { +- result = sentencepiece_SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< int > const &)*arg2); ++ result = sentencepiece_SentencePieceProcessor__DecodePiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< std::string > > const &)*arg2,arg3); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5252,7 +5854,11 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsAsSerializedProtoWith + } + } + { +- resultobj = MakePyOutputBytes(result); ++ PyObject *input_type = resultobj; ++ resultobj = PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); ++ } + } + { + delete arg2; +@@ -5266,81 +5872,63 @@ fail: + } + + +-SWIGINTERN PyObject 
*_wrap_SentencePieceProcessor__EncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; +- bool arg3 ; +- int arg4 ; +- float arg5 ; +- bool arg6 ; +- bool arg7 ; +- bool arg8 ; ++ std::vector< std::vector< std::string > > *arg2 = 0 ; ++ int arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; +- bool val3 ; ++ int val3 ; + int ecode3 = 0 ; +- int val4 ; +- int ecode4 = 0 ; +- float val5 ; +- int ecode5 = 0 ; +- bool val6 ; +- int ecode6 = 0 ; +- bool val7 ; +- int ecode7 = 0 ; +- bool val8 ; +- int ecode8 = 0 ; +- PyObject *swig_obj[8] ; +- std::vector< int > result; ++ PyObject *swig_obj[3] ; ++ BytesArray result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsIds", 8, 8, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch", 3, 3, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); +- } +- arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } +- ecode3 = 
SWIG_AsVal_bool(swig_obj[2], &val3); +- if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "3"" of type '" "bool""'"); +- } +- arg3 = static_cast< bool >(val3); +- ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); +- if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "4"" of type '" "int""'"); +- } +- arg4 = static_cast< int >(val4); +- ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); +- if (!SWIG_IsOK(ecode5)) { +- SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "5"" of type '" "float""'"); +- } +- arg5 = static_cast< float >(val5); +- ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); +- if (!SWIG_IsOK(ecode6)) { +- SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "6"" of type '" "bool""'"); +- } +- arg6 = static_cast< bool >(val6); +- ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); +- if (!SWIG_IsOK(ecode7)) { +- SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "7"" of type '" "bool""'"); +- } +- arg7 = static_cast< bool >(val7); +- ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); +- if (!SWIG_IsOK(ecode8)) { +- SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "8"" of type '" "bool""'"); ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector> *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector>(size); ++ for (size_t i = 0; i < size; ++i) { ++ PyObject *o = PyList_GetItem(swig_obj[1], i); ++ if (PyList_Check(o)) { ++ const size_t size2 = PyList_Size(o); ++ (*out)[i].resize(size2); ++ for (size_t j = 0; j < size2; ++j) { ++ const PyInputString 
ustring(PyList_GetItem(o, j)); ++ if (ustring.IsAvalable()) { ++ (*out)[i][j].assign(ustring.data(), ustring.size()); ++ } else { ++ PyErr_SetString(PyExc_TypeError,"list must contain integers"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch" "', argument " "3"" of type '" "int""'"); + } +- arg8 = static_cast< bool >(val8); ++ arg3 = static_cast< int >(val3); + { + try { +- result = sentencepiece_SentencePieceProcessor__EncodeAsIds(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8); ++ result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< std::string > > const &)*arg2,arg3); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5350,49 +5938,49 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIds(PyObject *SWIGUNU + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, PyInt_FromLong(static_cast(result[i]))); ++ PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); + } + } ++ { ++ delete arg2; ++ } + return resultobj; + fail: ++ { ++ delete arg2; ++ } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + 
absl::string_view arg2 ; +- bool arg3 ; +- int arg4 ; +- float arg5 ; ++ int arg3 ; ++ bool arg4 ; ++ bool arg5 ; + bool arg6 ; + bool arg7 ; +- bool arg8 ; +- bool arg9 ; + void *argp1 = 0 ; + int res1 = 0 ; +- bool val3 ; ++ int val3 ; + int ecode3 = 0 ; +- int val4 ; ++ bool val4 ; + int ecode4 = 0 ; +- float val5 ; ++ bool val5 ; + int ecode5 = 0 ; + bool val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; +- bool val8 ; +- int ecode8 = 0 ; +- bool val9 ; +- int ecode9 = 0 ; +- PyObject *swig_obj[9] ; +- std::vector< std::string > result; ++ PyObject *swig_obj[7] ; ++ std::vector< std::vector< int > > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsPieces", 9, 9, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsIds", 7, 7, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -5404,44 +5992,34 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIG + resultobj = ustring.input_type(); + arg2 = absl::string_view(ustring.data(), ustring.size()); + } +- ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "3"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" 
"SentencePieceProcessor__NBestEncodeAsIds" "', argument " "3"" of type '" "int""'"); + } +- arg3 = static_cast< bool >(val3); +- ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); ++ arg3 = static_cast< int >(val3); ++ ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "4"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "4"" of type '" "bool""'"); + } +- arg4 = static_cast< int >(val4); +- ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); ++ arg4 = static_cast< bool >(val4); ++ ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { +- SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "5"" of type '" "float""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "5"" of type '" "bool""'"); + } +- arg5 = static_cast< float >(val5); ++ arg5 = static_cast< bool >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { +- SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "6"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { +- SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "7"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); +- ecode8 = 
SWIG_AsVal_bool(swig_obj[7], &val8); +- if (!SWIG_IsOK(ecode8)) { +- SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "8"" of type '" "bool""'"); +- } +- arg8 = static_cast< bool >(val8); +- ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); +- if (!SWIG_IsOK(ecode9)) { +- SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "9"" of type '" "bool""'"); +- } +- arg9 = static_cast< bool >(val9); + { + try { +- result = sentencepiece_SentencePieceProcessor__EncodeAsPieces(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); ++ result = sentencepiece_SentencePieceProcessor__NBestEncodeAsIds((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5449,10 +6027,13 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIG + } + } + { +- PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); ++ PyObject *obj = PyList_New(result[i].size()); ++ for (size_t j = 0; j < result[i].size(); ++j) { ++ PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); ++ } ++ PyList_SetItem(resultobj, i, obj); + } + } + return resultobj; +@@ -5461,7 +6042,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; +@@ -5469,6 +6050,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW + bool arg4 ; + 
bool arg5 ; + bool arg6 ; ++ bool arg7 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val3 ; +@@ -5479,13 +6061,15 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW + int ecode5 = 0 ; + bool val6 ; + int ecode6 = 0 ; +- PyObject *swig_obj[6] ; +- std::vector< std::vector< int > > result; ++ bool val7 ; ++ int ecode7 = 0 ; ++ PyObject *swig_obj[7] ; ++ std::vector< std::vector< std::string > > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsIds", 6, 6, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsPieces", 7, 7, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -5499,27 +6083,32 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "3"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "4"" of type '" "bool""'"); ++ 
SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "4"" of type '" "bool""'"); + } + arg4 = static_cast< bool >(val4); + ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { +- SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "5"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "5"" of type '" "bool""'"); + } + arg5 = static_cast< bool >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { +- SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "6"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); + { + try { +- result = sentencepiece_SentencePieceProcessor__NBestEncodeAsIds(arg1,arg2,arg3,arg4,arg5,arg6); ++ result = sentencepiece_SentencePieceProcessor__NBestEncodeAsPieces((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5527,11 +6116,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW + } + } + { ++ PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { +- 
PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); ++ PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); + } + PyList_SetItem(resultobj, i, obj); + } +@@ -5542,7 +6132,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; +@@ -5564,12 +6154,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject + bool val7 ; + int ecode7 = 0 ; + PyObject *swig_obj[7] ; +- std::vector< std::vector< std::string > > result; ++ sentencepiece::util::bytes result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsPieces", 7, 7, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsSerializedProto", 7, 7, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -5583,32 +6173,32 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" 
"SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "3"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "4"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "4"" of type '" "bool""'"); + } + arg4 = static_cast< bool >(val4); + ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { +- SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "5"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "5"" of type '" "bool""'"); + } + arg5 = static_cast< bool >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { +- SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "6"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { +- SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "7"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + { + try { +- result = 
sentencepiece_SentencePieceProcessor__NBestEncodeAsPieces(arg1,arg2,arg3,arg4,arg5,arg6,arg7); ++ result = sentencepiece_SentencePieceProcessor__NBestEncodeAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5616,15 +6206,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject + } + } + { +- PyObject *input_type = resultobj; +- resultobj = PyList_New((&result)->size()); +- for (size_t i = 0; i < (&result)->size(); ++i) { +- PyObject *obj = PyList_New(result[i].size()); +- for (size_t j = 0; j < result[i].size(); ++j) { +- PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); +- } +- PyList_SetItem(resultobj, i, obj); +- } ++ resultobj = MakePyOutputBytes(result); + } + return resultobj; + fail: +@@ -5643,6 +6225,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyO + bool arg7 ; + bool arg8 ; + bool arg9 ; ++ bool arg10 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val3 ; +@@ -5659,13 +6242,15 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyO + int ecode8 = 0 ; + bool val9 ; + int ecode9 = 0 ; +- PyObject *swig_obj[9] ; ++ bool val10 ; ++ int ecode10 = 0 ; ++ PyObject *swig_obj[10] ; + std::vector< std::pair< std::vector< int >,float > > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__SampleEncodeAndScoreAsIds", 9, 9, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__SampleEncodeAndScoreAsIds", 10, 10, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); ++ 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -5712,9 +6297,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyO + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsIds" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); ++ ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); ++ if (!SWIG_IsOK(ecode10)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsIds" "', argument " "10"" of type '" "bool""'"); ++ } ++ arg10 = static_cast< bool >(val10); + { + try { +- result = sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); ++ result = sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5773,7 +6363,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces( + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__SampleEncodeAndScoreAsPieces", 10, 10, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const 
*""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -5827,7 +6417,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces( + arg10 = static_cast< bool >(val10); + { + try { +- result = sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ++ result = sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5851,6 +6441,133 @@ fail: + } + + ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropy(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ absl::string_view arg2 ; ++ float arg3 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ float val3 ; ++ int ecode3 = 0 ; ++ PyObject *swig_obj[3] ; ++ float result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__CalculateEntropy", 3, 3, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ const PyInputString ustring(swig_obj[1]); ++ if (!ustring.IsAvalable()) { ++ PyErr_SetString(PyExc_TypeError, "not a string"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ arg2 = absl::string_view(ustring.data(), ustring.size()); ++ } ++ ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" 
"SentencePieceProcessor__CalculateEntropy" "', argument " "3"" of type '" "float""'"); ++ } ++ arg3 = static_cast< float >(val3); ++ { ++ try { ++ result = (float)sentencepiece_SentencePieceProcessor__CalculateEntropy(arg1,arg2,arg3); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_From_float(static_cast< float >(result)); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropyBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ std::vector< absl::string_view > *arg2 = 0 ; ++ float arg3 ; ++ int arg4 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ float val3 ; ++ int ecode3 = 0 ; ++ int val4 ; ++ int ecode4 = 0 ; ++ PyObject *swig_obj[4] ; ++ std::vector< float > result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__CalculateEntropyBatch", 4, 4, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__CalculateEntropyBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); ++ if (ustring.IsAvalable()) { ++ (*out)[i] = absl::string_view(ustring.data(), ustring.size()); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ 
resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } ++ ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__CalculateEntropyBatch" "', argument " "3"" of type '" "float""'"); ++ } ++ arg3 = static_cast< float >(val3); ++ ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__CalculateEntropyBatch" "', argument " "4"" of type '" "int""'"); ++ } ++ arg4 = static_cast< int >(val4); ++ { ++ try { ++ result = sentencepiece_SentencePieceProcessor__CalculateEntropyBatch(arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ { ++ resultobj = PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyList_SetItem(resultobj, i, PyFloat_FromDouble(static_cast(result[i]))); ++ } ++ } ++ { ++ delete arg2; ++ } ++ return resultobj; ++fail: ++ { ++ delete arg2; ++ } ++ return NULL; ++} ++ ++ + SWIGINTERN PyObject *SentencePieceProcessor_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *obj; + if (!SWIG_Python_UnpackTuple(args, "swigregister", 1, 1, &obj)) return NULL; +@@ -6191,20 +6908,9 @@ static PyMethodDef SwigMethods[] = { + { "SentencePieceProcessor_SetVocabulary", _wrap_SentencePieceProcessor_SetVocabulary, METH_VARARGS, NULL}, + { "SentencePieceProcessor_ResetVocabulary", _wrap_SentencePieceProcessor_ResetVocabulary, METH_O, NULL}, + { "SentencePieceProcessor_LoadVocabulary", _wrap_SentencePieceProcessor_LoadVocabulary, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_EncodeAsPieces", _wrap_SentencePieceProcessor_EncodeAsPieces, 
METH_VARARGS, NULL}, +- { "SentencePieceProcessor_EncodeAsIds", _wrap_SentencePieceProcessor_EncodeAsIds, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_NBestEncodeAsPieces", _wrap_SentencePieceProcessor_NBestEncodeAsPieces, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_NBestEncodeAsIds", _wrap_SentencePieceProcessor_NBestEncodeAsIds, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_SampleEncodeAsPieces", _wrap_SentencePieceProcessor_SampleEncodeAsPieces, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_SampleEncodeAsIds", _wrap_SentencePieceProcessor_SampleEncodeAsIds, METH_VARARGS, NULL}, + { "SentencePieceProcessor_SampleEncodeAndScoreAsPieces", _wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces, METH_VARARGS, NULL}, + { "SentencePieceProcessor_SampleEncodeAndScoreAsIds", _wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_DecodePieces", _wrap_SentencePieceProcessor_DecodePieces, METH_VARARGS, NULL}, + { "SentencePieceProcessor_CalculateEntropy", _wrap_SentencePieceProcessor_CalculateEntropy, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_EncodeAsSerializedProto", _wrap_SentencePieceProcessor_EncodeAsSerializedProto, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_SampleEncodeAsSerializedProto", _wrap_SentencePieceProcessor_SampleEncodeAsSerializedProto, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_NBestEncodeAsSerializedProto", _wrap_SentencePieceProcessor_NBestEncodeAsSerializedProto, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_DecodePiecesAsSerializedProto", _wrap_SentencePieceProcessor_DecodePiecesAsSerializedProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor_GetPieceSize", _wrap_SentencePieceProcessor_GetPieceSize, METH_O, NULL}, + { "SentencePieceProcessor_PieceToId", _wrap_SentencePieceProcessor_PieceToId, METH_VARARGS, NULL}, + { "SentencePieceProcessor_IdToPiece", _wrap_SentencePieceProcessor_IdToPiece, METH_VARARGS, NULL}, +@@ -6219,14 +6925,27 @@ static 
PyMethodDef SwigMethods[] = { + { "SentencePieceProcessor_pad_id", _wrap_SentencePieceProcessor_pad_id, METH_O, NULL}, + { "SentencePieceProcessor_serialized_model_proto", _wrap_SentencePieceProcessor_serialized_model_proto, METH_O, NULL}, + { "SentencePieceProcessor_LoadFromFile", _wrap_SentencePieceProcessor_LoadFromFile, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_DecodeIdsWithCheck", _wrap_SentencePieceProcessor_DecodeIdsWithCheck, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck", _wrap_SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsIds", _wrap_SentencePieceProcessor__EncodeAsIds, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsPieces", _wrap_SentencePieceProcessor__EncodeAsPieces, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__EncodeAsSerializedProto", _wrap_SentencePieceProcessor__EncodeAsSerializedProto, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__EncodeAsIdsBatch", _wrap_SentencePieceProcessor__EncodeAsIdsBatch, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__EncodeAsPiecesBatch", _wrap_SentencePieceProcessor__EncodeAsPiecesBatch, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__EncodeAsSerializedProtoBatch", _wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__DecodeIds", _wrap_SentencePieceProcessor__DecodeIds, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__DecodePieces", _wrap_SentencePieceProcessor__DecodePieces, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__DecodeIdsAsSerializedProto", _wrap_SentencePieceProcessor__DecodeIdsAsSerializedProto, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__DecodePiecesAsSerializedProto", _wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__DecodeIdsBatch", _wrap_SentencePieceProcessor__DecodeIdsBatch, METH_VARARGS, NULL}, ++ { 
"SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch", _wrap_SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__DecodePiecesBatch", _wrap_SentencePieceProcessor__DecodePiecesBatch, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch", _wrap_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__NBestEncodeAsIds", _wrap_SentencePieceProcessor__NBestEncodeAsIds, METH_VARARGS, NULL}, + { "SentencePieceProcessor__NBestEncodeAsPieces", _wrap_SentencePieceProcessor__NBestEncodeAsPieces, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__NBestEncodeAsSerializedProto", _wrap_SentencePieceProcessor__NBestEncodeAsSerializedProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__SampleEncodeAndScoreAsIds", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds, METH_VARARGS, NULL}, + { "SentencePieceProcessor__SampleEncodeAndScoreAsPieces", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__CalculateEntropy", _wrap_SentencePieceProcessor__CalculateEntropy, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__CalculateEntropyBatch", _wrap_SentencePieceProcessor__CalculateEntropyBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor_swigregister", SentencePieceProcessor_swigregister, METH_O, NULL}, + { "SentencePieceProcessor_swiginit", SentencePieceProcessor_swiginit, METH_VARARGS, NULL}, + { "SetRandomGeneratorSeed", _wrap_SetRandomGeneratorSeed, METH_O, NULL}, +@@ -6252,8 +6971,11 @@ static swig_type_info _swigt__p_sentencepiece__SentencePieceProcessor = {"_p_sen + static swig_type_info _swigt__p_sentencepiece__SentencePieceTrainer = {"_p_sentencepiece__SentencePieceTrainer", "sentencepiece::SentencePieceTrainer *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_std__string = {"_p_std__string", "sentencepiece::util::bytes *|std::string *", 0, 0, (void*)0, 0}; + 
static swig_type_info _swigt__p_std__unordered_mapT_std__string_std__string_t = {"_p_std__unordered_mapT_std__string_std__string_t", "std::unordered_map< std::string,std::string > *", 0, 0, (void*)0, 0}; ++static swig_type_info _swigt__p_std__vectorT_absl__string_view_t = {"_p_std__vectorT_absl__string_view_t", "std::vector< absl::string_view > *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_std__vectorT_int_t = {"_p_std__vectorT_int_t", "std::vector< int > *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_std__vectorT_std__string_t = {"_p_std__vectorT_std__string_t", "std::vector< std::string > *", 0, 0, (void*)0, 0}; ++static swig_type_info _swigt__p_std__vectorT_std__vectorT_int_t_t = {"_p_std__vectorT_std__vectorT_int_t_t", "std::vector< std::vector< int > > *", 0, 0, (void*)0, 0}; ++static swig_type_info _swigt__p_std__vectorT_std__vectorT_std__string_t_t = {"_p_std__vectorT_std__vectorT_std__string_t_t", "std::vector< std::vector< std::string > > *", 0, 0, (void*)0, 0}; + + static swig_type_info *swig_type_initial[] = { + &_swigt__p_char, +@@ -6262,8 +6984,11 @@ static swig_type_info *swig_type_initial[] = { + &_swigt__p_sentencepiece__SentencePieceTrainer, + &_swigt__p_std__string, + &_swigt__p_std__unordered_mapT_std__string_std__string_t, ++ &_swigt__p_std__vectorT_absl__string_view_t, + &_swigt__p_std__vectorT_int_t, + &_swigt__p_std__vectorT_std__string_t, ++ &_swigt__p_std__vectorT_std__vectorT_int_t_t, ++ &_swigt__p_std__vectorT_std__vectorT_std__string_t_t, + }; + + static swig_cast_info _swigc__p_char[] = { {&_swigt__p_char, 0, 0, 0},{0, 0, 0, 0}}; +@@ -6272,8 +6997,11 @@ static swig_cast_info _swigc__p_sentencepiece__SentencePieceProcessor[] = { {&_ + static swig_cast_info _swigc__p_sentencepiece__SentencePieceTrainer[] = { {&_swigt__p_sentencepiece__SentencePieceTrainer, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_std__string[] = { {&_swigt__p_std__string, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info 
_swigc__p_std__unordered_mapT_std__string_std__string_t[] = { {&_swigt__p_std__unordered_mapT_std__string_std__string_t, 0, 0, 0},{0, 0, 0, 0}}; ++static swig_cast_info _swigc__p_std__vectorT_absl__string_view_t[] = { {&_swigt__p_std__vectorT_absl__string_view_t, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_std__vectorT_int_t[] = { {&_swigt__p_std__vectorT_int_t, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_std__vectorT_std__string_t[] = { {&_swigt__p_std__vectorT_std__string_t, 0, 0, 0},{0, 0, 0, 0}}; ++static swig_cast_info _swigc__p_std__vectorT_std__vectorT_int_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_int_t_t, 0, 0, 0},{0, 0, 0, 0}}; ++static swig_cast_info _swigc__p_std__vectorT_std__vectorT_std__string_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_std__string_t_t, 0, 0, 0},{0, 0, 0, 0}}; + + static swig_cast_info *swig_cast_initial[] = { + _swigc__p_char, +@@ -6282,8 +7010,11 @@ static swig_cast_info *swig_cast_initial[] = { + _swigc__p_sentencepiece__SentencePieceTrainer, + _swigc__p_std__string, + _swigc__p_std__unordered_mapT_std__string_std__string_t, ++ _swigc__p_std__vectorT_absl__string_view_t, + _swigc__p_std__vectorT_int_t, + _swigc__p_std__vectorT_std__string_t, ++ _swigc__p_std__vectorT_std__vectorT_int_t_t, ++ _swigc__p_std__vectorT_std__vectorT_std__string_t_t, + }; + + +diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py +index b747e81..99e36f3 100755 +--- a/python/test/sentencepiece_test.py ++++ b/python/test/sentencepiece_test.py +@@ -15,7 +15,6 @@ + # See the License for the specific language governing permissions and + # limitations under the License.! 
+ +-import codecs + import io + import sentencepiece as spm + import unittest +@@ -62,6 +61,17 @@ class TestSentencepieceProcessor(unittest.TestCase): + piece = self.sp_.IdToPiece(i) + self.assertEqual(i, self.sp_.PieceToId(piece)) + ++ self.assertEqual(1000, self.sp_.get_piece_size()) ++ self.assertEqual(0, self.sp_.piece_to_id('')) ++ self.assertEqual(1, self.sp_.piece_to_id('')) ++ self.assertEqual(2, self.sp_.piece_to_id('')) ++ self.assertEqual('', self.sp_.id_to_piece(0)) ++ self.assertEqual('', self.sp_.id_to_piece(1)) ++ self.assertEqual('', self.sp_.id_to_piece(2)) ++ for i in range(self.sp_.get_piece_size()): ++ piece = self.sp_.id_to_piece(i) ++ self.assertEqual(i, self.sp_.piece_to_id(piece)) ++ + def test_roundtrip(self): + text = 'I saw a girl with a telescope.' + ids = self.sp_.EncodeAsIds(text) +@@ -82,6 +92,34 @@ class TestSentencepieceProcessor(unittest.TestCase): + self.assertEqual( + text, self.sp_.DecodeIds(self.sp_.SampleEncodeAsIds(text, -1, 0.5))) + ++ ids2 = self.sp_.encode_as_ids(text) ++ pieces3 = self.sp_.encode_as_pieces(text) ++ pieces4 = self.sp_.nbest_encode_as_pieces(text, 10)[0] ++ self.assertEqual(pieces3, pieces4) ++ self.assertEqual(pieces1, pieces3) ++ self.assertEqual(ids, ids2) ++ self.assertEqual(text, self.sp_.decode_pieces(pieces3)) ++ self.assertEqual(text, self.sp_.decode_ids(ids2)) ++ for n in range(100): ++ self.assertEqual( ++ text, ++ self.sp_.decode_pieces( ++ self.sp_.sample_encode_as_pieces(text, 64, 0.5))) ++ self.assertEqual( ++ text, ++ self.sp_.decode_pieces( ++ self.sp_.sample_encode_as_pieces(text, -1, 0.5))) ++ self.assertEqual( ++ text, ++ self.sp_.decode_ids(self.sp_.sample_encode_as_ids(text, 64, 0.5))) ++ self.assertEqual( ++ text, ++ self.sp_.decode_ids(self.sp_.sample_encode_as_ids(text, -1, 0.5))) ++ ++ self.assertEqual( ++ self.sp_.calculate_entropy(text, 0.1), ++ self.sp_.CalculateEntropy(text, 0.1)) ++ + def test_ja_load(self): + self.assertEqual(8000, self.jasp_.GetPieceSize()) + 
self.assertEqual(0, self.jasp_.PieceToId('')) +@@ -94,6 +132,17 @@ class TestSentencepieceProcessor(unittest.TestCase): + piece = self.jasp_.IdToPiece(i) + self.assertEqual(i, self.jasp_.PieceToId(piece)) + ++ self.assertEqual(8000, self.jasp_.get_piece_size()) ++ self.assertEqual(0, self.jasp_.piece_to_id('')) ++ self.assertEqual(1, self.jasp_.piece_to_id('')) ++ self.assertEqual(2, self.jasp_.piece_to_id('')) ++ self.assertEqual('', self.jasp_.id_to_piece(0)) ++ self.assertEqual('', self.jasp_.id_to_piece(1)) ++ self.assertEqual('', self.jasp_.id_to_piece(2)) ++ for i in range(self.jasp_.get_piece_size()): ++ piece = self.jasp_.id_to_piece(i) ++ self.assertEqual(i, self.jasp_.piece_to_id(piece)) ++ + def test_ja_roundtrip(self): + text = '清水寺は京都にある。' + ids = self.jasp_.EncodeAsIds(text) +@@ -112,40 +161,27 @@ class TestSentencepieceProcessor(unittest.TestCase): + self.jasp_.DecodePieces( + self.jasp_.SampleEncodeAsPieces(text, -1, 0.5))) + +- def test_unicode_roundtrip(self): +- text = u'I saw a girl with a telescope.' +- ids = self.sp_.EncodeAsIds(text) +- pieces = self.sp_.EncodeAsPieces(text) +- self.assertEqual(text, self.sp_.DecodePieces(pieces)) +- self.assertEqual(text, self.sp_.DecodeIds(ids)) +- # python2 returns `str`. +- if sys.version_info < (3, 0, 0): +- text = text.encode('utf-8') +- self.assertEqual(text, self.sp_.DecodeIds(ids)) +- self.assertEqual(text, self.sp_.DecodePieces(pieces)) +- +- def test_unicode_ja_roundtrip(self): +- text = u'清水寺は京都にある。' +- ids = self.jasp_.EncodeAsIds(text) +- pieces = self.jasp_.EncodeAsPieces(text) +- self.assertEqual(text, self.jasp_.DecodePieces(pieces)) +- # python2 returns `str`. 
+- if sys.version_info < (3, 0, 0): +- text = text.encode('utf-8') +- self.assertEqual(text, self.jasp_.DecodeIds(ids)) +- +- def test_pickle(self): +- with open('sp.pickle', 'wb') as f: +- pickle.dump(self.sp_, f) +- +- id1 = self.sp_.encode('hello world.', out_type=int) +- +- with open('sp.pickle', 'rb') as f: +- sp = pickle.load(f) +- +- id2 = sp.encode('hello world.', out_type=int) ++ ids2 = self.jasp_.encode_as_ids(text) ++ pieces3 = self.jasp_.encode_as_pieces(text) ++ pieces4 = self.jasp_.nbest_encode_as_pieces(text, 10)[0] ++ self.assertEqual(pieces3, pieces4) ++ self.assertEqual(pieces1, pieces3) ++ self.assertEqual(ids, ids2) ++ self.assertEqual(text, self.jasp_.decode_pieces(pieces1)) ++ self.assertEqual(text, self.jasp_.decode_ids(ids2)) ++ for n in range(100): ++ self.assertEqual( ++ text, ++ self.jasp_.decode_pieces( ++ self.jasp_.sample_encode_as_pieces(text, 64, 0.5))) ++ self.assertEqual( ++ text, ++ self.jasp_.decode_pieces( ++ self.jasp_.sample_encode_as_pieces(text, -1, 0.5))) + +- self.assertEqual(id1, id2) ++ self.assertEqual( ++ self.jasp_.calculate_entropy(text, 0.1), ++ self.jasp_.CalculateEntropy(text, 0.1)) + + def test_train(self): + spm.SentencePieceTrainer.Train('--input=' + +@@ -153,37 +189,45 @@ class TestSentencepieceProcessor(unittest.TestCase): + ' --model_prefix=m --vocab_size=1000') + sp = spm.SentencePieceProcessor() + sp.Load('m.model') +- with codecs.open( +- os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file: ++ with open(os.path.join(data_dir, 'botchan.txt'), 'r') as file: + for line in file: + sp.DecodePieces(sp.EncodeAsPieces(line)) + sp.DecodeIds(sp.EncodeAsIds(line)) + +- def test_train(self): ++ def test_train_iterator(self): + spm.SentencePieceTrainer.Train('--input=' + + os.path.join(data_dir, 'botchan.txt') + + ' --model_prefix=m --vocab_size=1000') + # Load as 'rb' for Python3.5/2.7. 
+- is1 = open(os.path.join(data_dir, 'botchan.txt'), 'rb') +- is2 = open(os.path.join(data_dir, 'botchan.txt'), 'rb') + os1 = io.BytesIO() + os2 = io.BytesIO() + ++ # suppress logging (redirect to /dev/null) + spm.SentencePieceTrainer.train( + input=os.path.join(data_dir, 'botchan.txt'), + model_prefix='m', +- vocab_size=1000) ++ vocab_size=1000, ++ logstream=open(os.devnull, 'w')) + +- spm.SentencePieceTrainer.train( +- sentence_iterator=is1, model_prefix='m', vocab_size=1000) ++ with open(os.path.join(data_dir, 'botchan.txt'), 'rb') as is1: ++ spm.SentencePieceTrainer.train( ++ sentence_iterator=is1, ++ model_prefix='m', ++ vocab_size=1000, ++ logstream=open(os.devnull, 'w')) + + spm.SentencePieceTrainer.train( + input=os.path.join(data_dir, 'botchan.txt'), + model_writer=os1, +- vocab_size=1000) ++ vocab_size=1000, ++ logstream=open(os.devnull, 'w')) + +- spm.SentencePieceTrainer.train( +- sentence_iterator=is2, model_writer=os2, vocab_size=1000) ++ with open(os.path.join(data_dir, 'botchan.txt'), 'rb') as is2: ++ spm.SentencePieceTrainer.train( ++ sentence_iterator=is2, ++ model_writer=os2, ++ vocab_size=1000, ++ logstream=open(os.devnull, 'w')) + + sp1 = spm.SentencePieceProcessor(model_proto=os1.getvalue()) + sp2 = spm.SentencePieceProcessor(model_proto=os2.getvalue()) +@@ -200,127 +244,37 @@ class TestSentencepieceProcessor(unittest.TestCase): + logstream=open(os.devnull, 'w')) + sp = spm.SentencePieceProcessor() + sp.Load('m.model') +- with codecs.open( ++ with open( + os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file: + for line in file: + sp.DecodePieces(sp.EncodeAsPieces(line)) + sp.DecodeIds(sp.EncodeAsIds(line)) + +- # snake case API. 
+- def test_load_snake(self): +- self.assertEqual(1000, self.sp_.get_piece_size()) +- self.assertEqual(0, self.sp_.piece_to_id('')) +- self.assertEqual(1, self.sp_.piece_to_id('')) +- self.assertEqual(2, self.sp_.piece_to_id('')) +- self.assertEqual('', self.sp_.id_to_piece(0)) +- self.assertEqual('', self.sp_.id_to_piece(1)) +- self.assertEqual('', self.sp_.id_to_piece(2)) +- for i in range(self.sp_.get_piece_size()): +- piece = self.sp_.id_to_piece(i) +- self.assertEqual(i, self.sp_.piece_to_id(piece)) +- +- def test_roundtrip_snake(self): +- text = 'I saw a girl with a telescope.' +- ids = self.sp_.encode_as_ids(text) +- pieces1 = self.sp_.encode_as_pieces(text) +- pieces2 = self.sp_.nbest_encode_as_pieces(text, 10)[0] +- self.assertEqual(pieces1, pieces2) +- self.assertEqual(text, self.sp_.decode_pieces(pieces1)) +- self.assertEqual(text, self.sp_.decode_ids(ids)) +- for n in range(100): +- self.assertEqual( +- text, +- self.sp_.decode_pieces( +- self.sp_.sample_encode_as_pieces(text, 64, 0.5))) +- self.assertEqual( +- text, +- self.sp_.decode_pieces( +- self.sp_.sample_encode_as_pieces(text, -1, 0.5))) +- self.assertEqual( +- text, +- self.sp_.decode_ids(self.sp_.sample_encode_as_ids(text, 64, 0.5))) +- self.assertEqual( +- text, +- self.sp_.decode_ids(self.sp_.sample_encode_as_ids(text, -1, 0.5))) +- +- def test_ja_load_snake(self): +- self.assertEqual(8000, self.jasp_.get_piece_size()) +- self.assertEqual(0, self.jasp_.piece_to_id('')) +- self.assertEqual(1, self.jasp_.piece_to_id('')) +- self.assertEqual(2, self.jasp_.piece_to_id('')) +- self.assertEqual('', self.jasp_.id_to_piece(0)) +- self.assertEqual('', self.jasp_.id_to_piece(1)) +- self.assertEqual('', self.jasp_.id_to_piece(2)) +- for i in range(self.jasp_.get_piece_size()): +- piece = self.jasp_.id_to_piece(i) +- self.assertEqual(i, self.jasp_.piece_to_id(piece)) +- +- def test_ja_roundtrip_snake(self): +- text = '清水寺は京都にある。' +- ids = self.jasp_.encode_as_ids(text) +- pieces1 = 
self.jasp_.encode_as_pieces(text) +- pieces2 = self.jasp_.nbest_encode_as_pieces(text, 10)[0] +- self.assertEqual(pieces1, pieces2) +- self.assertEqual(text, self.jasp_.decode_pieces(pieces1)) +- self.assertEqual(text, self.jasp_.decode_ids(ids)) +- for n in range(100): +- self.assertEqual( +- text, +- self.jasp_.decode_pieces( +- self.jasp_.sample_encode_as_pieces(text, 64, 0.5))) +- self.assertEqual( +- text, +- self.jasp_.decode_pieces( +- self.jasp_.sample_encode_as_pieces(text, -1, 0.5))) +- +- def test_unicode_roundtrip_snake(self): +- text = u'I saw a girl with a telescope.' +- ids = self.sp_.encode_as_ids(text) +- pieces = self.sp_.encode_as_pieces(text) +- self.assertEqual(text, self.sp_.decode_pieces(pieces)) +- # python2 returns `str`. +- if sys.version_info < (3, 0, 0): +- text = text.encode('utf-8') +- self.assertEqual(text, self.sp_.decode_ids(ids)) +- +- def test_unicode_ja_roundtrip_snake(self): +- text = u'清水寺は京都にある。' +- ids = self.jasp_.encode_as_ids(text) +- pieces = self.jasp_.encode_as_pieces(text) +- self.assertEqual(text, self.jasp_.decode_pieces(pieces)) +- # python2 returns `str`. +- if sys.version_info < (3, 0, 0): +- text = text.encode('utf-8') +- self.assertEqual(text, self.jasp_.decode_ids(ids)) +- +- def test_train_snake(self): +- spm.SentencePieceTrainer.train('--input=' + +- os.path.join(data_dir, 'botchan.txt') + +- ' --model_prefix=m --vocab_size=1000') +- sp = spm.SentencePieceProcessor() +- sp.load('m.model') +- with codecs.open( +- os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file: +- for line in file: +- sp.decode_pieces(sp.encode_as_pieces(line)) +- sp.decode_ids(sp.encode_as_ids(line)) +- + def test_serialized_proto(self): +- text = u'I saw a girl with a telescope.' 
+- self.assertNotEqual('', self.sp_.EncodeAsSerializedProto(text)) +- self.assertNotEqual('', +- self.sp_.SampleEncodeAsSerializedProto(text, 10, 0.2)) +- self.assertNotEqual('', self.sp_.NBestEncodeAsSerializedProto(text, 10)) +- self.assertNotEqual('', +- self.sp_.DecodePiecesAsSerializedProto(['foo', 'bar'])) +- self.assertNotEqual('', self.sp_.DecodeIdsAsSerializedProto([20, 30])) +- self.assertNotEqual('', self.sp_.encode_as_serialized_proto(text)) +- self.assertNotEqual( +- '', self.sp_.sample_encode_as_serialized_proto(text, 10, 0.2)) +- self.assertNotEqual('', self.sp_.nbest_encode_as_serialized_proto(text, 10)) +- self.assertNotEqual( +- '', self.sp_.decode_pieces_as_serialized_proto(['foo', 'bar'])) +- self.assertNotEqual('', self.sp_.decode_ids_as_serialized_proto([20, 30])) ++ text = 'I saw a girl with a telescope.' ++ s1 = self.sp_.EncodeAsSerializedProto(text) ++ s2 = self.sp_.SampleEncodeAsSerializedProto(text, 10, 0.2) ++ s3 = self.sp_.NBestEncodeAsSerializedProto(text, 10) ++ s4 = self.sp_.DecodePiecesAsSerializedProto(['foo', 'bar']) ++ s5 = self.sp_.DecodeIdsAsSerializedProto([20, 30]) ++ ++ t1 = self.sp_.encode_as_serialized_proto(text) ++ t2 = self.sp_.sample_encode_as_serialized_proto(text, 10, 0.2) ++ t3 = self.sp_.nbest_encode_as_serialized_proto(text, 10) ++ t4 = self.sp_.decode_pieces_as_serialized_proto(['foo', 'bar']) ++ t5 = self.sp_.decode_ids_as_serialized_proto([20, 30]) ++ ++ self.assertEqual(type(s1), bytes) ++ self.assertEqual(type(s2), bytes) ++ self.assertEqual(type(t2), bytes) ++ self.assertEqual(type(s3), bytes) ++ self.assertEqual(type(s4), bytes) ++ self.assertEqual(type(s5), bytes) ++ ++ self.assertEqual(s1, t1) ++ self.assertEqual(s3, t3) ++ self.assertEqual(s4, t4) ++ self.assertEqual(s5, t5) + + def test_new_api(self): + sp = spm.SentencePieceProcessor( +@@ -331,19 +285,33 @@ class TestSentencepieceProcessor(unittest.TestCase): + ids2 = self.sp_.EncodeAsIds(text2) + pieces = self.sp_.EncodeAsPieces(text) + pieces2 = 
self.sp_.EncodeAsPieces(text2) +- self.assertEqual(sp.encode(text), ids) ++ protos = self.sp_.EncodeAsSerializedProto(text) ++ proto2 = self.sp_.EncodeAsSerializedProto(text2) ++ ++ self.assertEqual(sp.encode(text, out_type=int), ids) + self.assertEqual(sp.encode(text, out_type=str), pieces) ++ self.assertEqual(sp.encode(text, out_type='proto'), protos) ++ ++ self.assertEqual(sp.encode([text], out_type=int), [ids]) ++ self.assertEqual(sp.encode([text], out_type=str), [pieces]) ++ self.assertEqual(sp.encode([text], out_type='proto'), [protos]) ++ + detok_ids = self.sp_.DecodeIds(ids) + detok_pieces = self.sp_.DecodePieces(pieces) + self.assertEqual(sp.decode(ids), detok_ids) + self.assertEqual(sp.decode(pieces), detok_pieces) ++ self.assertEqual(sp.decode([]), '') ++ self.assertEqual(sp.decode([[]]), ['']) + + # add_bos, add_eos, reverse + self.assertEqual([sp.bos_id()] + ids, sp.encode(text, add_bos=True)) + self.assertEqual(ids + [sp.eos_id()], sp.encode(text, add_eos=True)) ++ self.assertEqual(ids + [sp.eos_id()], sp.EncodeAsIds(text, add_eos=True)) + rids = ids[:] + rids.reverse() ++ + self.assertEqual(rids, sp.encode(text, reverse=True)) ++ self.assertEqual(rids, sp.EncodeAsIds(text, reverse=True)) + + # different shape. 
+ self.assertEqual([ids, ids2], sp.encode([text, text2])) +@@ -351,6 +319,29 @@ class TestSentencepieceProcessor(unittest.TestCase): + self.assertEqual([text, text2], sp.decode([ids, ids2])) + self.assertEqual([text, text2], sp.decode([pieces, pieces2])) + ++ pieces = list(reversed(self.sp_.EncodeAsPieces(text))) ++ self.assertEqual(pieces, sp.encode(text, reverse=True, out_type=str)) ++ ++ # emit unk piece ++ unk_char = '藤' ++ pieces = self.sp_.EncodeAsIds(unk_char, emit_unk_piece=True) ++ pieces2 = self.sp_.encode(unk_char, out_type=int, emit_unk_piece=True) ++ self.assertEqual(pieces[1], sp.unk_id()) ++ self.assertEqual(pieces2[1], sp.unk_id()) ++ self.assertEqual(pieces, pieces2) ++ ++ pieces = self.sp_.EncodeAsPieces(unk_char, emit_unk_piece=True) ++ pieces2 = self.sp_.encode(unk_char, out_type=str, emit_unk_piece=True) ++ self.assertEqual(pieces[1], '') ++ self.assertEqual(pieces2[1], '') ++ self.assertEqual(pieces, pieces2) ++ ++ pieces = self.sp_.EncodeAsPieces(unk_char, emit_unk_piece=False) ++ pieces2 = self.sp_.encode(unk_char, out_type=str, emit_unk_piece=False) ++ self.assertEqual(pieces[1], unk_char) ++ self.assertEqual(pieces2[1], unk_char) ++ self.assertEqual(pieces, pieces2) ++ + def test_new_api_init(self): + sp = spm.SentencePieceProcessor( + model_file=os.path.join('test', 'test_model.model'), +@@ -361,7 +352,10 @@ class TestSentencepieceProcessor(unittest.TestCase): + pieces = [''] + self.sp_.EncodeAsPieces(text) + [''] + self.assertEqual(pieces, sp.encode(text)) + +- def test_new_api_sampling(self): ++ pieces = self.sp_.EncodeAsPieces(text) + [''] ++ self.assertEqual(pieces, sp.encode(text, add_bos=False, add_eos=True)) ++ ++ def test_sampling(self): + sp = spm.SentencePieceProcessor( + model_file=os.path.join('test', 'test_model.model'), + out_type=str, +@@ -376,25 +370,35 @@ class TestSentencepieceProcessor(unittest.TestCase): + ++ids2[' '.join(sp.encode('hello world', enable_sampling=False))] + self.assertEqual(len(ids2), 1) + +- def 
test_new_api_nbest(self): ++ def test_nbest(self): + sp = spm.SentencePieceProcessor( + model_file=os.path.join('test', 'test_model.model')) +- results = sp.nbest_encode('hello world', nbest_size=10, out_type=str) ++ text = 'hello world' ++ results = sp.nbest_encode(text, nbest_size=10, out_type=str) ++ self.assertEqual(results, sp.NBestEncode(text, nbest_size=10, out_type=str)) + for n in results: +- self.assertEqual(sp.decode(n), 'hello world') +- results = sp.nbest_encode('hello world', nbest_size=10, out_type=int) ++ self.assertEqual(sp.decode(n), text) ++ decoded = sp.decode(results) ++ for n in decoded: ++ self.assertEqual(n, text) ++ results = sp.nbest_encode(text, nbest_size=10, out_type=int) ++ self.assertEqual(results, sp.NBestEncode(text, nbest_size=10, out_type=int)) + for n in results: +- self.assertEqual(sp.decode(n), 'hello world') ++ self.assertEqual(sp.decode(n), text) ++ decoded = sp.decode(results) ++ for n in decoded: ++ self.assertEqual(n, text) + +- def test_new_api_sample_and_score(self): ++ def test_sample_and_score(self): + sp = spm.SentencePieceProcessor( + model_file=os.path.join('test', 'test_model.model')) +- results = sp.sample_encode_and_score('hello world', wor=True, out_type=str) ++ text = 'hello world' ++ results = sp.sample_encode_and_score(text, wor=True, out_type=str) + for n in results: +- self.assertEqual(sp.decode(n[0]), 'hello world') +- results = sp.sample_encode_and_score('hello world', wor=True, out_type=int) ++ self.assertEqual(sp.decode(n[0]), text) ++ results = sp.sample_encode_and_score(text, wor=True, out_type=int) + for n in results: +- self.assertEqual(sp.decode(n[0]), 'hello world') ++ self.assertEqual(sp.decode(n[0]), text) + + def test_valid_range(self): + size = self.sp_.piece_size() +@@ -412,6 +416,82 @@ class TestSentencepieceProcessor(unittest.TestCase): + except: + self.assertTrue(True) + ++ def test_batch(self): ++ sp = spm.SentencePieceProcessor( ++ model_file=os.path.join('test', 'test_model.model')) ++ 
with open( ++ os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file: ++ texts = file.readlines() ++ ++ r1 = sp.encode(texts, out_type=str, num_threads=None) ++ r2 = sp.encode(texts, out_type=str, num_threads=1) ++ r3 = sp.encode(texts, out_type=str, num_threads=-1) ++ r4 = sp.encode(texts, out_type=str, num_threads=8) ++ r5 = [sp.encode(s, out_type=str) for s in texts] ++ self.assertEqual(r1, r2) ++ self.assertEqual(r1, r3) ++ self.assertEqual(r1, r4) ++ self.assertEqual(r1, r5) ++ ++ d1 = sp.decode(r1, num_threads=None) ++ d2 = sp.decode(r2, num_threads=1) ++ d3 = sp.decode(r3, num_threads=-1) ++ d4 = sp.decode(r4, num_threads=8) ++ d5 = [sp.decode(s) for s in r5] ++ self.assertEqual(d1, d2) ++ self.assertEqual(d1, d3) ++ self.assertEqual(d1, d4) ++ self.assertEqual(d1, d5) ++ ++ r1 = sp.encode(texts, out_type=int, num_threads=None) ++ r2 = sp.encode(texts, out_type=int, num_threads=1) ++ r3 = sp.encode(texts, out_type=int, num_threads=-1) ++ r4 = sp.encode(texts, out_type=int, num_threads=8) ++ r5 = [sp.encode(s, out_type=int) for s in texts] ++ self.assertEqual(r1, r2) ++ self.assertEqual(r1, r3) ++ self.assertEqual(r1, r4) ++ self.assertEqual(r1, r5) ++ ++ d1 = sp.decode(r1, num_threads=None) ++ d2 = sp.decode(r2, num_threads=1) ++ d3 = sp.decode(r3, num_threads=-1) ++ d4 = sp.decode(r4, num_threads=8) ++ d5 = [sp.decode(s) for s in r5] ++ self.assertEqual(d1, d2) ++ self.assertEqual(d1, d3) ++ self.assertEqual(d1, d4) ++ self.assertEqual(d1, d5) ++ ++ r1 = sp.encode(texts, out_type='proto', num_threads=None) ++ r2 = sp.encode(texts, out_type='proto', num_threads=1) ++ r3 = sp.encode(texts, out_type='proto', num_threads=-1) ++ r4 = sp.encode(texts, out_type='proto', num_threads=8) ++ r5 = [sp.encode(s, out_type='proto') for s in texts] ++ self.assertEqual(r1, r2) ++ self.assertEqual(r1, r3) ++ self.assertEqual(r1, r4) ++ self.assertEqual(r1, r5) ++ ++ e1 = sp.calculate_entropy(texts, theta=1.0, num_threads=10) ++ e2 = sp.CalculateEntropy(texts, 
theta=1.0, num_threads=10) ++ e3 = [sp.calculate_entropy(s, theta=1.0) for s in texts] ++ self.assertEqual(e1, e2) ++ self.assertEqual(e1, e3) ++ ++ def test_pickle(self): ++ with open('sp.pickle', 'wb') as f: ++ pickle.dump(self.sp_, f) ++ ++ id1 = self.sp_.encode('hello world.', out_type=int) ++ ++ with open('sp.pickle', 'rb') as f: ++ sp = pickle.load(f) ++ ++ id2 = sp.encode('hello world.', out_type=int) ++ ++ self.assertEqual(id1, id2) ++ + + def suite(): + suite = unittest.TestSuite() diff --git a/patches/0002-remove-debug-symbols-from-wheel-package.patch b/patches/0002-remove-debug-symbols-from-wheel-package.patch new file mode 100644 index 0000000..fbbf19e --- /dev/null +++ b/patches/0002-remove-debug-symbols-from-wheel-package.patch @@ -0,0 +1,23 @@ +From: Taku Kudo +Date: Wed, 8 Jun 2022 16:38:21 +0900 +Subject: remove debug symbols from wheel package + +Signed-off-by: Kentaro Hayashi +--- + python/setup.py | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/python/setup.py b/python/setup.py +index cfbf0db..198cba7 100755 +--- a/python/setup.py ++++ b/python/setup.py +@@ -93,6 +93,9 @@ class build_ext(_build_ext): + # See: https://github.com/neulab/xnmt/issues/199 + if sys.platform == 'darwin': + cflags.append('-mmacosx-version-min=10.9') ++ else: ++ cflags.append('-Wl,-strip-all') ++ libs.append('-Wl,-strip-all') + print('## cflags={}'.format(' '.join(cflags))) + print('## libs={}'.format(' '.join(libs))) + ext.extra_compile_args = cflags diff --git a/patches/0003-allow-tab-character-to-be-used-in-user_defined_symbo.patch b/patches/0003-allow-tab-character-to-be-used-in-user_defined_symbo.patch new file mode 100644 index 0000000..0febd09 --- /dev/null +++ b/patches/0003-allow-tab-character-to-be-used-in-user_defined_symbo.patch @@ -0,0 +1,71 @@ +From: Taku Kudo +Date: Mon, 13 Jun 2022 03:20:23 +0900 +Subject: allow tab character to be used in user_defined_symbols. 
+ +Signed-off-by: Kentaro Hayashi +--- + src/trainer_interface.cc | 11 ++++++++++- + src/util.cc | 5 ++--- + 2 files changed, 12 insertions(+), 4 deletions(-) + +diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc +index ef0c370..5e26b75 100644 +--- a/src/trainer_interface.cc ++++ b/src/trainer_interface.cc +@@ -12,6 +12,8 @@ + // See the License for the specific language governing permissions and + // limitations under the License.! + ++#include "trainer_interface.h" ++ + #include + #include + #include +@@ -35,7 +37,6 @@ + #include "third_party/absl/strings/str_format.h" + #include "third_party/absl/strings/str_join.h" + #include "third_party/absl/strings/str_split.h" +-#include "trainer_interface.h" + #include "unicode_script.h" + #include "util.h" + +@@ -699,6 +700,14 @@ util::Status TrainerInterface::SaveVocab(absl::string_view filename) const { + auto output = filesystem::NewWritableFile(filename); + RETURN_IF_ERROR(output->status()); + ++ for (const auto &piece : model_proto.pieces()) { ++ if (piece.piece().find_first_of(" \t\r\n") != std::string::npos) { ++ LOG(WARNING) << "The piece [" << piece.piece() ++ << "] contains escaped characters that break the format of " ++ << filename; ++ } ++ } ++ + if (trainer_spec_.vocabulary_output_piece_score()) { + for (const auto &piece : model_proto.pieces()) { + std::ostringstream os; +diff --git a/src/util.cc b/src/util.cc +index 8424448..8da16c4 100644 +--- a/src/util.cc ++++ b/src/util.cc +@@ -12,10 +12,10 @@ + // See the License for the specific language governing permissions and + // limitations under the License.! 
+ +-#include +- + #include "util.h" + ++#include ++ + namespace sentencepiece { + + namespace { +@@ -217,7 +217,6 @@ std::vector StrSplitAsCSV(absl::string_view text) { + + std::vector result; + for (; str < eos; ++str) { +- while (*str == ' ' || *str == '\t') ++str; + if (*str == '"') { + start = ++str; + end = start; diff --git a/patches/0004-add-test-to-use-tab-as-user-defined-symbols.patch b/patches/0004-add-test-to-use-tab-as-user-defined-symbols.patch new file mode 100644 index 0000000..96a6c9a --- /dev/null +++ b/patches/0004-add-test-to-use-tab-as-user-defined-symbols.patch @@ -0,0 +1,45 @@ +From: Taku Kudo +Date: Mon, 13 Jun 2022 16:46:18 +0900 +Subject: add test to use tab as user defined symbols.. + +Signed-off-by: Kentaro Hayashi +--- + python/test/sentencepiece_test.py | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py +index 99e36f3..6c48bcd 100755 +--- a/python/test/sentencepiece_test.py ++++ b/python/test/sentencepiece_test.py +@@ -240,16 +240,18 @@ class TestSentencepieceProcessor(unittest.TestCase): + input=[os.path.join(data_dir, 'botchan.txt')], + model_prefix='m', + vocab_size=1002, +- user_defined_symbols=['foo', 'bar', ','], ++ user_defined_symbols=['foo', 'bar', ',', ' ', '\t', '\b', '\n', '\r'], + logstream=open(os.devnull, 'w')) + sp = spm.SentencePieceProcessor() + sp.Load('m.model') +- with open( +- os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file: ++ with open(os.path.join(data_dir, 'botchan.txt'), 'r') as file: + for line in file: + sp.DecodePieces(sp.EncodeAsPieces(line)) + sp.DecodeIds(sp.EncodeAsIds(line)) + ++ s = 'hello\tworld\r\nthis\tis a \b pen' ++ self.assertEqual(s, sp.decode(sp.encode(s))) ++ + def test_serialized_proto(self): + text = 'I saw a girl with a telescope.' 
+ s1 = self.sp_.EncodeAsSerializedProto(text) +@@ -419,8 +421,7 @@ class TestSentencepieceProcessor(unittest.TestCase): + def test_batch(self): + sp = spm.SentencePieceProcessor( + model_file=os.path.join('test', 'test_model.model')) +- with open( +- os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file: ++ with open(os.path.join(data_dir, 'botchan.txt'), 'r') as file: + texts = file.readlines() + + r1 = sp.encode(texts, out_type=str, num_threads=None) diff --git a/patches/0005-Uses-C-17-by-default.patch b/patches/0005-Uses-C-17-by-default.patch new file mode 100644 index 0000000..88a8806 --- /dev/null +++ b/patches/0005-Uses-C-17-by-default.patch @@ -0,0 +1,915 @@ +From: Taku Kudo +Date: Tue, 14 Jun 2022 01:18:09 +0900 +Subject: Uses C++17 by default + +Signed-off-by: Kentaro Hayashi +--- + CMakeLists.txt | 4 +- + python/setup.py | 6 +- + src/CMakeLists.txt | 1 - + src/sentencepiece_processor.h | 26 +- + third_party/absl/strings/string_view.cc | 267 ----------------- + third_party/absl/strings/string_view.h | 508 -------------------------------- + 6 files changed, 9 insertions(+), 803 deletions(-) + delete mode 100644 third_party/absl/strings/string_view.cc + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index a791f08..78379a3 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -28,7 +28,7 @@ option(SPM_NO_THREADLOCAL "Disable thread_local operator" OFF) + option(SPM_USE_BUILTIN_PROTOBUF "Use built-in protobuf" ON) + option(SPM_USE_EXTERNAL_ABSL "Use external abseil" OFF) + +-set(CMAKE_CXX_STANDARD 11) ++set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + + if((CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND +@@ -98,6 +98,8 @@ configure_file("${PROJECT_SOURCE_DIR}/config.h.in" "config.h") + configure_file("${PROJECT_SOURCE_DIR}/sentencepiece.pc.in" "sentencepiece.pc" @ONLY) + + if (NOT MSVC) ++ # suppress warning for C++11 features. 
++# add_definitions("-Wno-deprecated-declarations -Wno-deprecated-enum-enum-conversion") + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) + endif() + +diff --git a/python/setup.py b/python/setup.py +index 198cba7..fdf9394 100755 +--- a/python/setup.py ++++ b/python/setup.py +@@ -58,7 +58,7 @@ def is_sentencepiece_installed(): + + + def get_cflags_and_libs(root): +- cflags = ['-std=c++11', '-I' + os.path.join(root, 'include')] ++ cflags = ['-std=c++17', '-I' + os.path.join(root, 'include')] + libs = [] + if os.path.exists(os.path.join(root, 'lib/pkgconfig/sentencepiece.pc')): + libs = [ +@@ -109,13 +109,13 @@ if os.name == 'nt': + if sys.maxsize > 2**32: + arch = 'amd64' + if os.path.exists('..\\build\\root_{}\\lib'.format(arch)): +- cflags = ['/MT', '/I..\\build\\root_{}\\include'.format(arch)] ++ cflags = ['/std:c++17', '/MT', '/I..\\build\\root_{}\\include'.format(arch)] + libs = [ + '..\\build\\root_{}\\lib\\sentencepiece.lib'.format(arch), + '..\\build\\root_{}\\lib\\sentencepiece_train.lib'.format(arch) + ] + else: +- cflags = ['/MT', '/I..\\build\\root\\include'] ++ cflags = ['/std:c++17', '/MT', '/I..\\build\\root\\include'] + libs = [ + '..\\build\\root\\lib\\sentencepiece.lib', + '..\\build\\root\\lib\\sentencepiece_train.lib' +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 8b7fb76..6cb3922 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -25,7 +25,6 @@ if (SPM_USE_EXTERNAL_ABSL) + endif() + else() + set(ABSL_FLAGS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/absl/flags/flag.cc) +- set(ABSL_STRINGS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/absl/strings/string_view.cc) + endif() + + if (SPM_USE_BUILTIN_PROTOBUF) +diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h +index 3f9c20d..9d38214 100644 +--- a/src/sentencepiece_processor.h ++++ b/src/sentencepiece_processor.h +@@ -18,33 +18,13 @@ + #include + #include + #include ++#include + 
#include + #include + +-#if defined(_USE_INTERNAL_STRING_VIEW) +-#include "third_party/absl/strings/string_view.h" +-#elif defined(_USE_TF_STRING_VIEW) +-#include "absl/strings/string_view.h" +-#else +-// Minimum absl::string_view class that is used only for +-// the argument of public APIs. + namespace absl { +-class string_view { +- public: +- string_view() : ptr_(nullptr), length_(0) {} +- string_view(const std::string &str) : ptr_(str.data()), length_(str.size()) {} +- string_view(const char *str) : ptr_(str), length_(std::strlen(str)) {} +- string_view(const char *data, size_t len) : ptr_(data), length_(len) {} +- +- const char *data() const { return ptr_; } +- size_t size() const { return length_; } +- +- private: +- const char *ptr_ = nullptr; +- size_t length_ = 0; +-}; +-} // namespace absl +-#endif ++using std::string_view; ++} + + namespace sentencepiece { + +diff --git a/third_party/absl/strings/string_view.cc b/third_party/absl/strings/string_view.cc +deleted file mode 100644 +index dce208d..0000000 +--- a/third_party/absl/strings/string_view.cc ++++ /dev/null +@@ -1,267 +0,0 @@ +-// Copyright 2017 The Abseil Authors. +-// +-// Licensed under the Apache License, Version 2.0 (the "License"); +-// you may not use this file except in compliance with the License. +-// You may obtain a copy of the License at +-// +-// http://www.apache.org/licenses/LICENSE-2.0 +-// +-// Unless required by applicable law or agreed to in writing, software +-// distributed under the License is distributed on an "AS IS" BASIS, +-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-// See the License for the specific language governing permissions and +-// limitations under the License. 
+- +-#include "third_party/absl/strings/string_view.h" +- +-#ifndef ABSL_HAVE_STD_STRING_VIEW +- +-#include +-#include +-#include +-#include +- +-// #include "absl/strings/internal/memutil.h" +- +-namespace absl { +- +-namespace { +-void WritePadding(std::ostream& o, size_t pad) { +- char fill_buf[32]; +- memset(fill_buf, o.fill(), sizeof(fill_buf)); +- while (pad) { +- size_t n = std::min(pad, sizeof(fill_buf)); +- o.write(fill_buf, n); +- pad -= n; +- } +-} +- +-class LookupTable { +- public: +- // For each character in wanted, sets the index corresponding +- // to the ASCII code of that character. This is used by +- // the find_.*_of methods below to tell whether or not a character is in +- // the lookup table in constant time. +- explicit LookupTable(string_view wanted) { +- for (char c : wanted) { +- table_[Index(c)] = true; +- } +- } +- bool operator[](char c) const { return table_[Index(c)]; } +- +- private: +- static unsigned char Index(char c) { return static_cast(c); } +- bool table_[UCHAR_MAX + 1] = {}; +-}; +- +-} // namespace +- +-std::ostream& operator<<(std::ostream& o, string_view piece) { +- std::ostream::sentry sentry(o); +- if (sentry) { +- size_t lpad = 0; +- size_t rpad = 0; +- if (static_cast(o.width()) > piece.size()) { +- size_t pad = o.width() - piece.size(); +- if ((o.flags() & o.adjustfield) == o.left) { +- rpad = pad; +- } else { +- lpad = pad; +- } +- } +- if (lpad) WritePadding(o, lpad); +- o.write(piece.data(), piece.size()); +- if (rpad) WritePadding(o, rpad); +- o.width(0); +- } +- return o; +-} +- +-string_view::size_type string_view::copy(char* buf, size_type n, +- size_type pos) const { +- size_type ulen = length_; +- assert(pos <= ulen); +- size_type rlen = std::min(ulen - pos, n); +- if (rlen > 0) { +- const char* start = ptr_ + pos; +- std::copy(start, start + rlen, buf); +- } +- return rlen; +-} +- +-namespace { +-const char* memmatch(const char* phaystack, size_t haylen, const char* pneedle, +- size_t neelen) { +- if (0 == 
neelen) { +- return phaystack; // even if haylen is 0 +- } +- if (haylen < neelen) { +- return nullptr; +- } +- const char* match; +- const char* hayend = phaystack + haylen - neelen + 1; +- while ((match = (const char*)(memchr(phaystack, pneedle[0], +- hayend - phaystack)))) { +- if (memcmp(match, pneedle, neelen) == 0) { +- return match; +- } else { +- phaystack = match + 1; +- } +- } +- return nullptr; +-} +-} // namespace +- +-string_view::size_type string_view::find(string_view s, size_type pos) const +- noexcept { +- if (empty() || pos > length_) { +- if (empty() && pos == 0 && s.empty()) return 0; +- return npos; +- } +- const char* result = memmatch(ptr_ + pos, length_ - pos, s.ptr_, s.length_); +- return result ? result - ptr_ : npos; +-} +- +-string_view::size_type string_view::find(char c, size_type pos) const noexcept { +- if (empty() || pos >= length_) { +- return npos; +- } +- const char* result = +- static_cast(memchr(ptr_ + pos, c, length_ - pos)); +- return result != nullptr ? result - ptr_ : npos; +-} +- +-string_view::size_type string_view::rfind(string_view s, size_type pos) const +- noexcept { +- if (length_ < s.length_) return npos; +- if (s.empty()) return std::min(length_, pos); +- const char* last = ptr_ + std::min(length_ - s.length_, pos) + s.length_; +- const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_); +- return result != last ? result - ptr_ : npos; +-} +- +-// Search range is [0..pos] inclusive. If pos == npos, search everything. +-string_view::size_type string_view::rfind(char c, size_type pos) const +- noexcept { +- // Note: memrchr() is not available on Windows. 
+- if (empty()) return npos; +- for (size_type i = std::min(pos, length_ - 1);; --i) { +- if (ptr_[i] == c) { +- return i; +- } +- if (i == 0) break; +- } +- return npos; +-} +- +-string_view::size_type string_view::find_first_of(string_view s, +- size_type pos) const +- noexcept { +- if (empty() || s.empty()) { +- return npos; +- } +- // Avoid the cost of LookupTable() for a single-character search. +- if (s.length_ == 1) return find_first_of(s.ptr_[0], pos); +- LookupTable tbl(s); +- for (size_type i = pos; i < length_; ++i) { +- if (tbl[ptr_[i]]) { +- return i; +- } +- } +- return npos; +-} +- +-string_view::size_type string_view::find_first_not_of(string_view s, +- size_type pos) const +- noexcept { +- if (empty()) return npos; +- // Avoid the cost of LookupTable() for a single-character search. +- if (s.length_ == 1) return find_first_not_of(s.ptr_[0], pos); +- LookupTable tbl(s); +- for (size_type i = pos; i < length_; ++i) { +- if (!tbl[ptr_[i]]) { +- return i; +- } +- } +- return npos; +-} +- +-string_view::size_type string_view::find_first_not_of(char c, +- size_type pos) const +- noexcept { +- if (empty()) return npos; +- for (; pos < length_; ++pos) { +- if (ptr_[pos] != c) { +- return pos; +- } +- } +- return npos; +-} +- +-string_view::size_type string_view::find_last_of(string_view s, +- size_type pos) const noexcept { +- if (empty() || s.empty()) return npos; +- // Avoid the cost of LookupTable() for a single-character search. +- if (s.length_ == 1) return find_last_of(s.ptr_[0], pos); +- LookupTable tbl(s); +- for (size_type i = std::min(pos, length_ - 1);; --i) { +- if (tbl[ptr_[i]]) { +- return i; +- } +- if (i == 0) break; +- } +- return npos; +-} +- +-string_view::size_type string_view::find_last_not_of(string_view s, +- size_type pos) const +- noexcept { +- if (empty()) return npos; +- size_type i = std::min(pos, length_ - 1); +- if (s.empty()) return i; +- // Avoid the cost of LookupTable() for a single-character search. 
+- if (s.length_ == 1) return find_last_not_of(s.ptr_[0], pos); +- LookupTable tbl(s); +- for (;; --i) { +- if (!tbl[ptr_[i]]) { +- return i; +- } +- if (i == 0) break; +- } +- return npos; +-} +- +-string_view::size_type string_view::find_last_not_of(char c, +- size_type pos) const +- noexcept { +- if (empty()) return npos; +- size_type i = std::min(pos, length_ - 1); +- for (;; --i) { +- if (ptr_[i] != c) { +- return i; +- } +- if (i == 0) break; +- } +- return npos; +-} +- +-// MSVC has non-standard behavior that implicitly creates definitions for static +-// const members. These implicit definitions conflict with explicit out-of-class +-// member definitions that are required by the C++ standard, resulting in +-// LNK1169 "multiply defined" errors at link time. __declspec(selectany) asks +-// MSVC to choose only one definition for the symbol it decorates. See details +-// at http://msdn.microsoft.com/en-us/library/34h23df8(v=vs.100).aspx +-#ifdef _MSC_VER +-#define ABSL_STRING_VIEW_SELECTANY __declspec(selectany) +-#else +-#define ABSL_STRING_VIEW_SELECTANY +-#endif +- +-ABSL_STRING_VIEW_SELECTANY +-constexpr string_view::size_type string_view::npos; +-ABSL_STRING_VIEW_SELECTANY +-constexpr string_view::size_type string_view::kMaxSize; +- +-} // namespace absl +- +-#endif // ABSL_HAVE_STD_STRING_VIEW +diff --git a/third_party/absl/strings/string_view.h b/third_party/absl/strings/string_view.h +index 68d46e3..9bb8b1c 100644 +--- a/third_party/absl/strings/string_view.h ++++ b/third_party/absl/strings/string_view.h +@@ -28,518 +28,10 @@ + #define ABSL_STRINGS_STRING_VIEW_H_ + + #include +-// #include "absl/base/config.h" +- +-#ifdef ABSL_HAVE_STD_STRING_VIEW +- + #include + + namespace absl { + using std::string_view; +-} // namespace absl +- +-#else // ABSL_HAVE_STD_STRING_VIEW +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#ifdef __has_builtin +-#define ABSL_HAVE_BUILTIN(x) __has_builtin(x) +-#else +-#define 
ABSL_HAVE_BUILTIN(x) 0 +-#endif +- +-// #include "absl/base/internal/throw_delegate.h" +-// #include "absl/base/macros.h" +-// #include "absl/base/port.h" +- +-namespace absl { +- +-// absl::string_view +-// +-// A `string_view` provides a lightweight view into the std::string data +-// provided by a `std::string`, double-quoted std::string literal, character +-// array, or even another `string_view`. A `string_view` does *not* own the +-// std::string to which it points, and that data cannot be modified through the +-// view. +-// +-// You can use `string_view` as a function or method parameter anywhere a +-// parameter can receive a double-quoted std::string literal, `const char*`, +-// `std::string`, or another `absl::string_view` argument with no need to copy +-// the std::string data. Systematic use of `string_view` within function +-// arguments reduces data copies and `strlen()` calls. +-// +-// Because of its small size, prefer passing `string_view` by value: +-// +-// void MyFunction(absl::string_view arg); +-// +-// If circumstances require, you may also pass one by const reference: +-// +-// void MyFunction(const absl::string_view& arg); // not preferred +-// +-// Passing by value generates slightly smaller code for many architectures. +-// +-// In either case, the source data of the `string_view` must outlive the +-// `string_view` itself. +-// +-// A `string_view` is also suitable for local variables if you know that the +-// lifetime of the underlying object is longer than the lifetime of your +-// `string_view` variable. 
However, beware of binding a `string_view` to a +-// temporary value: +-// +-// // BAD use of string_view: lifetime problem +-// absl::string_view sv = obj.ReturnAString(); +-// +-// // GOOD use of string_view: str outlives sv +-// std::string str = obj.ReturnAString(); +-// absl::string_view sv = str; +-// +-// Due to lifetime issues, a `string_view` is sometimes a poor choice for a +-// return value and usually a poor choice for a data member. If you do use a +-// `string_view` this way, it is your responsibility to ensure that the object +-// pointed to by the `string_view` outlives the `string_view`. +-// +-// A `string_view` may represent a whole std::string or just part of a +-// std::string. For example, when splitting a std::string, +-// `std::vector` is a natural data type for the output. +-// +-// +-// When constructed from a source which is nul-terminated, the `string_view` +-// itself will not include the nul-terminator unless a specific size (including +-// the nul) is passed to the constructor. As a result, common idioms that work +-// on nul-terminated strings do not work on `string_view` objects. If you write +-// code that scans a `string_view`, you must check its length rather than test +-// for nul, for example. Note, however, that nuls may still be embedded within +-// a `string_view` explicitly. +-// +-// You may create a null `string_view` in two ways: +-// +-// absl::string_view sv(); +-// absl::string_view sv(nullptr, 0); +-// +-// For the above, `sv.data() == nullptr`, `sv.length() == 0`, and +-// `sv.empty() == true`. Also, if you create a `string_view` with a non-null +-// pointer then `sv.data() != nullptr`. Thus, you can use `string_view()` to +-// signal an undefined value that is different from other `string_view` values +-// in a similar fashion to how `const char* p1 = nullptr;` is different from +-// `const char* p2 = "";`. However, in practice, it is not recommended to rely +-// on this behavior. 
+-// +-// Be careful not to confuse a null `string_view` with an empty one. A null +-// `string_view` is an empty `string_view`, but some empty `string_view`s are +-// not null. Prefer checking for emptiness over checking for null. +-// +-// There are many ways to create an empty string_view: +-// +-// const char* nullcp = nullptr; +-// // string_view.size() will return 0 in all cases. +-// absl::string_view(); +-// absl::string_view(nullcp, 0); +-// absl::string_view(""); +-// absl::string_view("", 0); +-// absl::string_view("abcdef", 0); +-// absl::string_view("abcdef" + 6, 0); +-// +-// All empty `string_view` objects whether null or not, are equal: +-// +-// absl::string_view() == absl::string_view("", 0) +-// absl::string_view(nullptr, 0) == absl:: string_view("abcdef"+6, 0) +-class string_view { +- public: +- using traits_type = std::char_traits; +- using value_type = char; +- using pointer = char*; +- using const_pointer = const char*; +- using reference = char&; +- using const_reference = const char&; +- using const_iterator = const char*; +- using iterator = const_iterator; +- using const_reverse_iterator = std::reverse_iterator; +- using reverse_iterator = const_reverse_iterator; +- using size_type = size_t; +- using difference_type = std::ptrdiff_t; +- +- static constexpr size_type npos = static_cast(-1); +- +- // Null `string_view` constructor +- constexpr string_view() noexcept : ptr_(nullptr), length_(0) {} +- +- // Implicit constructors +- +- template +- string_view( // NOLINT(runtime/explicit) +- const std::basic_string, Allocator>& +- str) noexcept +- : ptr_(str.data()), length_(CheckLengthInternal(str.size())) {} +- +- // Implicit constructor of a `string_view` from nul-terminated `str`. When +- // accepting possibly null strings, use `absl::NullSafeStringView(str)` +- // instead (see below). 
+- constexpr string_view(const char* str) // NOLINT(runtime/explicit) +- : ptr_(str), length_(CheckLengthInternal(StrLenInternal(str))) {} +- +- // Implicit constructor of a `string_view` from a `const char*` and length. +- constexpr string_view(const char* data, size_type len) +- : ptr_(data), length_(CheckLengthInternal(len)) {} +- +- // NOTE: Harmlessly omitted to work around gdb bug. +- // constexpr string_view(const string_view&) noexcept = default; +- // string_view& operator=(const string_view&) noexcept = default; +- +- // Iterators +- +- // string_view::begin() +- // +- // Returns an iterator pointing to the first character at the beginning of the +- // `string_view`, or `end()` if the `string_view` is empty. +- constexpr const_iterator begin() const noexcept { return ptr_; } +- +- // string_view::end() +- // +- // Returns an iterator pointing just beyond the last character at the end of +- // the `string_view`. This iterator acts as a placeholder; attempting to +- // access it results in undefined behavior. +- constexpr const_iterator end() const noexcept { return ptr_ + length_; } +- +- // string_view::cbegin() +- // +- // Returns a const iterator pointing to the first character at the beginning +- // of the `string_view`, or `end()` if the `string_view` is empty. +- constexpr const_iterator cbegin() const noexcept { return begin(); } +- +- // string_view::cend() +- // +- // Returns a const iterator pointing just beyond the last character at the end +- // of the `string_view`. This pointer acts as a placeholder; attempting to +- // access its element results in undefined behavior. +- constexpr const_iterator cend() const noexcept { return end(); } +- +- // string_view::rbegin() +- // +- // Returns a reverse iterator pointing to the last character at the end of the +- // `string_view`, or `rend()` if the `string_view` is empty. 
+- const_reverse_iterator rbegin() const noexcept { +- return const_reverse_iterator(end()); +- } +- +- // string_view::rend() +- // +- // Returns a reverse iterator pointing just before the first character at the +- // beginning of the `string_view`. This pointer acts as a placeholder; +- // attempting to access its element results in undefined behavior. +- const_reverse_iterator rend() const noexcept { +- return const_reverse_iterator(begin()); +- } +- +- // string_view::crbegin() +- // +- // Returns a const reverse iterator pointing to the last character at the end +- // of the `string_view`, or `crend()` if the `string_view` is empty. +- const_reverse_iterator crbegin() const noexcept { return rbegin(); } +- +- // string_view::crend() +- // +- // Returns a const reverse iterator pointing just before the first character +- // at the beginning of the `string_view`. This pointer acts as a placeholder; +- // attempting to access its element results in undefined behavior. +- const_reverse_iterator crend() const noexcept { return rend(); } +- +- // Capacity Utilities +- +- // string_view::size() +- // +- // Returns the number of characters in the `string_view`. +- constexpr size_type size() const noexcept { return length_; } +- +- // string_view::length() +- // +- // Returns the number of characters in the `string_view`. Alias for `size()`. +- constexpr size_type length() const noexcept { return size(); } +- +- // string_view::max_size() +- // +- // Returns the maximum number of characters the `string_view` can hold. +- constexpr size_type max_size() const noexcept { return kMaxSize; } +- +- // string_view::empty() +- // +- // Checks if the `string_view` is empty (refers to no characters). +- constexpr bool empty() const noexcept { return length_ == 0; } +- +- // std::string:view::operator[] +- // +- // Returns the ith element of an `string_view` using the array operator. +- // Note that this operator does not perform any bounds checking. 
+- constexpr const_reference operator[](size_type i) const { return ptr_[i]; } +- +- // string_view::front() +- // +- // Returns the first element of a `string_view`. +- constexpr const_reference front() const { return ptr_[0]; } +- +- // string_view::back() +- // +- // Returns the last element of a `string_view`. +- constexpr const_reference back() const { return ptr_[size() - 1]; } +- +- // string_view::data() +- // +- // Returns a pointer to the underlying character array (which is of course +- // stored elsewhere). Note that `string_view::data()` may contain embedded nul +- // characters, but the returned buffer may or may not be nul-terminated; +- // therefore, do not pass `data()` to a routine that expects a nul-terminated +- // std::string. +- constexpr const_pointer data() const noexcept { return ptr_; } +- +- // Modifiers +- +- // string_view::remove_prefix() +- // +- // Removes the first `n` characters from the `string_view`. Note that the +- // underlying std::string is not changed, only the view. +- void remove_prefix(size_type n) { +- assert(n <= length_); +- ptr_ += n; +- length_ -= n; +- } +- +- // string_view::remove_suffix() +- // +- // Removes the last `n` characters from the `string_view`. Note that the +- // underlying std::string is not changed, only the view. +- void remove_suffix(size_type n) { +- assert(n <= length_); +- length_ -= n; +- } +- +- // string_view::swap() +- // +- // Swaps this `string_view` with another `string_view`. +- void swap(string_view& s) noexcept { +- auto t = *this; +- *this = s; +- s = t; +- } +- +- // Explicit conversion operators +- +- // Converts to `std::basic_string`. +- template +- explicit operator std::basic_string() const { +- if (!data()) return {}; +- return std::basic_string(data(), size()); +- } +- +- // string_view::copy() +- // +- // Copies the contents of the `string_view` at offset `pos` and length `n` +- // into `buf`. 
+- size_type copy(char* buf, size_type n, size_type pos = 0) const; +- +- // string_view::substr() +- // +- // Returns a "substring" of the `string_view` (at offset `pos` and length +- // `n`) as another string_view. This function throws `std::out_of_bounds` if +- // `pos > size'. +- string_view substr(size_type pos, size_type n = npos) const { +- n = std::min(n, length_ - pos); +- return string_view(ptr_ + pos, n); +- } +- +- // string_view::compare() +- // +- // Performs a lexicographical comparison between the `string_view` and +- // another `absl::string_view), returning -1 if `this` is less than, 0 if +- // `this` is equal to, and 1 if `this` is greater than the passed std::string +- // view. Note that in the case of data equality, a further comparison is made +- // on the respective sizes of the two `string_view`s to determine which is +- // smaller, equal, or greater. +- int compare(string_view x) const noexcept { +- auto min_length = std::min(length_, x.length_); +- if (min_length > 0) { +- int r = memcmp(ptr_, x.ptr_, min_length); +- if (r < 0) return -1; +- if (r > 0) return 1; +- } +- if (length_ < x.length_) return -1; +- if (length_ > x.length_) return 1; +- return 0; +- } +- +- // Overload of `string_view::compare()` for comparing a substring of the +- // 'string_view` and another `absl::string_view`. +- int compare(size_type pos1, size_type count1, string_view v) const { +- return substr(pos1, count1).compare(v); +- } +- +- // Overload of `string_view::compare()` for comparing a substring of the +- // `string_view` and a substring of another `absl::string_view`. +- int compare(size_type pos1, size_type count1, string_view v, size_type pos2, +- size_type count2) const { +- return substr(pos1, count1).compare(v.substr(pos2, count2)); +- } +- +- // Overload of `string_view::compare()` for comparing a `string_view` and a +- // a different C-style std::string `s`. 
+- int compare(const char* s) const { return compare(string_view(s)); } +- +- // Overload of `string_view::compare()` for comparing a substring of the +- // `string_view` and a different std::string C-style std::string `s`. +- int compare(size_type pos1, size_type count1, const char* s) const { +- return substr(pos1, count1).compare(string_view(s)); +- } +- +- // Overload of `string_view::compare()` for comparing a substring of the +- // `string_view` and a substring of a different C-style std::string `s`. +- int compare(size_type pos1, size_type count1, const char* s, +- size_type count2) const { +- return substr(pos1, count1).compare(string_view(s, count2)); +- } +- +- // Find Utilities +- +- // string_view::find() +- // +- // Finds the first occurrence of the substring `s` within the `string_view`, +- // returning the position of the first character's match, or `npos` if no +- // match was found. +- size_type find(string_view s, size_type pos = 0) const noexcept; +- +- // Overload of `string_view::find()` for finding the given character `c` +- // within the `string_view`. +- size_type find(char c, size_type pos = 0) const noexcept; +- +- // string_view::rfind() +- // +- // Finds the last occurrence of a substring `s` within the `string_view`, +- // returning the position of the first character's match, or `npos` if no +- // match was found. +- size_type rfind(string_view s, size_type pos = npos) const noexcept; +- +- // Overload of `string_view::rfind()` for finding the last given character `c` +- // within the `string_view`. +- size_type rfind(char c, size_type pos = npos) const noexcept; +- +- // string_view::find_first_of() +- // +- // Finds the first occurrence of any of the characters in `s` within the +- // `string_view`, returning the start position of the match, or `npos` if no +- // match was found. 
+- size_type find_first_of(string_view s, size_type pos = 0) const noexcept; +- +- // Overload of `string_view::find_first_of()` for finding a character `c` +- // within the `string_view`. +- size_type find_first_of(char c, size_type pos = 0) const noexcept { +- return find(c, pos); +- } +- +- // string_view::find_last_of() +- // +- // Finds the last occurrence of any of the characters in `s` within the +- // `string_view`, returning the start position of the match, or `npos` if no +- // match was found. +- size_type find_last_of(string_view s, size_type pos = npos) const noexcept; +- +- // Overload of `string_view::find_last_of()` for finding a character `c` +- // within the `string_view`. +- size_type find_last_of(char c, size_type pos = npos) const noexcept { +- return rfind(c, pos); +- } +- +- // string_view::find_first_not_of() +- // +- // Finds the first occurrence of any of the characters not in `s` within the +- // `string_view`, returning the start position of the first non-match, or +- // `npos` if no non-match was found. +- size_type find_first_not_of(string_view s, size_type pos = 0) const noexcept; +- +- // Overload of `string_view::find_first_not_of()` for finding a character +- // that is not `c` within the `string_view`. +- size_type find_first_not_of(char c, size_type pos = 0) const noexcept; +- +- // string_view::find_last_not_of() +- // +- // Finds the last occurrence of any of the characters not in `s` within the +- // `string_view`, returning the start position of the last non-match, or +- // `npos` if no non-match was found. +- size_type find_last_not_of(string_view s, +- size_type pos = npos) const noexcept; +- +- // Overload of `string_view::find_last_not_of()` for finding a character +- // that is not `c` within the `string_view`. 
+- size_type find_last_not_of(char c, size_type pos = npos) const noexcept; +- +- private: +- static constexpr size_type kMaxSize = +- std::numeric_limits::max(); +- +- // check whether __builtin_strlen is provided by the compiler. +- // GCC doesn't have __has_builtin() +- // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66970), +- // but has __builtin_strlen according to +- // https://gcc.gnu.org/onlinedocs/gcc-4.7.0/gcc/Other-Builtins.html. +-#if ABSL_HAVE_BUILTIN(__builtin_strlen) || \ +- (defined(__GNUC__) && !defined(__clang__)) +- static constexpr size_type StrLenInternal(const char* str) { +- return str ? __builtin_strlen(str) : 0; +- } +-#else +- static constexpr size_type StrLenInternal(const char* str) { +- return str ? strlen(str) : 0; +- } +-#endif +- +- static constexpr size_type CheckLengthInternal(size_type len) { return len; } +- +- const char* ptr_; +- size_type length_; +-}; +- +-// This large function is defined inline so that in a fairly common case where +-// one of the arguments is a literal, the compiler can elide a lot of the +-// following comparisons. +-inline bool operator==(string_view x, string_view y) noexcept { +- auto len = x.size(); +- if (len != y.size()) { +- return false; +- } +- return x.data() == y.data() || len <= 0 || +- memcmp(x.data(), y.data(), len) == 0; +-} +- +-inline bool operator!=(string_view x, string_view y) noexcept { +- return !(x == y); +-} +- +-inline bool operator<(string_view x, string_view y) noexcept { +- auto min_size = std::min(x.size(), y.size()); +- const int r = min_size == 0 ? 
0 : memcmp(x.data(), y.data(), min_size); +- return (r < 0) || (r == 0 && x.size() < y.size()); +-} +- +-inline bool operator>(string_view x, string_view y) noexcept { return y < x; } +- +-inline bool operator<=(string_view x, string_view y) noexcept { +- return !(y < x); +-} +- +-inline bool operator>=(string_view x, string_view y) noexcept { +- return !(x < y); +-} +- +-// IO Insertion Operator +-std::ostream& operator<<(std::ostream& o, string_view piece); +- +-} // namespace absl +- +-#endif // ABSL_HAVE_STD_STRING_VIEW +- +-namespace absl { + + // ClippedSubstr() + // diff --git a/patches/0006-Uses-std-atomic-to-define-global-variable.patch b/patches/0006-Uses-std-atomic-to-define-global-variable.patch new file mode 100644 index 0000000..27ba620 --- /dev/null +++ b/patches/0006-Uses-std-atomic-to-define-global-variable.patch @@ -0,0 +1,73 @@ +From: Taku Kudo +Date: Tue, 14 Jun 2022 02:00:43 +0900 +Subject: Uses std::atomic to define global variable + +Signed-off-by: Kentaro Hayashi +--- + src/common.h | 13 ------------- + src/util.cc | 13 +++++++------ + 2 files changed, 7 insertions(+), 19 deletions(-) + +diff --git a/src/common.h b/src/common.h +index 7595634..6ec4c09 100644 +--- a/src/common.h ++++ b/src/common.h +@@ -50,19 +50,6 @@ typedef uint32_t char32; + typedef uint32_t uint32; + typedef uint64_t uint64; + +-static constexpr uint8 kuint8max = ((uint8)0xFF); +-static constexpr uint16 kuint16max = ((uint16)0xFFFF); +-static constexpr uint32 kuint32max = ((uint32)0xFFFFFFFF); +-static constexpr uint64 kuint64max = ((uint64)(0xFFFFFFFFFFFFFFFF)); +-static constexpr int8 kint8min = ((int8)~0x7F); +-static constexpr int8 kint8max = ((int8)0x7F); +-static constexpr int16 kint16min = ((int16)~0x7FFF); +-static constexpr int16 kint16max = ((int16)0x7FFF); +-static constexpr int32 kint32min = ((int32)~0x7FFFFFFF); +-static constexpr int32 kint32max = ((int32)0x7FFFFFFF); +-static constexpr int64 kint64min = ((int64)(~0x7FFFFFFFFFFFFFFF)); +-static constexpr 
int64 kint64max = ((int64)(0x7FFFFFFFFFFFFFFF)); +- + static constexpr uint32 kUnicodeError = 0xFFFD; + + #if defined(OS_WIN) && defined(UNICODE) && defined(_UNICODE) +diff --git a/src/util.cc b/src/util.cc +index 8da16c4..f99c73a 100644 +--- a/src/util.cc ++++ b/src/util.cc +@@ -14,27 +14,28 @@ + + #include "util.h" + ++#include + #include + + namespace sentencepiece { + + namespace { + constexpr unsigned int kDefaultSeed = static_cast(-1); +-static unsigned int g_seed = kDefaultSeed; +-static int g_minloglevel = 0; ++static std::atomic g_seed = kDefaultSeed; ++static std::atomic g_minloglevel = 0; + } // namespace + + void SetRandomGeneratorSeed(unsigned int seed) { +- if (seed != kDefaultSeed) g_seed = seed; ++ if (seed != kDefaultSeed) g_seed.store(seed); + } + + uint32 GetRandomGeneratorSeed() { +- return g_seed == kDefaultSeed ? std::random_device{}() : g_seed; ++ return g_seed == kDefaultSeed ? std::random_device{}() : g_seed.load(); + } + + namespace logging { +-int GetMinLogLevel() { return g_minloglevel; } +-void SetMinLogLevel(int v) { g_minloglevel = v; } ++int GetMinLogLevel() { return g_minloglevel.load(); } ++void SetMinLogLevel(int v) { g_minloglevel.store(v); } + } // namespace logging + + namespace string_util { diff --git a/patches/0007-Fix-a-typo.patch b/patches/0007-Fix-a-typo.patch new file mode 100644 index 0000000..3c7cfc6 --- /dev/null +++ b/patches/0007-Fix-a-typo.patch @@ -0,0 +1,30 @@ +From: Kentaro Hayashi +Date: Tue, 14 Jun 2022 20:40:59 +0900 +Subject: Fix a typo + +gu rantees -> +guarantees + +Signed-off-by: Kentaro Hayashi +--- + src/trainer_interface.cc | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc +index 5e26b75..7270f29 100644 +--- a/src/trainer_interface.cc ++++ b/src/trainer_interface.cc +@@ -460,11 +460,11 @@ END: + } + if (trainer_spec_.differential_privacy_noise_level() <= 0) { + LOG(WARNING) << "Private version with <=0 noise level will give " 
+- "infinity epsilon gurantees."; ++ "infinity epsilon guarantees."; + } + if (trainer_spec_.differential_privacy_clipping_threshold() <= 0) { + LOG(WARNING) << "Private version with <=0 clipping threshold will give " +- "infinity epsilon gurantees."; ++ "infinity epsilon guarantees."; + } + + // Add noise to all the sentences via threadpool. diff --git a/patches/0008-Uses-absl-string_view-as-much-as-possible.patch b/patches/0008-Uses-absl-string_view-as-much-as-possible.patch new file mode 100644 index 0000000..e1d652f --- /dev/null +++ b/patches/0008-Uses-absl-string_view-as-much-as-possible.patch @@ -0,0 +1,1595 @@ +From: Taku Kudo +Date: Wed, 15 Jun 2022 01:29:55 +0900 +Subject: Uses absl::string_view as much as possible + +Signed-off-by: Kentaro Hayashi +--- + python/src/sentencepiece/__init__.py | 4 +- + python/src/sentencepiece/sentencepiece.i | 92 ++----- + python/src/sentencepiece/sentencepiece_wrap.cxx | 329 ++++++++++++++++-------- + src/builder.cc | 2 +- + src/builder.h | 2 +- + src/common.h | 3 +- + src/error.cc | 9 +- + src/sentencepiece_processor.cc | 36 ++- + src/sentencepiece_processor.h | 28 +- + src/sentencepiece_trainer.h | 8 +- + src/spec_parser.h | 16 +- + src/spm_encode_main.cc | 22 +- + src/util.cc | 27 +- + 13 files changed, 333 insertions(+), 245 deletions(-) + +diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py +index cba3b70..1543d32 100644 +--- a/python/src/sentencepiece/__init__.py ++++ b/python/src/sentencepiece/__init__.py +@@ -93,8 +93,8 @@ class SentencePieceProcessor(object): + def SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, include_best): + return _sentencepiece.SentencePieceProcessor_SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, include_best) + +- def CalculateEntropy(self, text, theta): +- return _sentencepiece.SentencePieceProcessor_CalculateEntropy(self, text, theta) ++ def CalculateEntropy(self, *args): ++ return 
_sentencepiece.SentencePieceProcessor_CalculateEntropy(self, *args) + + def GetPieceSize(self): + return _sentencepiece.SentencePieceProcessor_GetPieceSize(self) +diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i +index 3a822bc..40373ce 100644 +--- a/python/src/sentencepiece/sentencepiece.i ++++ b/python/src/sentencepiece/sentencepiece.i +@@ -37,6 +37,7 @@ class PyInputString { + str_ = nullptr; + } + } ++ absl::string_view str() const { return absl::string_view(data(), size()); } + const char* data() const { return str_; } + Py_ssize_t size() const { return size_; } + bool IsAvalable() const { return str_ != nullptr; } +@@ -179,7 +180,7 @@ inline void CheckIds(const std::vector &ids, int num_pieces) { + } + } + +-inline void CheckIds(const std::vector &ids, int num_pieces) {} ++inline void CheckIds(const std::vector &ids, int num_pieces) {} + + class ThreadPool { + public: +@@ -266,6 +267,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + %ignore sentencepiece::util::Status; + %ignore sentencepiece::util::StatusCode; + %ignore absl::string_view; ++%ignore std::string_view; + %ignore sentencepiece::SentencePieceText; + %ignore sentencepiece::NormalizerSpec; + %ignore sentencepiece::TrainerSpec; +@@ -386,7 +388,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + return $self->DecodeIds(ids); + } + +- std::string _DecodePieces(const std::vector &pieces) const { ++ std::string _DecodePieces(const std::vector &pieces) const { + return $self->DecodePieces(pieces); + } + +@@ -397,7 +399,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + } + + sentencepiece::util::bytes _DecodePiecesAsSerializedProto( +- const std::vector &pieces) const { ++ const std::vector &pieces) const { + CheckIds(pieces, $self->GetPieceSize()); + return $self->DecodePiecesAsSerializedProto(pieces); + } +@@ -416,12 +418,12 @@ inline void InitNumThreads(const std::vector 
&ins, int *num_threads) { + } + + std::vector _DecodePiecesBatch( +- const std::vector> &ins, int num_threads) const { ++ const std::vector> &ins, int num_threads) const { + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string); + } + + BytesArray _DecodePiecesAsSerializedProtoBatch( +- const std::vector> &ins, int num_threads) const { ++ const std::vector> &ins, int num_threads) const { + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string, + sentencepiece::util::bytes); + } +@@ -1029,14 +1031,14 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + %typemap(out) std::vector { + $result = PyList_New($1.size()); + for (size_t i = 0; i < $1.size(); ++i) { +- PyList_SetItem($result, i, PyInt_FromLong(static_cast($1[i]))); ++ PyList_SET_ITEM($result, i, PyInt_FromLong(static_cast($1[i]))); + } + } + + %typemap(out) std::vector { + $result = PyList_New($1.size()); + for (size_t i = 0; i < $1.size(); ++i) { +- PyList_SetItem($result, i, PyFloat_FromDouble(static_cast($1[i]))); ++ PyList_SET_ITEM($result, i, PyFloat_FromDouble(static_cast($1[i]))); + } + } + +@@ -1045,9 +1047,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + for (size_t i = 0; i < $1.size(); ++i) { + PyObject *obj = PyList_New($1[i].size()); + for (size_t j = 0; j < $1[i].size(); ++j) { +- PyList_SetItem(obj, j, PyInt_FromLong(static_cast($1[i][j]))); ++ PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast($1[i][j]))); + } +- PyList_SetItem($result, i, obj); ++ PyList_SET_ITEM($result, i, obj); + } + } + +@@ -1055,14 +1057,14 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + PyObject *input_type = resultobj; + $result = PyList_New($1.size()); + for (size_t i = 0; i < $1.size(); ++i) { +- PyList_SetItem($result, i, MakePyOutputString($1[i], input_type)); ++ PyList_SET_ITEM($result, i, MakePyOutputString($1[i], input_type)); + } + } + + %typemap(out) BytesArray { + $result = PyList_New($1.size()); 
+ for (size_t i = 0; i < $1.size(); ++i) { +- PyList_SetItem($result, i, MakePyOutputBytes($1[i])); ++ PyList_SET_ITEM($result, i, MakePyOutputBytes($1[i])); + } + } + +@@ -1072,9 +1074,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + for (size_t i = 0; i < $1.size(); ++i) { + PyObject *obj = PyList_New($1[i].size()); + for (size_t j = 0; j < $1[i].size(); ++j) { +- PyList_SetItem(obj, j, MakePyOutputString($1[i][j], input_type)); ++ PyList_SET_ITEM(obj, j, MakePyOutputString($1[i][j], input_type)); + } +- PyList_SetItem($result, i, obj); ++ PyList_SET_ITEM($result, i, obj); + } + } + +@@ -1118,51 +1120,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + SWIG_fail; + } + resultobj = ustring.input_type(); +- $1 = absl::string_view(ustring.data(), ustring.size()); +-} +- +-%typemap(in) const std::vector& { +- std::vector *out = nullptr; +- if (PyList_Check($input)) { +- const size_t size = PyList_Size($input); +- out = new std::vector(size); +- for (size_t i = 0; i < size; ++i) { +- const PyInputString ustring(PyList_GetItem($input, i)); +- if (ustring.IsAvalable()) { +- (*out)[i].assign(ustring.data(), ustring.size()); +- } else { +- PyErr_SetString(PyExc_TypeError, "list must contain strings"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- } +- } else { +- PyErr_SetString(PyExc_TypeError, "not a list"); +- SWIG_fail; +- } +- $1 = out; +-} +- +-%typemap(in) const std::vector& { +- std::vector *out = nullptr; +- if (PyList_Check($input)) { +- const size_t size = PyList_Size($input); +- out = new std::vector(size); +- for (size_t i = 0; i < size; ++i) { +- const PyInputString ustring(PyList_GetItem($input, i)); +- if (ustring.IsAvalable()) { +- (*out)[i] = absl::string_view(ustring.data(), ustring.size()); +- } else { +- PyErr_SetString(PyExc_TypeError, "list must contain strings"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- } +- } else { +- PyErr_SetString(PyExc_TypeError, "not a 
list"); +- SWIG_fail; +- } +- $1 = out; ++ $1 = ustring.str(); + } + + %typemap(in) const std::vector& { +@@ -1173,7 +1131,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem($input, i)); + if (ustring.IsAvalable()) { +- (*out)[i] = absl::string_view(ustring.data(), ustring.size()); ++ (*out)[i] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; +@@ -1208,11 +1166,11 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + $1 = out; + } + +-%typemap(in) const std::vector>& { +- std::vector> *out = nullptr; ++%typemap(in) const std::vector>& { ++ std::vector> *out = nullptr; + if (PyList_Check($input)) { + const size_t size = PyList_Size($input); +- out = new std::vector>(size); ++ out = new std::vector>(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem($input, i); + if (PyList_Check(o)) { +@@ -1221,7 +1179,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + for (size_t j = 0; j < size2; ++j) { + const PyInputString ustring(PyList_GetItem(o, j)); + if (ustring.IsAvalable()) { +- (*out)[i][j].assign(ustring.data(), ustring.size()); ++ (*out)[i][j] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError,"list must contain integers"); + SWIG_fail; +@@ -1302,9 +1260,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + for (size_t i = 0; i < $1.size(); ++i) { + PyObject *obj = PyList_New($1[i].first.size()); + for (size_t j = 0; j < $1[i].first.size(); ++j) { +- PyList_SetItem(obj, j, MakePyOutputString($1[i].first[j], input_type)); ++ PyList_SET_ITEM(obj, j, MakePyOutputString($1[i].first[j], input_type)); + } +- PyList_SetItem($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast($1[i].second)))); ++ PyList_SET_ITEM($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast($1[i].second)))); + } 
+ } + +@@ -1313,9 +1271,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + for (size_t i = 0; i < $1.size(); ++i) { + PyObject *obj = PyList_New($1[i].first.size()); + for (size_t j = 0; j < $1[i].first.size(); ++j) { +- PyList_SetItem(obj, j, PyInt_FromLong(static_cast($1[i].first[j]))); ++ PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast($1[i].first[j]))); + } +- PyList_SetItem($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast($1[i].second)))); ++ PyList_SET_ITEM($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast($1[i].second)))); + } + } + +diff --git a/python/src/sentencepiece/sentencepiece_wrap.cxx b/python/src/sentencepiece/sentencepiece_wrap.cxx +index 6df3880..36ce38c 100644 +--- a/python/src/sentencepiece/sentencepiece_wrap.cxx ++++ b/python/src/sentencepiece/sentencepiece_wrap.cxx +@@ -2693,16 +2693,16 @@ SWIGINTERN PyObject *SWIG_PyStaticMethod_New(PyObject *SWIGUNUSEDPARM(self), PyO + /* -------- TYPES TABLE (BEGIN) -------- */ + + #define SWIGTYPE_p_char swig_types[0] +-#define SWIGTYPE_p_sentencepiece__SentenceIterator swig_types[1] +-#define SWIGTYPE_p_sentencepiece__SentencePieceProcessor swig_types[2] +-#define SWIGTYPE_p_sentencepiece__SentencePieceTrainer swig_types[3] +-#define SWIGTYPE_p_std__string swig_types[4] +-#define SWIGTYPE_p_std__unordered_mapT_std__string_std__string_t swig_types[5] +-#define SWIGTYPE_p_std__vectorT_absl__string_view_t swig_types[6] +-#define SWIGTYPE_p_std__vectorT_int_t swig_types[7] +-#define SWIGTYPE_p_std__vectorT_std__string_t swig_types[8] +-#define SWIGTYPE_p_std__vectorT_std__vectorT_int_t_t swig_types[9] +-#define SWIGTYPE_p_std__vectorT_std__vectorT_std__string_t_t swig_types[10] ++#define SWIGTYPE_p_float swig_types[1] ++#define SWIGTYPE_p_sentencepiece__SentenceIterator swig_types[2] ++#define SWIGTYPE_p_sentencepiece__SentencePieceProcessor swig_types[3] ++#define SWIGTYPE_p_sentencepiece__SentencePieceTrainer swig_types[4] ++#define 
SWIGTYPE_p_std__string swig_types[5] ++#define SWIGTYPE_p_std__unordered_mapT_std__string_std__string_t swig_types[6] ++#define SWIGTYPE_p_std__vectorT_absl__string_view_t swig_types[7] ++#define SWIGTYPE_p_std__vectorT_int_t swig_types[8] ++#define SWIGTYPE_p_std__vectorT_std__vectorT_absl__string_view_t_t swig_types[9] ++#define SWIGTYPE_p_std__vectorT_std__vectorT_int_t_t swig_types[10] + static swig_type_info *swig_types[12]; + static swig_module_info swig_module = {swig_types, 11, 0, 0, 0, 0}; + #define SWIG_TypeQuery(name) SWIG_TypeQueryModule(&swig_module, &swig_module, name) +@@ -2843,6 +2843,7 @@ class PyInputString { + str_ = nullptr; + } + } ++ absl::string_view str() const { return absl::string_view(data(), size()); } + const char* data() const { return str_; } + Py_ssize_t size() const { return size_; } + bool IsAvalable() const { return str_ != nullptr; } +@@ -2985,7 +2986,7 @@ inline void CheckIds(const std::vector &ids, int num_pieces) { + } + } + +-inline void CheckIds(const std::vector &ids, int num_pieces) {} ++inline void CheckIds(const std::vector &ids, int num_pieces) {} + + class ThreadPool { + public: +@@ -3473,14 +3474,14 @@ SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodeIds(sentencep + CheckIds(ids, self->GetPieceSize()); + return self->DecodeIds(ids); + } +-SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodePieces(sentencepiece::SentencePieceProcessor const *self,std::vector< std::string > const &pieces){ ++SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodePieces(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &pieces){ + return self->DecodePieces(pieces); + } + SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__DecodeIdsAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ + CheckIds(ids, self->GetPieceSize()); + return self->DecodeIdsAsSerializedProto(ids); + } +-SWIGINTERN 
sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,std::vector< std::string > const &pieces){ ++SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &pieces){ + CheckIds(pieces, self->GetPieceSize()); + return self->DecodePiecesAsSerializedProto(pieces); + } +@@ -3491,10 +3492,10 @@ SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodeIdsAsSerialize + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsSerializedProto, int, + sentencepiece::util::bytes); + } +-SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodePiecesBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< std::string > > const &ins,int num_threads){ ++SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodePiecesBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< absl::string_view > > const &ins,int num_threads){ + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string); + } +-SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< std::string > > const &ins,int num_threads){ ++SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< absl::string_view > > const &ins,int num_threads){ + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string, + sentencepiece::util::bytes); + } +@@ -3718,7 +3719,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadFromSerializedProto(PyObje + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = 
ustring.str(); + } + { + try { +@@ -3763,7 +3764,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetEncodeExtraOptions(PyObject + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + { + try { +@@ -3808,7 +3809,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetDecodeExtraOptions(PyObject + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + { + try { +@@ -3834,7 +3835,7 @@ fail: + SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetVocabulary(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- std::vector< std::string > *arg2 = 0 ; ++ std::vector< absl::string_view > *arg2 = 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[2] ; +@@ -3847,14 +3848,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetVocabulary(PyObject *SWIGUN + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +- std::vector *out = nullptr; ++ std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); +- out = new std::vector(size); ++ out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { +- (*out)[i].assign(ustring.data(), ustring.size()); ++ (*out)[i] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; +@@ -3869,7 +3870,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetVocabulary(PyObject *SWIGUN + } + { + try { +- result = (arg1)->SetVocabulary((std::vector< std::string > const &)*arg2); ++ result = (arg1)->SetVocabulary((std::vector< absl::string_view > const &)*arg2); + ReleaseResultObject(resultobj); + } + catch 
(const sentencepiece::util::Status &status) { +@@ -3955,7 +3956,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadVocabulary(PyObject *SWIGU + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -3983,6 +3984,66 @@ fail: + } + + ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy__SWIG_0(PyObject *SWIGUNUSEDPARM(self), Py_ssize_t nobjs, PyObject **swig_obj) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ absl::string_view arg2 ; ++ float arg3 ; ++ float *arg4 = (float *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ float val3 ; ++ int ecode3 = 0 ; ++ void *argp4 = 0 ; ++ int res4 = 0 ; ++ sentencepiece::util::Status result; ++ ++ if ((nobjs < 4) || (nobjs > 4)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ const PyInputString ustring(swig_obj[1]); ++ if (!ustring.IsAvalable()) { ++ PyErr_SetString(PyExc_TypeError, "not a string"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ arg2 = ustring.str(); ++ } ++ ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "3"" of type '" "float""'"); ++ } ++ arg3 = static_cast< float >(val3); ++ res4 = SWIG_ConvertPtr(swig_obj[3], &argp4,SWIGTYPE_p_float, 0 | 0 ); ++ if (!SWIG_IsOK(res4)) { ++ SWIG_exception_fail(SWIG_ArgError(res4), "in method 
'" "SentencePieceProcessor_CalculateEntropy" "', argument " "4"" of type '" "float *""'"); ++ } ++ arg4 = reinterpret_cast< float * >(argp4); ++ { ++ try { ++ result = ((sentencepiece::SentencePieceProcessor const *)arg1)->CalculateEntropy(arg2,arg3,arg4); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ { ++ if (!(&result)->ok()) { ++ SWIG_exception(ToSwigError((&result)->code()), (&result)->ToString().c_str()); ++ } ++ resultobj = SWIG_From_bool((&result)->ok()); ++ } ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ + SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +@@ -4017,7 +4078,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(P + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -4054,9 +4115,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(P + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].first.size()); + for (size_t j = 0; j < result[i].first.size(); ++j) { +- PyList_SetItem(obj, j, MakePyOutputString(result[i].first[j], input_type)); ++ PyList_SET_ITEM(obj, j, MakePyOutputString(result[i].first[j], input_type)); + } +- PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); ++ PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); + } + } + return resultobj; +@@ -4099,7 +4160,7 @@ SWIGINTERN PyObject 
*_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyOb + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -4135,9 +4196,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyOb + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].first.size()); + for (size_t j = 0; j < result[i].first.size(); ++j) { +- PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); ++ PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); + } +- PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); ++ PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); + } + } + return resultobj; +@@ -4146,7 +4207,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy__SWIG_1(PyObject *SWIGUNUSEDPARM(self), Py_ssize_t nobjs, PyObject **swig_obj) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; +@@ -4155,10 +4216,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *SWI + int res1 = 0 ; + float val3 ; + int ecode3 = 0 ; +- PyObject *swig_obj[3] ; + float result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_CalculateEntropy", 3, 3, swig_obj)) SWIG_fail; ++ if ((nobjs < 3) || (nobjs > 3)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " 
"1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); +@@ -4171,7 +4231,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *SWI + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -4194,6 +4254,67 @@ fail: + } + + ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *self, PyObject *args) { ++ Py_ssize_t argc; ++ PyObject *argv[5] = { ++ 0 ++ }; ++ ++ if (!(argc = SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_CalculateEntropy", 0, 4, argv))) SWIG_fail; ++ --argc; ++ if (argc == 3) { ++ int _v; ++ void *vptr = 0; ++ int res = SWIG_ConvertPtr(argv[0], &vptr, SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0); ++ _v = SWIG_CheckState(res); ++ if (_v) { ++ int res = SWIG_AsCharPtrAndSize(argv[1], 0, NULL, 0); ++ _v = SWIG_CheckState(res); ++ if (_v) { ++ { ++ int res = SWIG_AsVal_float(argv[2], NULL); ++ _v = SWIG_CheckState(res); ++ } ++ if (_v) { ++ return _wrap_SentencePieceProcessor_CalculateEntropy__SWIG_1(self, argc, argv); ++ } ++ } ++ } ++ } ++ if (argc == 4) { ++ int _v; ++ void *vptr = 0; ++ int res = SWIG_ConvertPtr(argv[0], &vptr, SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0); ++ _v = SWIG_CheckState(res); ++ if (_v) { ++ int res = SWIG_AsCharPtrAndSize(argv[1], 0, NULL, 0); ++ _v = SWIG_CheckState(res); ++ if (_v) { ++ { ++ int res = SWIG_AsVal_float(argv[2], NULL); ++ _v = SWIG_CheckState(res); ++ } ++ if (_v) { ++ void *vptr = 0; ++ int res = SWIG_ConvertPtr(argv[3], &vptr, SWIGTYPE_p_float, 0); ++ _v = SWIG_CheckState(res); ++ if (_v) { ++ return _wrap_SentencePieceProcessor_CalculateEntropy__SWIG_0(self, argc, argv); ++ } ++ } ++ } ++ } ++ } ++ ++fail: ++ SWIG_Python_RaiseOrModifyTypeError("Wrong number or type of arguments for overloaded function 'SentencePieceProcessor_CalculateEntropy'.\n" ++ " 
Possible C/C++ prototypes are:\n" ++ " sentencepiece::SentencePieceProcessor::CalculateEntropy(absl::string_view,float,float *) const\n" ++ " sentencepiece::SentencePieceProcessor::CalculateEntropy(absl::string_view,float) const\n"); ++ return 0; ++} ++ ++ + SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetPieceSize(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +@@ -4247,7 +4368,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_PieceToId(PyObject *SWIGUNUSED + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + { + try { +@@ -4675,7 +4796,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadFromFile(PyObject *SWIGUNU + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + { + try { +@@ -4741,7 +4862,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIds(PyObject *SWIGUNU + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -4790,7 +4911,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIds(PyObject *SWIGUNU + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, PyInt_FromLong(static_cast(result[i]))); ++ PyList_SET_ITEM(resultobj, i, PyInt_FromLong(static_cast(result[i]))); + } + } + return resultobj; +@@ -4842,7 +4963,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIG + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_bool(swig_obj[2], 
&val3); + if (!SWIG_IsOK(ecode3)) { +@@ -4892,7 +5013,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIG + PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); ++ PyList_SET_ITEM(resultobj, i, MakePyOutputString(result[i], input_type)); + } + } + return resultobj; +@@ -4944,7 +5065,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProto(PyObj + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -5046,7 +5167,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SW + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { +- (*out)[i] = absl::string_view(ustring.data(), ustring.size()); ++ (*out)[i] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; +@@ -5113,9 +5234,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SW + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { +- PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); ++ PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i][j]))); + } +- PyList_SetItem(resultobj, i, obj); ++ PyList_SET_ITEM(resultobj, i, obj); + } + } + { +@@ -5177,7 +5298,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { +- (*out)[i] = absl::string_view(ustring.data(), ustring.size()); ++ (*out)[i] = 
ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; +@@ -5245,9 +5366,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { +- PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); ++ PyList_SET_ITEM(obj, j, MakePyOutputString(result[i][j], input_type)); + } +- PyList_SetItem(resultobj, i, obj); ++ PyList_SET_ITEM(resultobj, i, obj); + } + } + { +@@ -5309,7 +5430,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch( + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { +- (*out)[i] = absl::string_view(ustring.data(), ustring.size()); ++ (*out)[i] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; +@@ -5374,7 +5495,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch( + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); ++ PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); + } + } + { +@@ -5452,7 +5573,7 @@ fail: + SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- std::vector< std::string > *arg2 = 0 ; ++ std::vector< absl::string_view > *arg2 = 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[2] ; +@@ -5465,14 +5586,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePieces(PyObject *SWIGUN + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +- std::vector *out = nullptr; 
++ std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); +- out = new std::vector(size); ++ out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { +- (*out)[i].assign(ustring.data(), ustring.size()); ++ (*out)[i] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; +@@ -5487,7 +5608,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePieces(PyObject *SWIGUN + } + { + try { +- result = sentencepiece_SentencePieceProcessor__DecodePieces((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::string > const &)*arg2); ++ result = sentencepiece_SentencePieceProcessor__DecodePieces((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5572,7 +5693,7 @@ fail: + SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- std::vector< std::string > *arg2 = 0 ; ++ std::vector< absl::string_view > *arg2 = 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + PyObject *swig_obj[2] ; +@@ -5585,14 +5706,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +- std::vector *out = nullptr; ++ std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); +- out = new std::vector(size); ++ out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { +- 
(*out)[i].assign(ustring.data(), ustring.size()); ++ (*out)[i] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; +@@ -5607,7 +5728,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto + } + { + try { +- result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::string > const &)*arg2); ++ result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5695,7 +5816,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsBatch(PyObject *SWIG + PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); ++ PyList_SET_ITEM(resultobj, i, MakePyOutputString(result[i], input_type)); + } + } + { +@@ -5775,7 +5896,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsAsSerializedProtoBat + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); ++ PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); + } + } + { +@@ -5793,7 +5914,7 @@ fail: + SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- std::vector< std::vector< std::string > > *arg2 = 0 ; ++ std::vector< std::vector< absl::string_view > > *arg2 = 0 ; + int arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; +@@ -5809,10 +5930,10 @@ SWIGINTERN PyObject 
*_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *S + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +- std::vector> *out = nullptr; ++ std::vector> *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); +- out = new std::vector>(size); ++ out = new std::vector>(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem(swig_obj[1], i); + if (PyList_Check(o)) { +@@ -5821,7 +5942,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *S + for (size_t j = 0; j < size2; ++j) { + const PyInputString ustring(PyList_GetItem(o, j)); + if (ustring.IsAvalable()) { +- (*out)[i][j].assign(ustring.data(), ustring.size()); ++ (*out)[i][j] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError,"list must contain integers"); + SWIG_fail; +@@ -5846,7 +5967,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *S + arg3 = static_cast< int >(val3); + { + try { +- result = sentencepiece_SentencePieceProcessor__DecodePiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< std::string > > const &)*arg2,arg3); ++ result = sentencepiece_SentencePieceProcessor__DecodePiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< absl::string_view > > const &)*arg2,arg3); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5857,17 +5978,11 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *S + PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); ++ PyList_SET_ITEM(resultobj, i, MakePyOutputString(result[i], input_type)); + } + } +- { +- delete arg2; +- } + return resultobj; + fail: +- { +- delete arg2; +- } + return NULL; + } + +@@ 
-5875,7 +5990,7 @@ fail: + SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- std::vector< std::vector< std::string > > *arg2 = 0 ; ++ std::vector< std::vector< absl::string_view > > *arg2 = 0 ; + int arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; +@@ -5891,10 +6006,10 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +- std::vector> *out = nullptr; ++ std::vector> *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); +- out = new std::vector>(size); ++ out = new std::vector>(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem(swig_obj[1], i); + if (PyList_Check(o)) { +@@ -5903,7 +6018,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto + for (size_t j = 0; j < size2; ++j) { + const PyInputString ustring(PyList_GetItem(o, j)); + if (ustring.IsAvalable()) { +- (*out)[i][j].assign(ustring.data(), ustring.size()); ++ (*out)[i][j] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError,"list must contain integers"); + SWIG_fail; +@@ -5928,7 +6043,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto + arg3 = static_cast< int >(val3); + { + try { +- result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< std::string > > const &)*arg2,arg3); ++ result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< absl::string_view > > const &)*arg2,arg3); + ReleaseResultObject(resultobj); + } + catch (const 
sentencepiece::util::Status &status) { +@@ -5938,17 +6053,11 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); ++ PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); + } + } +- { +- delete arg2; +- } + return resultobj; + fail: +- { +- delete arg2; +- } + return NULL; + } + +@@ -5990,7 +6099,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -6031,9 +6140,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { +- PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); ++ PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i][j]))); + } +- PyList_SetItem(resultobj, i, obj); ++ PyList_SET_ITEM(resultobj, i, obj); + } + } + return resultobj; +@@ -6079,7 +6188,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -6121,9 +6230,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { +- PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); ++ PyList_SET_ITEM(obj, j, 
MakePyOutputString(result[i][j], input_type)); + } +- PyList_SetItem(resultobj, i, obj); ++ PyList_SET_ITEM(resultobj, i, obj); + } + } + return resultobj; +@@ -6169,7 +6278,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsSerializedProto( + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -6260,7 +6369,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyO + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -6316,9 +6425,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyO + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].first.size()); + for (size_t j = 0; j < result[i].first.size(); ++j) { +- PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); ++ PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); + } +- PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); ++ PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); + } + } + return resultobj; +@@ -6373,7 +6482,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces( + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -6430,9 +6539,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces( + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].first.size()); + for 
(size_t j = 0; j < result[i].first.size(); ++j) { +- PyList_SetItem(obj, j, MakePyOutputString(result[i].first[j], input_type)); ++ PyList_SET_ITEM(obj, j, MakePyOutputString(result[i].first[j], input_type)); + } +- PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); ++ PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); + } + } + return resultobj; +@@ -6466,7 +6575,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropy(PyObject *SW + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg2 = absl::string_view(ustring.data(), ustring.size()); ++ arg2 = ustring.str(); + } + ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +@@ -6518,7 +6627,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropyBatch(PyObjec + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { +- (*out)[i] = absl::string_view(ustring.data(), ustring.size()); ++ (*out)[i] = ustring.str(); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; +@@ -6553,7 +6662,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropyBatch(PyObjec + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SetItem(resultobj, i, PyFloat_FromDouble(static_cast(result[i]))); ++ PyList_SET_ITEM(resultobj, i, PyFloat_FromDouble(static_cast(result[i]))); + } + } + { +@@ -6623,7 +6732,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceTrainer__TrainFromString(PyObject *SWIGU + SWIG_fail; + } + resultobj = ustring.input_type(); +- arg1 = absl::string_view(ustring.data(), ustring.size()); ++ arg1 = ustring.str(); + } + { + try { +@@ -6966,6 +7075,7 @@ static PyMethodDef SwigMethods_proxydocs[] = { + /* -------- TYPE CONVERSION AND EQUIVALENCE RULES (BEGIN) -------- */ + + static swig_type_info 
_swigt__p_char = {"_p_char", "char *", 0, 0, (void*)0, 0}; ++static swig_type_info _swigt__p_float = {"_p_float", "float *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_sentencepiece__SentenceIterator = {"_p_sentencepiece__SentenceIterator", "sentencepiece::SentenceIterator *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_sentencepiece__SentencePieceProcessor = {"_p_sentencepiece__SentencePieceProcessor", "sentencepiece::SentencePieceProcessor *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_sentencepiece__SentencePieceTrainer = {"_p_sentencepiece__SentencePieceTrainer", "sentencepiece::SentencePieceTrainer *", 0, 0, (void*)0, 0}; +@@ -6973,12 +7083,12 @@ static swig_type_info _swigt__p_std__string = {"_p_std__string", "sentencepiece: + static swig_type_info _swigt__p_std__unordered_mapT_std__string_std__string_t = {"_p_std__unordered_mapT_std__string_std__string_t", "std::unordered_map< std::string,std::string > *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_std__vectorT_absl__string_view_t = {"_p_std__vectorT_absl__string_view_t", "std::vector< absl::string_view > *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_std__vectorT_int_t = {"_p_std__vectorT_int_t", "std::vector< int > *", 0, 0, (void*)0, 0}; +-static swig_type_info _swigt__p_std__vectorT_std__string_t = {"_p_std__vectorT_std__string_t", "std::vector< std::string > *", 0, 0, (void*)0, 0}; ++static swig_type_info _swigt__p_std__vectorT_std__vectorT_absl__string_view_t_t = {"_p_std__vectorT_std__vectorT_absl__string_view_t_t", "std::vector< std::vector< absl::string_view > > *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_std__vectorT_std__vectorT_int_t_t = {"_p_std__vectorT_std__vectorT_int_t_t", "std::vector< std::vector< int > > *", 0, 0, (void*)0, 0}; +-static swig_type_info _swigt__p_std__vectorT_std__vectorT_std__string_t_t = {"_p_std__vectorT_std__vectorT_std__string_t_t", "std::vector< std::vector< std::string > > *", 0, 0, (void*)0, 0}; + + 
static swig_type_info *swig_type_initial[] = { + &_swigt__p_char, ++ &_swigt__p_float, + &_swigt__p_sentencepiece__SentenceIterator, + &_swigt__p_sentencepiece__SentencePieceProcessor, + &_swigt__p_sentencepiece__SentencePieceTrainer, +@@ -6986,12 +7096,12 @@ static swig_type_info *swig_type_initial[] = { + &_swigt__p_std__unordered_mapT_std__string_std__string_t, + &_swigt__p_std__vectorT_absl__string_view_t, + &_swigt__p_std__vectorT_int_t, +- &_swigt__p_std__vectorT_std__string_t, ++ &_swigt__p_std__vectorT_std__vectorT_absl__string_view_t_t, + &_swigt__p_std__vectorT_std__vectorT_int_t_t, +- &_swigt__p_std__vectorT_std__vectorT_std__string_t_t, + }; + + static swig_cast_info _swigc__p_char[] = { {&_swigt__p_char, 0, 0, 0},{0, 0, 0, 0}}; ++static swig_cast_info _swigc__p_float[] = { {&_swigt__p_float, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_sentencepiece__SentenceIterator[] = { {&_swigt__p_sentencepiece__SentenceIterator, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_sentencepiece__SentencePieceProcessor[] = { {&_swigt__p_sentencepiece__SentencePieceProcessor, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_sentencepiece__SentencePieceTrainer[] = { {&_swigt__p_sentencepiece__SentencePieceTrainer, 0, 0, 0},{0, 0, 0, 0}}; +@@ -6999,12 +7109,12 @@ static swig_cast_info _swigc__p_std__string[] = { {&_swigt__p_std__string, 0, 0 + static swig_cast_info _swigc__p_std__unordered_mapT_std__string_std__string_t[] = { {&_swigt__p_std__unordered_mapT_std__string_std__string_t, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_std__vectorT_absl__string_view_t[] = { {&_swigt__p_std__vectorT_absl__string_view_t, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_std__vectorT_int_t[] = { {&_swigt__p_std__vectorT_int_t, 0, 0, 0},{0, 0, 0, 0}}; +-static swig_cast_info _swigc__p_std__vectorT_std__string_t[] = { {&_swigt__p_std__vectorT_std__string_t, 0, 0, 0},{0, 0, 0, 0}}; ++static swig_cast_info 
_swigc__p_std__vectorT_std__vectorT_absl__string_view_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_absl__string_view_t_t, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_std__vectorT_std__vectorT_int_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_int_t_t, 0, 0, 0},{0, 0, 0, 0}}; +-static swig_cast_info _swigc__p_std__vectorT_std__vectorT_std__string_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_std__string_t_t, 0, 0, 0},{0, 0, 0, 0}}; + + static swig_cast_info *swig_cast_initial[] = { + _swigc__p_char, ++ _swigc__p_float, + _swigc__p_sentencepiece__SentenceIterator, + _swigc__p_sentencepiece__SentencePieceProcessor, + _swigc__p_sentencepiece__SentencePieceTrainer, +@@ -7012,9 +7122,8 @@ static swig_cast_info *swig_cast_initial[] = { + _swigc__p_std__unordered_mapT_std__string_std__string_t, + _swigc__p_std__vectorT_absl__string_view_t, + _swigc__p_std__vectorT_int_t, +- _swigc__p_std__vectorT_std__string_t, ++ _swigc__p_std__vectorT_std__vectorT_absl__string_view_t_t, + _swigc__p_std__vectorT_std__vectorT_int_t_t, +- _swigc__p_std__vectorT_std__vectorT_std__string_t_t, + }; + + +diff --git a/src/builder.cc b/src/builder.cc +index 0fc7f24..822f6fc 100644 +--- a/src/builder.cc ++++ b/src/builder.cc +@@ -272,7 +272,7 @@ util::Status Builder::DecompileCharsMap(absl::string_view blob, + } + + // static +-util::Status Builder::GetPrecompiledCharsMap(const std::string &name, ++util::Status Builder::GetPrecompiledCharsMap(absl::string_view name, + std::string *output) { + CHECK_OR_RETURN(output); + +diff --git a/src/builder.h b/src/builder.h +index 95c5168..094da72 100644 +--- a/src/builder.h ++++ b/src/builder.h +@@ -51,7 +51,7 @@ class Builder { + CharsMap *chars_map); + + // Returns a pre-compiled binary index with `name`. +- static util::Status GetPrecompiledCharsMap(const std::string &name, ++ static util::Status GetPrecompiledCharsMap(absl::string_view name, + std::string *output); + + // Makes a normalization mapping based on NFKC. 
+diff --git a/src/common.h b/src/common.h +index 6ec4c09..ab07d85 100644 +--- a/src/common.h ++++ b/src/common.h +@@ -71,8 +71,7 @@ char (&ArraySizeHelper(const T (&array)[N]))[N]; + namespace sentencepiece { + #ifdef OS_WIN + namespace win32 { +-std::wstring Utf8ToWide(const std::string &input); +-std::string WideToUtf8(const std::wstring &input); ++std::wstring Utf8ToWide(const absl::string_view input); + } // namespace win32 + #endif + +diff --git a/src/error.cc b/src/error.cc +index a226d98..10faa2d 100644 +--- a/src/error.cc ++++ b/src/error.cc +@@ -61,15 +61,10 @@ struct Status::Rep { + std::string error_message; + }; + +-Status::Status(StatusCode code, const char* error_message) : rep_(new Rep) { +- rep_->code = code; +- rep_->error_message = error_message; +-} +- +-Status::Status(StatusCode code, const std::string& error_message) ++Status::Status(StatusCode code, absl::string_view error_message) + : rep_(new Rep) { + rep_->code = code; +- rep_->error_message = error_message; ++ rep_->error_message = std::string(error_message); + } + + Status::Status(const Status& s) +diff --git a/src/sentencepiece_processor.cc b/src/sentencepiece_processor.cc +index 4d697be..331fc90 100644 +--- a/src/sentencepiece_processor.cc ++++ b/src/sentencepiece_processor.cc +@@ -48,6 +48,12 @@ const char kDefaultUnknownSymbol[] = " \xE2\x81\x87 "; + + // REPLACEMENT CHARACTER (U+FFFD) in UTF-8. + const char kReplacementCharacter[] = "\xef\xbf\xbd"; ++ ++std::vector ToPieceArray(const std::vector &v) { ++ std::vector out(v.size()); ++ for (int i = 0; i < v.size(); ++i) out[i] = v[i]; ++ return out; ++} + } // namespace + + SentencePieceProcessor::SentencePieceProcessor() {} +@@ -146,7 +152,7 @@ util::Status SentencePieceProcessor::status() const { + } + + util::Status SentencePieceProcessor::SetVocabulary( +- const std::vector &valid_vocab) { ++ const std::vector &valid_vocab) { + RETURN_IF_ERROR(status()); + + // TODO(taku): supports vocabulary constraint in BPE model. 
+@@ -154,7 +160,8 @@ util::Status SentencePieceProcessor::SetVocabulary( + CHECK_OR_RETURN(type == TrainerSpec::UNIGRAM || type == TrainerSpec::BPE) + << "Vocabulary constraint is only enabled in subword units."; + +- const std::set vocab(valid_vocab.begin(), valid_vocab.end()); ++ const std::set vocab(valid_vocab.begin(), ++ valid_vocab.end()); + + for (int i = 0; i < model_proto_->pieces_size(); ++i) { + auto *piece = model_proto_->mutable_pieces(i); +@@ -207,7 +214,7 @@ util::Status SentencePieceProcessor::LoadVocabulary(absl::string_view filename, + } + } + +- return SetVocabulary(vocab); ++ return SetVocabulary(ToPieceArray(vocab)); + } + + #define CHECK_OR_RETURN_STATUS_STL(container) \ +@@ -250,6 +257,12 @@ util::Status SentencePieceProcessor::Encode(absl::string_view input, + + util::Status SentencePieceProcessor::Decode( + const std::vector &pieces, std::string *detokenized) const { ++ return Decode(ToPieceArray(pieces), detokenized); ++} ++ ++util::Status SentencePieceProcessor::Decode( ++ const std::vector &pieces, ++ std::string *detokenized) const { + CHECK_OR_RETURN_STATUS_STL(detokenized); + + SentencePieceText spt; +@@ -593,6 +606,12 @@ util::Status SentencePieceProcessor::CalculateEntropy(absl::string_view input, + + util::Status SentencePieceProcessor::Decode( + const std::vector &pieces, SentencePieceText *spt) const { ++ return Decode(ToPieceArray(pieces), spt); ++} ++ ++util::Status SentencePieceProcessor::Decode( ++ const std::vector &pieces, ++ SentencePieceText *spt) const { + CHECK_OR_RETURN_STATUS_PROTO(spt); + + const char *unk_surface = kDefaultUnknownSymbol; +@@ -637,9 +656,9 @@ util::Status SentencePieceProcessor::Decode( + has_bos_ws); + }; + +- for (const std::string &w : pieces) { ++ for (absl::string_view w : pieces) { + auto *sp = spt->add_pieces(); +- sp->set_piece(w); ++ sp->mutable_piece()->assign(w.data(), w.size()); + sp->set_id(PieceToId(w)); + } + +@@ -779,6 +798,13 @@ std::string 
SentencePieceProcessor::DecodePiecesAsSerializedProto( + return spt.SerializeAsString(); + } + ++std::string SentencePieceProcessor::DecodePiecesAsSerializedProto( ++ const std::vector &pieces) const { ++ SentencePieceText spt; ++ if (!Decode(pieces, &spt).ok()) return ""; ++ return spt.SerializeAsString(); ++} ++ + std::string SentencePieceProcessor::DecodeIdsAsSerializedProto( + const std::vector &ids) const { + SentencePieceText spt; +diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h +index 9d38214..8c72656 100644 +--- a/src/sentencepiece_processor.h ++++ b/src/sentencepiece_processor.h +@@ -22,9 +22,11 @@ + #include + #include + ++#ifndef SWIG + namespace absl { + using std::string_view; + } ++#endif // SWIG + + namespace sentencepiece { + +@@ -58,8 +60,7 @@ class Status { + public: + Status(); + ~Status(); +- Status(StatusCode code, const char *error_message); +- Status(StatusCode code, const std::string &error_message); ++ Status(StatusCode code, absl::string_view error_message); + Status(const Status &s); + void operator=(const Status &s); + bool operator==(const Status &s) const; +@@ -204,7 +205,7 @@ class SentencePieceProcessor { + // Restricts the vocabulary set. + // The input sentences are encoded into the tokens in `valid_vocab`. + virtual util::Status SetVocabulary( +- const std::vector &valid_vocab); ++ const std::vector &valid_vocab); + + // Reverts the vocabulary restriction. + virtual util::Status ResetVocabulary(); +@@ -230,6 +231,10 @@ class SentencePieceProcessor { + virtual util::Status Decode(const std::vector &pieces, + std::string *detokenized) const; + ++ // Given a sequence of pieces, decodes it into a detokenized output. ++ virtual util::Status Decode(const std::vector &pieces, ++ std::string *detokenized) const; ++ + // Given a sequence of ids, decodes it into a detokenized output. 
+ virtual util::Status Decode(const std::vector &ids, + std::string *detokenized) const; +@@ -320,16 +325,19 @@ class SentencePieceProcessor { + absl::string_view input, int samples, float theta, bool wor, + bool include_best, NBestSentencePieceText *samples_spt) const; + +-#ifndef SWIG + // Calculate entropy of possible tokenisations + virtual util::Status CalculateEntropy(absl::string_view input, float theta, + float *entropy) const; +-#endif + + // Given a sequence of pieces, decodes it into SentencePieceText. ++ // TODO(taku): Remove this API and use std::vector + virtual util::Status Decode(const std::vector &pieces, + SentencePieceText *spt) const; + ++ // Given a sequence of pieces, decodes it into SentencePieceText. ++ virtual util::Status Decode(const std::vector &pieces, ++ SentencePieceText *spt) const; ++ + // Given a sequence of ids, decodes it into SentencePieceText. + virtual util::Status Decode(const std::vector &ids, + SentencePieceText *spt) const; +@@ -401,11 +409,17 @@ class SentencePieceProcessor { + theta, wor, include_best); + } + ++ // TODO(taku): Remove this API and use std::vector + virtual std::string DecodePieces( + const std::vector &pieces) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces); + } + ++ virtual std::string DecodePieces( ++ const std::vector &pieces) const { ++ DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces); ++ } ++ + virtual std::string DecodeIds(const std::vector &ids) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, ids); + } +@@ -428,9 +442,13 @@ class SentencePieceProcessor { + virtual util::bytes NBestEncodeAsSerializedProto(absl::string_view input, + int nbest_size) const; + ++ // TODO(taku): Remove this API and use std::vector + virtual util::bytes DecodePiecesAsSerializedProto( + const std::vector &pieces) const; + ++ virtual util::bytes DecodePiecesAsSerializedProto( ++ const std::vector &pieces) const; ++ + virtual util::bytes DecodeIdsAsSerializedProto( + const std::vector 
&ids) const; + +diff --git a/src/sentencepiece_trainer.h b/src/sentencepiece_trainer.h +index bb74ab9..b4af6f0 100644 +--- a/src/sentencepiece_trainer.h ++++ b/src/sentencepiece_trainer.h +@@ -129,12 +129,12 @@ class SentencePieceTrainer { + // with comma-separated values. `field_name` must not be a nested message. + // The body of these functions are automatically generated with + // data/gen_spec_parser.pl +- static util::Status SetProtoField(const std::string &name, +- const std::string &value, ++ static util::Status SetProtoField(absl::string_view name, ++ absl::string_view value, + TrainerSpec *message); + +- static util::Status SetProtoField(const std::string &name, +- const std::string &value, ++ static util::Status SetProtoField(absl::string_view name, ++ absl::string_view value, + NormalizerSpec *message); + + // Populates model type from string representation, e.g., "bpe". +diff --git a/src/spec_parser.h b/src/spec_parser.h +index b5713fb..de8f72f 100644 +--- a/src/spec_parser.h ++++ b/src/spec_parser.h +@@ -25,10 +25,10 @@ + + namespace sentencepiece { + +-#define PARSE_STRING(param_name) \ +- if (name == #param_name) { \ +- message->set_##param_name(value); \ +- return util::OkStatus(); \ ++#define PARSE_STRING(param_name) \ ++ if (name == #param_name) { \ ++ message->set_##param_name(std::string(value)); \ ++ return util::OkStatus(); \ + } + + #define PARSE_REPEATED_STRING(param_name) \ +@@ -189,8 +189,8 @@ inline std::string PrintProto(const NormalizerSpec &message, + return os.str(); + } + +-util::Status SentencePieceTrainer::SetProtoField(const std::string &name, +- const std::string &value, ++util::Status SentencePieceTrainer::SetProtoField(absl::string_view name, ++ absl::string_view value, + TrainerSpec *message) { + CHECK_OR_RETURN(message); + +@@ -249,8 +249,8 @@ util::Status SentencePieceTrainer::SetProtoField(const std::string &name, + << "unknown field name \"" << name << "\" in TrainerSpec."; + } + +-util::Status 
SentencePieceTrainer::SetProtoField(const std::string &name, +- const std::string &value, ++util::Status SentencePieceTrainer::SetProtoField(absl::string_view name, ++ absl::string_view value, + NormalizerSpec *message) { + CHECK_OR_RETURN(message); + +diff --git a/src/spm_encode_main.cc b/src/spm_encode_main.cc +index 4d12a38..b0e508d 100644 +--- a/src/spm_encode_main.cc ++++ b/src/spm_encode_main.cc +@@ -92,13 +92,13 @@ int main(int argc, char *argv[]) { + absl::flat_hash_map vocab; + sentencepiece::SentencePieceText spt; + sentencepiece::NBestSentencePieceText nbest_spt; +- std::function process; ++ std::function process; + + const int nbest_size = absl::GetFlag(FLAGS_nbest_size); + const float alpha = absl::GetFlag(FLAGS_alpha); + + if (absl::GetFlag(FLAGS_generate_vocabulary)) { +- process = [&](const std::string &line) { ++ process = [&](absl::string_view line) { + CHECK_OK(sp.Encode(line, &spt)); + for (const auto &piece : spt.pieces()) { + if (!sp.IsUnknown(piece.id()) && !sp.IsControl(piece.id())) +@@ -106,47 +106,47 @@ int main(int argc, char *argv[]) { + } + }; + } else if (absl::GetFlag(FLAGS_output_format) == "piece") { +- process = [&](const std::string &line) { ++ process = [&](absl::string_view line) { + CHECK_OK(sp.Encode(line, &sps)); + output->WriteLine(absl::StrJoin(sps, " ")); + }; + } else if (absl::GetFlag(FLAGS_output_format) == "id") { +- process = [&](const std::string &line) { ++ process = [&](absl::string_view line) { + CHECK_OK(sp.Encode(line, &ids)); + output->WriteLine(absl::StrJoin(ids, " ")); + }; + } else if (absl::GetFlag(FLAGS_output_format) == "proto") { +- process = [&](const std::string &line) { CHECK_OK(sp.Encode(line, &spt)); }; ++ process = [&](absl::string_view line) { CHECK_OK(sp.Encode(line, &spt)); }; + } else if (absl::GetFlag(FLAGS_output_format) == "sample_piece") { +- process = [&](const std::string &line) { ++ process = [&](absl::string_view line) { + CHECK_OK(sp.SampleEncode(line, nbest_size, alpha, &sps)); + 
output->WriteLine(absl::StrJoin(sps, " ")); + }; + } else if (absl::GetFlag(FLAGS_output_format) == "sample_id") { +- process = [&](const std::string &line) { ++ process = [&](absl::string_view line) { + CHECK_OK(sp.SampleEncode(line, nbest_size, alpha, &ids)); + output->WriteLine(absl::StrJoin(ids, " ")); + }; + } else if (absl::GetFlag(FLAGS_output_format) == "sample_proto") { +- process = [&](const std::string &line) { ++ process = [&](absl::string_view line) { + CHECK_OK(sp.SampleEncode(line, nbest_size, alpha, &spt)); + }; + } else if (absl::GetFlag(FLAGS_output_format) == "nbest_piece") { +- process = [&](const std::string &line) { ++ process = [&](absl::string_view line) { + CHECK_OK(sp.NBestEncode(line, nbest_size, &nbest_sps)); + for (const auto &result : nbest_sps) { + output->WriteLine(absl::StrJoin(result, " ")); + } + }; + } else if (absl::GetFlag(FLAGS_output_format) == "nbest_id") { +- process = [&](const std::string &line) { ++ process = [&](absl::string_view line) { + CHECK_OK(sp.NBestEncode(line, nbest_size, &nbest_ids)); + for (const auto &result : nbest_ids) { + output->WriteLine(absl::StrJoin(result, " ")); + } + }; + } else if (absl::GetFlag(FLAGS_output_format) == "nbest_proto") { +- process = [&](const std::string &line) { ++ process = [&](absl::string_view line) { + CHECK_OK(sp.NBestEncode(line, nbest_size, &nbest_spt)); + }; + } else { +diff --git a/src/util.cc b/src/util.cc +index f99c73a..f54e8ba 100644 +--- a/src/util.cc ++++ b/src/util.cc +@@ -244,15 +244,16 @@ std::vector StrSplitAsCSV(absl::string_view text) { + + #ifdef OS_WIN + namespace win32 { +-std::wstring Utf8ToWide(const std::string &input) { +- int output_length = +- ::MultiByteToWideChar(CP_UTF8, 0, input.c_str(), -1, nullptr, 0); ++std::wstring Utf8ToWide(absl::string_view input) { ++ int output_length = ::MultiByteToWideChar( ++ CP_UTF8, 0, input.data(), static_cast(input.size()), nullptr, 0); + output_length = output_length <= 0 ? 
0 : output_length - 1; + if (output_length == 0) { + return L""; + } + std::unique_ptr input_wide(new wchar_t[output_length + 1]); +- const int result = ::MultiByteToWideChar(CP_UTF8, 0, input.c_str(), -1, ++ const int result = ::MultiByteToWideChar(CP_UTF8, 0, input.data(), ++ static_cast(input.size()), + input_wide.get(), output_length + 1); + std::wstring output; + if (result > 0) { +@@ -260,24 +261,6 @@ std::wstring Utf8ToWide(const std::string &input) { + } + return output; + } +- +-std::string WideToUtf8(const std::wstring &input) { +- const int output_length = ::WideCharToMultiByte(CP_UTF8, 0, input.c_str(), -1, +- nullptr, 0, nullptr, nullptr); +- if (output_length == 0) { +- return ""; +- } +- +- std::unique_ptr input_encoded(new char[output_length + 1]); +- const int result = +- ::WideCharToMultiByte(CP_UTF8, 0, input.c_str(), -1, input_encoded.get(), +- output_length + 1, nullptr, nullptr); +- std::string output; +- if (result > 0) { +- output.assign(input_encoded.get()); +- } +- return output; +-} + } // namespace win32 + #endif + } // namespace sentencepiece diff --git a/patches/0009-Fixed-build-break.patch b/patches/0009-Fixed-build-break.patch new file mode 100644 index 0000000..d807f8c --- /dev/null +++ b/patches/0009-Fixed-build-break.patch @@ -0,0 +1,21 @@ +From: Taku Kudo +Date: Wed, 15 Jun 2022 02:22:05 +0900 +Subject: Fixed build break. 
+ +Signed-off-by: Kentaro Hayashi +--- + src/common.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/src/common.h b/src/common.h +index ab07d85..c27c352 100644 +--- a/src/common.h ++++ b/src/common.h +@@ -26,6 +26,7 @@ + #include + + #include "config.h" ++#include "third_party/absl/strings/string_view.h" + + #if defined(_WIN32) && !defined(__CYGWIN__) + #define OS_WIN diff --git a/patches/0010-Added-ImmutableSentencePiece-class.patch b/patches/0010-Added-ImmutableSentencePiece-class.patch new file mode 100644 index 0000000..d73f3d1 --- /dev/null +++ b/patches/0010-Added-ImmutableSentencePiece-class.patch @@ -0,0 +1,1648 @@ +From: Taku Kudo +Date: Mon, 20 Jun 2022 00:55:46 +0900 +Subject: Added ImmutableSentencePiece class + +Signed-off-by: Kentaro Hayashi +--- + src/bpe_model.cc | 6 +- + src/model_interface.h | 26 +-- + src/model_interface_test.cc | 19 +-- + src/sentencepiece_processor.cc | 173 ++++++++++++------- + src/sentencepiece_processor.h | 332 ++++++++++++++++++++++++++---------- + src/sentencepiece_processor_test.cc | 102 ++++++++--- + src/unigram_model.cc | 70 ++++---- + src/unigram_model.h | 15 ++ + src/unigram_model_test.cc | 114 +++++++------ + src/util.h | 11 -- + 10 files changed, 557 insertions(+), 311 deletions(-) + +diff --git a/src/bpe_model.cc b/src/bpe_model.cc +index 22cd115..bc7ada1 100644 +--- a/src/bpe_model.cc ++++ b/src/bpe_model.cc +@@ -12,6 +12,8 @@ + // See the License for the specific language governing permissions and + // limitations under the License.! + ++#include "bpe_model.h" ++ + #include + #include + #include +@@ -19,7 +21,6 @@ + #include + #include + +-#include "bpe_model.h" + #include "freelist.h" + #include "third_party/absl/container/flat_hash_map.h" + #include "util.h" +@@ -71,8 +72,7 @@ std::vector> Model::SampleEncode( + // Reverse merge rules. + // key: merged symbol, value: pair of original symbols. 
+ absl::flat_hash_map, +- string_util::string_view_hash> ++ std::pair> + rev_merge; + + // Pre-allocates SymbolPair for efficiency. +diff --git a/src/model_interface.h b/src/model_interface.h +index 06b3a65..06e9243 100644 +--- a/src/model_interface.h ++++ b/src/model_interface.h +@@ -53,8 +53,8 @@ class ModelProto; + // Given a normalized string, returns a sequence of sentence pieces with ids. + class ModelInterface { + public: +- using PieceToIdMap = absl::flat_hash_map; ++ using PieceToIdMap = absl::flat_hash_map; ++ // string_util::string_view_hash>; + + absl::string_view unk_piece() const; + absl::string_view bos_piece() const; +@@ -77,19 +77,6 @@ class ModelInterface { + return matcher_.get(); + } + +- // Sets the encoder version. Currently only unigram has an optimized encoder. +- // The optimized version is always used by default if there is one, so +- // normally users do not need to call this function. This function is provided +- // just in case that a user want to manually choose which encoder version to +- // use. +- virtual util::Status SetEncoderVersion(EncoderVersion encoder_version) { +- encoder_version_ = encoder_version; +- return util::OkStatus(); +- } +- +- // Returns the current encoder version in use. +- virtual EncoderVersion GetEncoderVersion() const { return encoder_version_; } +- + // Given a normalized string, returns a sequence of sentence pieces with ids. + // The concatenation of pieces must be the same as `normalized`. + virtual EncodeResult Encode(absl::string_view normalized) const = 0; +@@ -123,10 +110,9 @@ class ModelInterface { + } + + // Calculates the entropy of the segmentation lattice with inverse temperature +- // `theta`. +- // Uses a novel dynamic program to calculate the entropy. ++ // `alpha`. Uses a novel dynamic program to calculate the entropy. 
+ virtual float CalculateEntropy(absl::string_view normalized, +- float theta) const { ++ float alpha) const { + LOG(ERROR) << "Not implemented."; + return 0.0; + } +@@ -256,10 +242,6 @@ class ModelInterface { + // unknown id. + int unk_id_ = 0; + +- // The encoder version. Currently it is only effective for unigram model but +- // ignored by other models. +- EncoderVersion encoder_version_ = EncoderVersion::kOptimized; +- + // status. + util::Status status_; + }; +diff --git a/src/model_interface_test.cc b/src/model_interface_test.cc +index 69ee4e6..09e41d3 100644 +--- a/src/model_interface_test.cc ++++ b/src/model_interface_test.cc +@@ -12,8 +12,9 @@ + // See the License for the specific language governing permissions and + // limitations under the License.! + +-#include "model_factory.h" + #include "model_interface.h" ++ ++#include "model_factory.h" + #include "testharness.h" + #include "third_party/absl/container/flat_hash_map.h" + #include "util.h" +@@ -481,22 +482,6 @@ TEST(ModelInterfaceTest, PieceToByteTest) { + EXPECT_EQ(PieceToByte("a"), -1); + } + +-TEST(ModelInterfaceTest, SetEncoderVersion) { +- for (const auto type : kModelTypes) { +- ModelProto model_proto = MakeBaseModelProto(type); +- AddPiece(&model_proto, "a"); +- AddPiece(&model_proto, "b"); +- auto model = ModelFactory::Create(model_proto); +- +- // Verify the default encoder version. +- EXPECT_EQ(EncoderVersion::kOptimized, model->GetEncoderVersion()); +- +- // Set the encoder version to original and verify. 
+- EXPECT_TRUE(model->SetEncoderVersion(EncoderVersion::kOriginal).ok()); +- EXPECT_EQ(EncoderVersion::kOriginal, model->GetEncoderVersion()); +- } +-} +- + TEST(ModelInterfaceTest, VerifyOutputsEquivalent) { + for (const auto type : kModelTypes) { + ModelProto model_proto = MakeBaseModelProto(type); +diff --git a/src/sentencepiece_processor.cc b/src/sentencepiece_processor.cc +index 331fc90..a6f5395 100644 +--- a/src/sentencepiece_processor.cc ++++ b/src/sentencepiece_processor.cc +@@ -56,6 +56,112 @@ std::vector ToPieceArray(const std::vector &v) { + } + } // namespace + ++ImmutableSentencePieceText::ImmutableSentencePieceText() {} ++ImmutableSentencePieceText::~ImmutableSentencePieceText() {} ++ ++ImmutableSentencePieceText::ImmutableSentencePieceText( ++ const SentencePieceText &spt) ++ : spt_(&spt) {} ++ ++ImmutableSentencePieceText::ImmutableSentencePiece::ImmutableSentencePiece( ++ const SentencePieceText_SentencePiece &sp) ++ : sp_(&sp) {} ++ ++absl::string_view ImmutableSentencePieceText::ImmutableSentencePiece::piece() ++ const { ++ return sp_->piece(); ++} ++ ++absl::string_view ImmutableSentencePieceText::ImmutableSentencePiece::surface() ++ const { ++ return sp_->surface(); ++} ++ ++uint32_t ImmutableSentencePieceText::ImmutableSentencePiece::id() const { ++ return sp_->id(); ++} ++ ++uint32_t ImmutableSentencePieceText::ImmutableSentencePiece::begin() const { ++ return sp_->begin(); ++} ++ ++uint32_t ImmutableSentencePieceText::ImmutableSentencePiece::end() const { ++ return sp_->end(); ++} ++ ++std::vector ++ImmutableSentencePieceText::pieces() const { ++ std::vector pieces; ++ if (spt_ == nullptr) return pieces; ++ pieces.reserve(spt_->pieces_size()); ++ for (int i = 0; i < spt_->pieces_size(); ++i) ++ pieces[i] = ImmutableSentencePiece(spt_->pieces(i)); ++ return pieces; ++} ++ ++size_t ImmutableSentencePieceText::pieces_size() const { ++ return spt_ ? 
spt_->pieces_size() : 0; ++} ++ ++ImmutableSentencePieceText::ImmutableSentencePiece ++ImmutableSentencePieceText::pieces(int index) const { ++ return ImmutableSentencePieceText::ImmutableSentencePiece( ++ spt_->pieces(index)); ++} ++ ++absl::string_view ImmutableSentencePieceText::text() const { ++ return spt_ ? spt_->text() : ""; ++} ++ ++float ImmutableSentencePieceText::score() const { ++ return spt_ ? spt_->score() : 0.0; ++} ++ ++SentencePieceText *ImmutableSentencePieceText::mutable_proto() { ++ if (rep_ == nullptr) { ++ rep_ = std::make_shared(); ++ spt_ = rep_.get(); ++ } ++ return rep_.get(); ++} ++ ++std::string ImmutableSentencePieceText::SerializeAsString() const { ++ return spt_ ? spt_->SerializeAsString() : ""; ++} ++ ++ImmutableNBestSentencePieceText::ImmutableNBestSentencePieceText() {} ++ImmutableNBestSentencePieceText::~ImmutableNBestSentencePieceText() {} ++ ++size_t ImmutableNBestSentencePieceText::nbests_size() const { ++ return rep_ ? rep_->nbests_size() : 0; ++} ++ ++ImmutableSentencePieceText ImmutableNBestSentencePieceText::nbests( ++ int index) const { ++ return ImmutableSentencePieceText(rep_->nbests(index)); ++} ++ ++std::vector ++ImmutableNBestSentencePieceText::nbests() const { ++ std::vector nbests; ++ if (rep_ == nullptr) return nbests; ++ nbests.reserve(rep_->nbests_size()); ++ for (int i = 0; i < rep_->nbests_size(); ++i) ++ nbests[i] = ImmutableSentencePieceText(rep_->nbests(i)); ++ return nbests; ++} ++ ++NBestSentencePieceText *ImmutableNBestSentencePieceText::mutable_proto() { ++ if (rep_ == nullptr) { ++ rep_ = std::make_shared(); ++ } ++ return rep_.get(); ++} ++ ++std::string ImmutableNBestSentencePieceText::SerializeAsString() const { ++ return rep_ ? 
rep_->SerializeAsString() : ""; ++} ++ + SentencePieceProcessor::SentencePieceProcessor() {} + SentencePieceProcessor::~SentencePieceProcessor() {} + +@@ -124,15 +230,6 @@ util::Status SentencePieceProcessor::Load( + return util::OkStatus(); + } + +-util::Status SentencePieceProcessor::SetEncoderVersion( +- EncoderVersion encoder_version) { +- return model_->SetEncoderVersion(encoder_version); +-} +- +-EncoderVersion SentencePieceProcessor::GetEncoderVersion() const { +- return model_->GetEncoderVersion(); +-} +- + util::Status SentencePieceProcessor::SetEncodeExtraOptions( + absl::string_view extra_options) { + return ParseExtraOptions(extra_options, &encode_extra_options_); +@@ -348,14 +445,14 @@ util::Status SentencePieceProcessor::SampleEncode(absl::string_view input, + } + + util::Status SentencePieceProcessor::SampleEncodeAndScore( +- absl::string_view input, int num_samples, float theta, bool wor, ++ absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best, + std::vector, float>> *pieces) const { + CHECK_OR_RETURN_STATUS_STL(pieces); + + NBestSentencePieceText spt; + RETURN_IF_ERROR( +- SampleEncodeAndScore(input, num_samples, theta, wor, include_best, &spt)); ++ SampleEncodeAndScore(input, num_samples, alpha, wor, include_best, &spt)); + + pieces->clear(); + pieces->reserve(spt.nbests_size()); +@@ -373,14 +470,14 @@ util::Status SentencePieceProcessor::SampleEncodeAndScore( + } + + util::Status SentencePieceProcessor::SampleEncodeAndScore( +- absl::string_view input, int num_samples, float theta, bool wor, ++ absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best, + std::vector, float>> *ids) const { + CHECK_OR_RETURN_STATUS_STL(ids); + + NBestSentencePieceText spt; + RETURN_IF_ERROR( +- SampleEncodeAndScore(input, num_samples, theta, wor, include_best, &spt)); ++ SampleEncodeAndScore(input, num_samples, alpha, wor, include_best, &spt)); + + ids->clear(); + ids->reserve(spt.nbests_size()); +@@ -568,7 
+665,7 @@ util::Status SentencePieceProcessor::SampleEncode( + } + + util::Status SentencePieceProcessor::SampleEncodeAndScore( +- absl::string_view input, int samples, float theta, bool wor, ++ absl::string_view input, int samples, float alpha, bool wor, + bool include_best, NBestSentencePieceText *samples_spt) const { + CHECK_OR_RETURN(model_->IsSampleEncodeAndScoreAvailable()) + << "SampleEncodeAndScore is not available for the current model."; +@@ -576,7 +673,7 @@ util::Status SentencePieceProcessor::SampleEncodeAndScore( + std::vector norm_to_orig; + RETURN_IF_ERROR(normalizer_->Normalize(input, &normalized, &norm_to_orig)); + +- const auto results = model_->SampleEncodeAndScore(normalized, theta, samples, ++ const auto results = model_->SampleEncodeAndScore(normalized, alpha, samples, + wor, include_best); + CHECK_OR_RETURN(!results.empty()) + << "SampleEncodeAndScore returns empty result."; +@@ -592,7 +689,7 @@ util::Status SentencePieceProcessor::SampleEncodeAndScore( + } + + util::Status SentencePieceProcessor::CalculateEntropy(absl::string_view input, +- float theta, ++ float alpha, + float *entropy) const { + CHECK_OR_RETURN(model_->IsCalculateEntropyAvailable()) + << "CalculateEntropy is not available for the current model."; +@@ -600,7 +697,7 @@ util::Status SentencePieceProcessor::CalculateEntropy(absl::string_view input, + std::vector norm_to_orig; + RETURN_IF_ERROR(normalizer_->Normalize(input, &normalized, &norm_to_orig)); + +- *entropy = model_->CalculateEntropy(normalized, theta); ++ *entropy = model_->CalculateEntropy(normalized, alpha); + return util::OkStatus(); + } + +@@ -770,48 +867,6 @@ util::Status SentencePieceProcessor::Decode(const std::vector &ids, + return Decode(pieces, spt); + } + +-std::string SentencePieceProcessor::EncodeAsSerializedProto( +- absl::string_view input) const { +- SentencePieceText spt; +- if (!Encode(input, &spt).ok()) return ""; +- return spt.SerializeAsString(); +-} +- +-std::string 
SentencePieceProcessor::SampleEncodeAsSerializedProto( +- absl::string_view input, int nbest_size, float alpha) const { +- SentencePieceText spt; +- if (!SampleEncode(input, nbest_size, alpha, &spt).ok()) return ""; +- return spt.SerializeAsString(); +-} +- +-std::string SentencePieceProcessor::NBestEncodeAsSerializedProto( +- absl::string_view input, int nbest_size) const { +- NBestSentencePieceText spt; +- if (!NBestEncode(input, nbest_size, &spt).ok()) return ""; +- return spt.SerializeAsString(); +-} +- +-std::string SentencePieceProcessor::DecodePiecesAsSerializedProto( +- const std::vector &pieces) const { +- SentencePieceText spt; +- if (!Decode(pieces, &spt).ok()) return ""; +- return spt.SerializeAsString(); +-} +- +-std::string SentencePieceProcessor::DecodePiecesAsSerializedProto( +- const std::vector &pieces) const { +- SentencePieceText spt; +- if (!Decode(pieces, &spt).ok()) return ""; +- return spt.SerializeAsString(); +-} +- +-std::string SentencePieceProcessor::DecodeIdsAsSerializedProto( +- const std::vector &ids) const { +- SentencePieceText spt; +- if (!Decode(ids, &spt).ok()) return ""; +- return spt.SerializeAsString(); +-} +- + #define CHECK_STATUS_OR_RETURN_DEFAULT(value) \ + if (!status().ok()) { \ + LOG(ERROR) << status().message() << "\nReturns default value " << value; \ +diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h +index 8c72656..51c5b3b 100644 +--- a/src/sentencepiece_processor.h ++++ b/src/sentencepiece_processor.h +@@ -29,11 +29,6 @@ using std::string_view; + #endif // SWIG + + namespace sentencepiece { +- +-#ifndef SWIG +-using EncodeResult = std::vector>; +-#endif // SWIG +- + namespace util { + + enum class StatusCode : int { +@@ -107,17 +102,17 @@ class Status { + // sp.Load("//path/to/model"); + // + // vector sps; +-// sp.Encode("hello world.", &sps); ++// sp.Encode("hello world.", &sps).IgnoreError(); + // + // vector ids; +-// sp.Encode("hello world.", &ids); ++// sp.Encode("hello world.", 
&ids).IgnoreError(); + // + // string detok; + // sp.Decode(sps, &detok); +-// CHECK_EQ("hello world.", detok); ++// CHECK_EQ("hello world.", detok).IgnoreError(); + // + // sp.Decode(ids, &detok); +-// CHECK_EQ("hello world.", detok); ++// CHECK_EQ("hello world.", detok).IgnoreError(); + // + // We can also use SentencePieceText which manages the byte-offsets + // between user input (output) and internal sentence pieces. +@@ -144,16 +139,6 @@ namespace normalizer { + class Normalizer; + } // namespace normalizer + +-#ifndef SWIG +-// Defines the multiple versions of encoder within each model. Currently only +-// the Unigram model has an optimized encoder. +-enum class EncoderVersion { +- kOptimized, // The optimized encoder (default). +- kOriginal // The original encoder (user may choose to fall back to this +- // just in case). +-}; +-#endif +- + #ifndef SWIGGO + namespace util { + // Redefine std::string for serialized_proto interface as Python's string is +@@ -161,7 +146,87 @@ namespace util { + // with SWIG's typemap. + using bytes = std::string; + } // namespace util +-#endif ++#endif // SWIGGO ++ ++class NBestSentencePieceText; ++class ModelInterface; ++class SentencePieceText; ++class SentencePieceText_SentencePiece; ++ ++// Wrapper class of SentencePieceText ++// This wrapper only allows an immutable access to the proto and ++// hides the actual implementation of protobuf. ++// See sentencepiece.proto for the details of this class. 
++class ImmutableSentencePieceText { ++ public: ++ ImmutableSentencePieceText(); ++ virtual ~ImmutableSentencePieceText(); ++ ++ class ImmutableSentencePiece { ++ public: ++ ~ImmutableSentencePiece() = default; ++ absl::string_view piece() const; ++ absl::string_view surface() const; ++ uint32_t id() const; ++ uint32_t begin() const; ++ uint32_t end() const; ++ ++ friend class ImmutableSentencePieceText; ++ ++ private: ++ ImmutableSentencePiece() = default; ++ explicit ImmutableSentencePiece(const SentencePieceText_SentencePiece &sp); ++ const SentencePieceText_SentencePiece *sp_ = nullptr; ++ }; ++ ++ std::vector pieces() const; ++ size_t pieces_size() const; ++ ImmutableSentencePiece pieces(int index) const; ++ absl::string_view text() const; ++ float score() const; ++ ++ std::string SerializeAsString() const; ++ ++ // Returns the actual mutable proto. ++ // Do not use this outside of SentencePieceProcessor, as ++ // it returns the raw pointer managed by the shared_ptr. ++ SentencePieceText *mutable_proto(); ++ ++ friend class ImmutableNBestSentencePieceText; ++ friend class SentencePieceProcessor; ++ ++ private: ++ explicit ImmutableSentencePieceText(const SentencePieceText &spt); ++ const SentencePieceText *spt_ = nullptr; ++ std::shared_ptr rep_; ++}; ++ ++// Wrapper class of SentencePieceText ++// This wrapper only allows an immutable access to the proto and ++// hides the actual implementation of protobuf. ++// See sentencepiece.proto for the details of this class. ++class ImmutableNBestSentencePieceText { ++ public: ++ ImmutableNBestSentencePieceText(); ++ virtual ~ImmutableNBestSentencePieceText(); ++ ++ std::vector nbests() const; ++ ++ size_t nbests_size() const; ++ ImmutableSentencePieceText nbests(int index) const; ++ ++ std::string SerializeAsString() const; ++ ++ // Returns the actual mutable proto. ++ // Do not use this outside of SentencePieceProcessor, as ++ // it returns the raw pointer managed by the shared_ptr. 
++ NBestSentencePieceText *mutable_proto(); ++ ++ friend class SentencePieceProcessor; ++ ++ private: ++ std::shared_ptr rep_; ++}; + + class SentencePieceProcessor { + public: +@@ -217,7 +282,7 @@ class SentencePieceProcessor { + int threshold); + + ////////////////////////////////////////////////////////////// +- // Simple API. ++ // Simple Encode and Decode API. + // + // Given a UTF8 input, encodes it into a sequence of sentence pieces. + virtual util::Status Encode(absl::string_view input, +@@ -239,18 +304,9 @@ class SentencePieceProcessor { + virtual util::Status Decode(const std::vector &ids, + std::string *detokenized) const; + +-#ifndef SWIG +- // Sets the encoder version. Normally users do not need to call this function. +- // But they can call this fucntion just in case if they want to fall back to +- // the original encoder. +- virtual util::Status SetEncoderVersion(EncoderVersion encoder_version); +- +- // Returns the current encoder version in use. +- virtual EncoderVersion GetEncoderVersion() const; +-#endif +- + ////////////////////////////////////////////////////////////// + // NBest API. ++ // + // Same as Encode, but returns nbest results. + virtual util::Status NBestEncode( + absl::string_view input, int nbest_size, +@@ -262,24 +318,24 @@ class SentencePieceProcessor { + + ////////////////////////////////////////////////////////////// + // Sampling API. ++ // + // Unigram and BPE support sampling mode. + // - Unigram (--model_type=unigram): +- // When `nbest_size` is positive value, approximately samples one +- // segmentation from nbest candidates. When `nbest_size` is negative value, +- // samples one segmentation from the hypotheses (Lattice) according to the +- // generation probabilities using forward-filtering and backward-sampling +- // algorithm. `alpha` is a smoothing parameter. The best segmentation +- // (Viterbi segmentation) is more likely sampled when setting larger +- // alpha. 
When alpha is 0.0, one segmentation is uniformly sampled from the +- // nbest or lattice. +- // `nbest_size` and `alpha` correspond to parameters `l` and `alpha` ++ // `nbest_size`: When `nbest_size` is positive value, approximately samples ++ // one segmentation from nbest candidates. When `nbest_size` is negative ++ // value, samples one segmentation from the hypotheses (Lattice) according to ++ // the generation probabilities using forward-filtering and backward-sampling ++ // algorithm. ++ // `alpha`: Smoothing parameter (inverse temperature). The best segmentation ++ // (Viterbi segmentation) is more likely sampled when setting larger alpha. ++ // When alpha is 0.0, one segmentation is uniformly sampled from the nbest or ++ // lattice. `nbest_size` and `alpha` correspond to parameters `l` and `alpha` + // in https://arxiv.org/abs/1804.10959 (nbest_size < 0 means l = infinity) + // + // - BPE (--model_type=bpe): +- // `alpha` is the dropout probability `p` of bpe merge operations +- // in https://arxiv.org/abs/1910.13267 +- // Nbest-based sampling is not supported so nbest_size parameter is ignored in +- // BPE. ++ // `alpha`: The dropout probability `p` of bpe merge operations in ++ // https://arxiv.org/abs/1910.13267 Nbest-based sampling is not supported so ++ // nbest_size parameter is ignored in BPE. + virtual util::Status SampleEncode(absl::string_view input, int nbest_size, + float alpha, + std::vector *pieces) const; +@@ -290,74 +346,104 @@ class SentencePieceProcessor { + + ////////////////////////////////////////////////////////////// + // SampleEncodeAndScore API. +- // Similar to SampleEncode, but returns samples results. ++ // ++ // Sample `samples` many tokenisations from the segmentation lattice. ++ // These methods are only available in model_type=unigram. ++ // ++ // `alpha`: smoothing parameter (inverse temperature). The same as `alpha` in ++ // `Sample` method. 
++ // 'wor`: If `wor` is true, the samples are taken without replacement, and the ++ // scores are the inclusion probabilities of the elements in the sample; ++ // otherwise the samples are taken with replacement and the scores are the ++ // log-probs of sample elements ++ // `include_best`: If `include_best` is true, the best tokenisation is always ++ // included in the sample, and the remaining elements are sampled excluding ++ // the best. + virtual util::Status SampleEncodeAndScore( +- absl::string_view input, int num_samples, float theta, bool wor, ++ absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best, + std::vector, float>> *pieces) const; + + // Same as above, but returns a sequence of ids. + virtual util::Status SampleEncodeAndScore( +- absl::string_view input, int num_samples, float theta, bool wor, ++ absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best, + std::vector, float>> *ids) const; + ++ ////////////////////////////////////////////////////////////// ++ // Entropy API. ++ // ++ // This only available in model_type=unigram. ++ // Calculate entropy of possible tokenisations ++ virtual util::Status CalculateEntropy(absl::string_view input, float alpha, ++ float *entropy) const; ++ + ////////////////////////////////////////////////////////////// + // Advanced API returning SentencePieceText, which manages + // utf8-byte alignments between user-input/detokenized text + // and internal sentencepiece sequence. + // + // Given a UTF8 input, encodes it into SentencePieceText. ++ // ++ // When using these APIs, sentencepiece.pb.h header files must be included. ++ // We can also use ImutableSentencePieceText as follows. 
++ // ++ // ImmutableSentencePieceText spt; ++ // Encode("hello", spt.mutable_proto()).IgnoreError(); ++ // std::cout << spt.pieces_size() << std::endl; + virtual util::Status Encode(absl::string_view input, + SentencePieceText *spt) const; + +- // Same as above, but returns NBestSentencePieceText. + virtual util::Status NBestEncode(absl::string_view input, int nbest_size, + NBestSentencePieceText *nbest_spt) const; + +- // Same as above, but samples one segmentation from the hypotheses +- // (Lattice). + virtual util::Status SampleEncode(absl::string_view input, int nbest_size, + float alpha, SentencePieceText *spt) const; + +- // Samples N segmentation and returns the scores as well + virtual util::Status SampleEncodeAndScore( +- absl::string_view input, int samples, float theta, bool wor, ++ absl::string_view input, int samples, float alpha, bool wor, + bool include_best, NBestSentencePieceText *samples_spt) const; + +- // Calculate entropy of possible tokenisations +- virtual util::Status CalculateEntropy(absl::string_view input, float theta, +- float *entropy) const; +- +- // Given a sequence of pieces, decodes it into SentencePieceText. +- // TODO(taku): Remove this API and use std::vector ++ // DEPRECATED: Remove this API and use std::vector + virtual util::Status Decode(const std::vector &pieces, + SentencePieceText *spt) const; + +- // Given a sequence of pieces, decodes it into SentencePieceText. + virtual util::Status Decode(const std::vector &pieces, + SentencePieceText *spt) const; + +- // Given a sequence of ids, decodes it into SentencePieceText. + virtual util::Status Decode(const std::vector &ids, + SentencePieceText *spt) const; + +- ////////////////////////////////////////////////////////////// +- // Handy methods that return the result directly. +- // These functions ignore internal errors. + #ifdef SWIG +-#define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) 
\ +- OutType output; \ +- const auto _status = FuncName(__VA_ARGS__, &output); \ +- if (!_status.ok()) throw _status; \ +- return output; ++#define SPP_SWIG_CHECK_AND_THROW \ ++ if (!status.ok()) throw status; + #else ++#define SPP_SWIG_CHECK_AND_THROW \ ++ if (!status.ok()) { \ ++ } ++#endif // SWIG ++ + #define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \ + OutType output; \ +- FuncName(__VA_ARGS__, &output).IgnoreError(); \ ++ const auto status = FuncName(__VA_ARGS__, &output); \ ++ SPP_SWIG_CHECK_AND_THROW; \ ++ return output; ++ ++#define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...) \ ++ OutType output; \ ++ const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ ++ SPP_SWIG_CHECK_AND_THROW; \ ++ return output.SerializeAsString(); ++ ++#define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...) \ ++ OutType output; \ ++ const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ ++ SPP_SWIG_CHECK_AND_THROW; \ + return output; +-#endif + ++ ////////////////////////////////////////////////////////////// ++ // Handy methods that return the result directly. ++ // These functions ignore internal errors. 
+ virtual std::vector EncodeAsPieces( + absl::string_view input) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(Encode, std::vector, input); +@@ -395,21 +481,21 @@ class SentencePieceProcessor { + + virtual std::vector, float>> + SampleEncodeAndScoreAsPieces(absl::string_view input, int num_samples, +- float theta, bool wor, bool include_best) const { ++ float alpha, bool wor, bool include_best) const { + using _T = std::vector, float>>; + DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncodeAndScore, _T, input, num_samples, +- theta, wor, include_best); ++ alpha, wor, include_best); + } + + virtual std::vector, float>> + SampleEncodeAndScoreAsIds(absl::string_view input, int num_samples, +- float theta, bool wor, bool include_best) const { ++ float alpha, bool wor, bool include_best) const { + using _T = std::vector, float>>; + DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncodeAndScore, _T, input, num_samples, +- theta, wor, include_best); ++ alpha, wor, include_best); + } + +- // TODO(taku): Remove this API and use std::vector ++ // DEPRECATED: Remove this API and use std::vector + virtual std::string DecodePieces( + const std::vector &pieces) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces); +@@ -424,33 +510,104 @@ class SentencePieceProcessor { + DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, ids); + } + +- virtual float CalculateEntropy(absl::string_view text, float theta) const { +- DEFINE_SPP_DIRECT_FUNC_IMPL(CalculateEntropy, float, text, theta); ++ virtual float CalculateEntropy(absl::string_view text, float alpha) const { ++ DEFINE_SPP_DIRECT_FUNC_IMPL(CalculateEntropy, float, text, alpha); + } + +-#undef DEFINE_SPP_DIRECT_FUNC_IMPL +- ++ ////////////////////////////////////////////////////////////// ++ // SerializedProto API. (DEPRECATED). Use ImmutableProto API. + // They are used in Python interface. Returns serialized proto. + // In python module, we can get access to the full Proto after + // deserialzing the returned byte sequence. 
+- virtual util::bytes EncodeAsSerializedProto(absl::string_view input) const; ++ virtual util::bytes EncodeAsSerializedProto(absl::string_view input) const { ++ DEFINE_SPP_SERIALIZED_PROTO_IMPL(Encode, ImmutableSentencePieceText, input); ++ } + + virtual util::bytes SampleEncodeAsSerializedProto(absl::string_view input, + int nbest_size, +- float alpha) const; ++ float alpha) const { ++ DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncode, ImmutableSentencePieceText, ++ input, nbest_size, alpha); ++ } + + virtual util::bytes NBestEncodeAsSerializedProto(absl::string_view input, +- int nbest_size) const; ++ int nbest_size) const { ++ DEFINE_SPP_SERIALIZED_PROTO_IMPL( ++ NBestEncode, ImmutableNBestSentencePieceText, input, nbest_size); ++ } ++ ++ virtual util::bytes SampleEncodeAndScoreAsSerializedProto( ++ absl::string_view input, int samples, float alpha, bool wor, ++ bool include_best, int nbest_size) const { ++ DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncodeAndScore, ++ ImmutableNBestSentencePieceText, input, ++ samples, alpha, wor, include_best); ++ } + + // TODO(taku): Remove this API and use std::vector + virtual util::bytes DecodePiecesAsSerializedProto( +- const std::vector &pieces) const; ++ const std::vector &pieces) const { ++ DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText, ++ pieces); ++ } + + virtual util::bytes DecodePiecesAsSerializedProto( +- const std::vector &pieces) const; ++ const std::vector &pieces) const { ++ DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText, ++ pieces); ++ } + + virtual util::bytes DecodeIdsAsSerializedProto( +- const std::vector &ids) const; ++ const std::vector &ids) const { ++ DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText, ids); ++ } ++ ++ ////////////////////////////////////////////////////////////// ++ // ImmutableProto API. 
++ virtual ImmutableSentencePieceText EncodeAsImmutableProto( ++ absl::string_view input) const { ++ DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Encode, ImmutableSentencePieceText, input); ++ } ++ ++ virtual ImmutableSentencePieceText SampleEncodeAsImmutableProto( ++ absl::string_view input, int nbest_size, float alpha) const { ++ DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncode, ImmutableSentencePieceText, ++ input, nbest_size, alpha); ++ } ++ ++ virtual ImmutableNBestSentencePieceText NBestEncodeAsImmutableProto( ++ absl::string_view input, int nbest_size) const { ++ DEFINE_SPP_IMMUTABLE_PROTO_IMPL( ++ NBestEncode, ImmutableNBestSentencePieceText, input, nbest_size); ++ } ++ ++ virtual ImmutableNBestSentencePieceText SampleEncodeAndScoreAsImmutableProto( ++ absl::string_view input, int samples, float alpha, bool wor, ++ bool include_best, int nbest_size) const { ++ DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncodeAndScore, ++ ImmutableNBestSentencePieceText, input, ++ samples, alpha, wor, include_best); ++ } ++ ++ // TODO(taku): Remove this API and use std::vector ++ virtual ImmutableSentencePieceText DecodePiecesAsImmutableProto( ++ const std::vector &pieces) const { ++ DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, pieces); ++ } ++ ++ virtual ImmutableSentencePieceText DecodePiecesAsImmutableProto( ++ const std::vector &pieces) const { ++ DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, pieces); ++ } ++ ++ virtual ImmutableSentencePieceText DecodeIdsAsImmutableProto( ++ const std::vector &ids) const { ++ DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, ids); ++ } ++ ++#undef DEFINE_SPP_DIRECT_FUNC_IMPL ++#undef DEFINE_SPP_SERIALIZED_PROTO_IMPL ++#undef DEFINE_SPP_IMMUTABLE_PROTO_IMPL + + ////////////////////////////////////////////////////////////// + // Vocabulary management methods. +@@ -467,7 +624,8 @@ class SentencePieceProcessor { + virtual const std::string &IdToPiece(int id) const; + + // Returns the score of `id`. 
+- // Usually score is an emission log probability of unigram language model. ++ // Usually score is an emission log probability of unigram language ++ // model. + virtual float GetScore(int id) const; + + // Returns true if `id` is unknown symbol. +@@ -506,7 +664,7 @@ class SentencePieceProcessor { + + // Allows injection of a normalizer instance. `normalizer` is moved. + void SetNormalizer(std::unique_ptr &&normalizer); +-#endif ++#endif // SWIG + + // Returns immutable model proto. Useful to obtain extended + // or experimental parameters encoded in model_proto. +diff --git a/src/sentencepiece_processor_test.cc b/src/sentencepiece_processor_test.cc +index d57ab5a..ed651f7 100644 +--- a/src/sentencepiece_processor_test.cc ++++ b/src/sentencepiece_processor_test.cc +@@ -12,6 +12,8 @@ + // See the License for the specific language governing permissions and + // limitations under the License.! + ++#include "sentencepiece_processor.h" ++ + #include + + #include "builder.h" +@@ -20,7 +22,6 @@ + #include "normalizer.h" + #include "sentencepiece.pb.h" + #include "sentencepiece_model.pb.h" +-#include "sentencepiece_processor.h" + #include "sentencepiece_trainer.h" + #include "testharness.h" + #include "third_party/absl/container/flat_hash_map.h" +@@ -551,10 +552,9 @@ TEST(SentencepieceProcessorTest, DecodeTest) { + int GetPieceSize() const override { return 7; } + + int PieceToId(absl::string_view piece) const override { +- static absl::flat_hash_map +- kMap = {{"", 0}, {"", 1}, {"", 2}, {WS "ABC", 3}, +- {WS "DE", 4}, {"F", 5}, {"G" WS "H", 6}}; ++ static absl::flat_hash_map kMap = { ++ {"", 0}, {"", 1}, {"", 2}, {WS "ABC", 3}, ++ {WS "DE", 4}, {"F", 5}, {"G" WS "H", 6}}; + return port::FindWithDefault(kMap, piece, 0); + } + +@@ -719,10 +719,9 @@ TEST(SentencepieceProcessorTest, DummyPrefixDecodeTest) { + int GetPieceSize() const override { return 7; } + + int PieceToId(absl::string_view piece) const override { +- static absl::flat_hash_map +- kMap = {{"", 0}, {"", 1}, 
{"", 2}, {WS "ABC", 3}, +- {WS "DE", 4}, {"F", 5}, {"G" WS "H", 6}, {WS, 7}}; ++ static absl::flat_hash_map kMap = { ++ {"", 0}, {"", 1}, {"", 2}, {WS "ABC", 3}, ++ {WS "DE", 4}, {"F", 5}, {"G" WS "H", 6}, {WS, 7}}; + return port::FindWithDefault(kMap, piece, 0); + } + +@@ -1058,18 +1057,6 @@ TEST(SentencePieceProcessorTest, EndToEndTest) { + EXPECT_EQ(2, sp.eos_id()); + EXPECT_EQ(-1, sp.pad_id()); + +- { +- // Verify the default encoder version. +- EXPECT_EQ(EncoderVersion::kOptimized, sp.GetEncoderVersion()); +- +- // Set the encoder version to original and verify. +- EXPECT_TRUE(sp.SetEncoderVersion(EncoderVersion::kOriginal).ok()); +- EXPECT_EQ(EncoderVersion::kOriginal, sp.GetEncoderVersion()); +- +- // Set back to the default encoder version. +- EXPECT_TRUE(sp.SetEncoderVersion(EncoderVersion::kOptimized).ok()); +- } +- + { + std::vector sps; + const std::vector expected_str = {WS, "ab", "c"}; +@@ -1574,4 +1561,77 @@ TEST(SentencePieceProcessorTest, VocabularyTest) { + EXPECT_FALSE(sp.IsUnused(6)); + EXPECT_FALSE(sp.IsUnused(7)); + } ++ ++TEST(SentencePieceProcessorTest, ImmutableSentencePieceTextTest) { ++ ImmutableSentencePieceText spt; ++ auto *v = spt.mutable_proto(); ++ ++ v->set_text("hello world"); ++ v->set_score(1.0); ++ for (int i = 0; i < 10; ++i) { ++ auto *p = v->add_pieces(); ++ p->set_surface(absl::StrCat("surface_", i)); ++ p->set_piece(absl::StrCat("surface_", i)); ++ p->set_id(i); ++ p->set_begin(i + 10); ++ p->set_end(i + 20); ++ } ++ ++ EXPECT_EQ(v->pieces_size(), spt.pieces_size()); ++ for (int i = 0; i < spt.pieces_size(); ++i) { ++ EXPECT_EQ(v->pieces(i).surface(), spt.pieces(i).surface()); ++ EXPECT_EQ(v->pieces(i).piece(), spt.pieces(i).piece()); ++ EXPECT_EQ(v->pieces(i).id(), spt.pieces(i).id()); ++ EXPECT_EQ(v->pieces(i).begin(), spt.pieces(i).begin()); ++ EXPECT_EQ(v->pieces(i).end(), spt.pieces(i).end()); ++ } ++ ++ int n = 0; ++ for (auto &p : spt.pieces()) { ++ EXPECT_EQ(v->pieces(n).surface(), p.surface()); ++ 
EXPECT_EQ(v->pieces(n).piece(), p.piece()); ++ EXPECT_EQ(v->pieces(n).id(), p.id()); ++ EXPECT_EQ(v->pieces(n).begin(), p.begin()); ++ EXPECT_EQ(v->pieces(n).end(), p.end()); ++ ++n; ++ } ++ ++ EXPECT_EQ(v->text(), spt.text()); ++ EXPECT_EQ(v->score(), spt.score()); ++ EXPECT_EQ(v->SerializeAsString(), spt.SerializeAsString()); ++ ++ // test copy. ++ auto spt2 = spt; ++ EXPECT_EQ(spt2.pieces_size(), spt.pieces_size()); ++ for (int i = 0; i < spt.pieces_size(); ++i) { ++ EXPECT_EQ(spt2.pieces(i).surface(), spt.pieces(i).surface()); ++ EXPECT_EQ(spt2.pieces(i).piece(), spt.pieces(i).piece()); ++ EXPECT_EQ(spt2.pieces(i).id(), spt.pieces(i).id()); ++ EXPECT_EQ(spt2.pieces(i).begin(), spt.pieces(i).begin()); ++ EXPECT_EQ(spt2.pieces(i).end(), spt.pieces(i).end()); ++ } ++} ++ ++TEST(SentencePieceProcessorTest, ImmutableNBestSentencePieceTextTest) { ++ ImmutableNBestSentencePieceText spt; ++ auto *v = spt.mutable_proto(); ++ for (int i = 0; i < 10; ++i) { ++ auto *p = v->add_nbests(); ++ p->set_text(absl::StrCat("text_", i)); ++ p->set_score(2.0 * i); ++ } ++ ++ EXPECT_EQ(v->nbests_size(), spt.nbests_size()); ++ for (int i = 0; i < v->nbests_size(); ++i) { ++ EXPECT_EQ(v->nbests(i).text(), spt.nbests(i).text()); ++ EXPECT_EQ(v->nbests(i).score(), spt.nbests(i).score()); ++ } ++ EXPECT_EQ(v->SerializeAsString(), spt.SerializeAsString()); ++ ++ // test copy. 
++ auto spt2 = spt; ++ EXPECT_EQ(spt2.nbests_size(), spt.nbests_size()); ++ EXPECT_EQ(spt2.SerializeAsString(), spt.SerializeAsString()); ++} ++ + } // namespace sentencepiece +diff --git a/src/unigram_model.cc b/src/unigram_model.cc +index ea48912..d9f1ce9 100644 +--- a/src/unigram_model.cc ++++ b/src/unigram_model.cc +@@ -198,16 +198,17 @@ Lattice::LatticePathWithScore Lattice::Viterbi() { + return retval; + } + +-std::vector Lattice::ForwardAlgorithm(float theta) const { ++std::vector Lattice::ForwardAlgorithm(float inv_theta) const { + const int len = size(); + std::vector alpha(node_allocator_.size(), 0.0); + + for (int pos = 0; pos <= len; ++pos) { + for (Node *rnode : begin_nodes_[pos]) { + for (Node *lnode : end_nodes_[pos]) { +- alpha[rnode->node_id] = LogSumExp( +- alpha[rnode->node_id], theta * lnode->score + alpha[lnode->node_id], +- lnode == end_nodes_[pos][0]); ++ alpha[rnode->node_id] = ++ LogSumExp(alpha[rnode->node_id], ++ inv_theta * lnode->score + alpha[lnode->node_id], ++ lnode == end_nodes_[pos][0]); + } + } + } +@@ -215,7 +216,7 @@ std::vector Lattice::ForwardAlgorithm(float theta) const { + return alpha; + } + +-std::vector Lattice::BackwardAlgorithm(float theta) const { ++std::vector Lattice::BackwardAlgorithm(float inv_theta) const { + const int len = size(); + std::vector beta(node_allocator_.size(), 0.0); + +@@ -260,17 +261,16 @@ float Lattice::PopulateMarginal(float freq, + return freq * Z; + } + +-float Lattice::CalculateEntropy(float theta) const { ++float Lattice::CalculateEntropy(float inv_theta) const { + const int len = size(); + + // alpha[node_id] is the marginal prob of sequence up to start of node + // H is entropy of sequence + // the index of alpha/H is Node::node_id. 
+- std::vector alpha(node_allocator_.size(), 0.0); + std::vector H(node_allocator_.size(), 0.0); + + // Populate the forward marginals to get the normalising constant +- alpha = ForwardAlgorithm(theta); ++ const auto alpha = ForwardAlgorithm(inv_theta); + + // Now populate the forward entropies + for (int pos = 0; pos <= len; ++pos) { +@@ -280,7 +280,7 @@ float Lattice::CalculateEntropy(float theta) const { + + // We have to normalise p(lnode) by the marginal contribution it makes + const float lnode_transition_prob = +- ((theta * lnode->score) + alpha[lnode->node_id] - ++ ((inv_theta * lnode->score) + alpha[lnode->node_id] - + alpha[rnode->node_id]); + H[rnode->node_id] += std::exp(lnode_transition_prob) * + (H[lnode->node_id] + lnode_transition_prob); +@@ -345,7 +345,7 @@ Hypothesis *CloneHypAndDependents( + + std::vector Lattice::NBest(size_t nbest_size, + bool sample, +- float theta) { ++ float inv_theta) { + if (nbest_size < 1) { + LOG(WARNING) << "nbest_size >= 1. Returns empty result."; + return {}; +@@ -391,7 +391,7 @@ std::vector Lattice::NBest(size_t nbest_size, + + if (sample) { + // Run forwards algorithm to get normalising constants +- alpha = ForwardAlgorithm(theta); ++ alpha = ForwardAlgorithm(inv_theta); + // f(eos) = Gumbel(0), as it is the perturbed score of the entire lattice. 
+ eos->fx = Gumbel(); + } else { +@@ -432,7 +432,8 @@ std::vector Lattice::NBest(size_t nbest_size, + for (int i = 0; i < end_nodes(node->pos).size(); i++) { + Node *lnode = end_nodes(node->pos)[i]; + // Calculate backwards transition score +- probs[i] = top->gx + alpha[lnode->node_id] + (theta * lnode->score) - Z; ++ probs[i] = ++ top->gx + alpha[lnode->node_id] + (inv_theta * lnode->score) - Z; + perturbed_probs[i] = probs[i] + Gumbel(); + if (perturbed_probs[i] > max_score) { + max_score = perturbed_probs[i]; +@@ -508,13 +509,13 @@ std::vector Lattice::NBest(size_t nbest_size, + return results; + } + +-std::vector Lattice::Sample(float theta) { ++std::vector Lattice::Sample(float inv_theta) { + const int len = size(); + if (len == 0) return {}; + + std::vector alpha(node_allocator_.size(), 0.0); + +- alpha = ForwardAlgorithm(theta); ++ alpha = ForwardAlgorithm(inv_theta); + + auto *mt = random::GetRandomGenerator(); + +@@ -526,8 +527,8 @@ std::vector Lattice::Sample(float theta) { + while (true) { + probs.clear(); + for (const Node *lnode : end_nodes_[node->pos]) { +- probs.push_back(std::exp(static_cast(alpha[lnode->node_id] + +- theta * lnode->score - Z))); ++ probs.push_back(std::exp(static_cast( ++ alpha[lnode->node_id] + inv_theta * lnode->score - Z))); + } + std::discrete_distribution dist(probs.begin(), probs.end()); + node = end_nodes_[node->pos][dist(*mt)]; +@@ -721,7 +722,7 @@ NBestEncodeResult Model::NBestEncode(absl::string_view normalized, + } + + EncodeResult Model::SampleEncode(absl::string_view normalized, +- float theta) const { ++ float inv_theta) const { + if (!status().ok() || normalized.empty()) { + return {}; + } +@@ -731,7 +732,7 @@ EncodeResult Model::SampleEncode(absl::string_view normalized, + PopulateNodes(&lattice); + + EncodeResult results; +- for (const auto *node : lattice.Sample(theta)) { ++ for (const auto *node : lattice.Sample(inv_theta)) { + results.emplace_back(node->piece, node->id); + } + +@@ -739,7 +740,7 @@ EncodeResult 
Model::SampleEncode(absl::string_view normalized, + } + + NBestEncodeResult Model::SampleEncodeAndScore(absl::string_view normalized, +- float theta, int samples, ++ float inv_theta, int samples, + bool wor, + bool include_best) const { + if (!status().ok() || normalized.empty()) { +@@ -750,16 +751,16 @@ NBestEncodeResult Model::SampleEncodeAndScore(absl::string_view normalized, + lattice.SetSentence(normalized); + PopulateNodes(&lattice); + +- std::vector alpha = lattice.ForwardAlgorithm(theta); +- float marginal = alpha[lattice.eos_node()->node_id]; ++ const std::vector alpha = lattice.ForwardAlgorithm(inv_theta); ++ const float marginal = alpha[lattice.eos_node()->node_id]; + + if (include_best) { + if (!wor) { +- LOG(FATAL) << "include_best not supported for wor false"; ++ LOG(ERROR) << "include_best not supported for wor false"; ++ return {}; + } + EncodeResult result; +- Lattice::LatticePathWithScore best_path = lattice.Viterbi(); +- ++ const auto best_path = lattice.Viterbi(); + for (const auto *node : best_path.first) { + result.emplace_back(node->piece, node->id); + } +@@ -770,8 +771,7 @@ NBestEncodeResult Model::SampleEncodeAndScore(absl::string_view normalized, + + if (wor) { + // Draw k+1 samples as we need perturbed score of k+1th element +- std::vector nbest_samples = +- lattice.NBest(samples + 1, true, theta); ++ auto nbest_samples = lattice.NBest(samples + 1, true, inv_theta); + + if (include_best) { + std::vector> nbest_paths( +@@ -780,14 +780,13 @@ NBestEncodeResult Model::SampleEncodeAndScore(absl::string_view normalized, + nbest_paths[i] = nbest_samples[i].first; + } + // Remove the best result from the samples if necessary +- Lattice::LatticePathWithScore best_path = lattice.Viterbi(); ++ const auto best_path = lattice.Viterbi(); + + const int index_of_best = + (std::find(nbest_paths.begin(), nbest_paths.end(), best_path.first) - + nbest_paths.begin()); + + if (index_of_best != nbest_samples.size()) { +- LOG(INFO) << "removing best path from 
samples"; + nbest_samples.erase(nbest_samples.begin() + index_of_best); + } else { + nbest_samples.pop_back(); +@@ -803,7 +802,7 @@ NBestEncodeResult Model::SampleEncodeAndScore(absl::string_view normalized, + float score = 0.0; + + for (const auto *node : nbest.first) { +- score += (theta * node->score); ++ score += (inv_theta * node->score); + result.emplace_back(node->piece, node->id); + } + +@@ -814,8 +813,8 @@ NBestEncodeResult Model::SampleEncodeAndScore(absl::string_view normalized, + for (auto &it : results) { + // Only modify non best sample inclusion probabilities. + if (it.second != 0.0) { +- double x = it.second - kappa; +- double y = std::exp(x); ++ const double x = it.second - kappa; ++ const double y = std::exp(x); + double inclusion_prob; + if (x <= -10) { + // Series expansion of the log Gumbel survival function up to eps. +@@ -835,10 +834,10 @@ NBestEncodeResult Model::SampleEncodeAndScore(absl::string_view normalized, + + float score = 0.0; + EncodeResult result; +- std::vector sample = lattice.Sample(theta); ++ const std::vector sample = lattice.Sample(inv_theta); + for (const auto *node : sample) { + result.emplace_back(node->piece, node->id); +- score += (theta * node->score); ++ score += (inv_theta * node->score); + } + results.emplace_back(result, score - marginal); + } +@@ -847,12 +846,13 @@ NBestEncodeResult Model::SampleEncodeAndScore(absl::string_view normalized, + return results; + } + +-float Model::CalculateEntropy(absl::string_view normalized, float theta) const { ++float Model::CalculateEntropy(absl::string_view normalized, ++ float inv_theta) const { + Lattice lattice; + lattice.SetSentence(normalized); + PopulateNodes(&lattice); + +- return lattice.CalculateEntropy(theta); ++ return lattice.CalculateEntropy(inv_theta); + } + + bool Model::VerifyOutputsEquivalent(absl::string_view expected, +diff --git a/src/unigram_model.h b/src/unigram_model.h +index 448e489..aa4f28f 100644 +--- a/src/unigram_model.h ++++ b/src/unigram_model.h 
+@@ -173,6 +173,18 @@ class Model : public ModelInterface { + bool VerifyOutputsEquivalent(absl::string_view expected, + absl::string_view actual) const override; + ++ enum EncoderVersion { ++ kOptimized, // The optimized encoder. ++ kOriginal // The original encoder. ++ }; ++ ++ void SetEncoderVersion(EncoderVersion encoder_version) { ++ encoder_version_ = encoder_version; ++ } ++ ++ // Returns the current encoder version in use. ++ EncoderVersion GetEncoderVersion() const { return encoder_version_; } ++ + protected: + // Builds a Trie index. + void BuildTrie(std::vector> *pieces); +@@ -195,6 +207,9 @@ class Model : public ModelInterface { + // Maximum size of the return value of Trie, which corresponds + // to the maximum size of shared common prefix in the sentence pieces. + int trie_results_size_; ++ ++ // encoder version. ++ EncoderVersion encoder_version_ = kOptimized; + }; + + } // namespace unigram +diff --git a/src/unigram_model_test.cc b/src/unigram_model_test.cc +index 8049d20..221bac2 100644 +--- a/src/unigram_model_test.cc ++++ b/src/unigram_model_test.cc +@@ -12,6 +12,8 @@ + // See the License for the specific language governing permissions and + // limitations under the License.! + ++#include "unigram_model.h" ++ + #include + #include + #include +@@ -22,7 +24,6 @@ + #include "testharness.h" + #include "third_party/absl/strings/str_cat.h" + #include "third_party/absl/strings/str_join.h" +-#include "unigram_model.h" + #include "util.h" + + namespace sentencepiece { +@@ -249,14 +250,14 @@ TEST(LatticeTest, NBestSampleTest) { + + // Calculate expected probabilities of each path + // Note that sampling without replacement affects the expected frequencies! 
+- const std::vector kTheta = {0.0, 0.01, 0.5, 0.7, 1.0}; +- for (const auto theta : kTheta) { ++ const std::vector kInv_Theta = {0.0, 0.01, 0.5, 0.7, 1.0}; ++ for (const auto inv_theta : kInv_Theta) { + std::vector strings = {"ABC", "AB C", "A BC", "A B C"}; + std::map probs; +- probs["ABC"] = std::exp(theta * 1.0); +- probs["AB C"] = std::exp(theta * (0.2 + 0.1)); +- probs["A BC"] = std::exp(theta * (0.0 + 0.5)); +- probs["A B C"] = std::exp(theta * (0.0 + 0.0 + 0.1)); ++ probs["ABC"] = std::exp(inv_theta * 1.0); ++ probs["AB C"] = std::exp(inv_theta * (0.2 + 0.1)); ++ probs["A BC"] = std::exp(inv_theta * (0.0 + 0.5)); ++ probs["A B C"] = std::exp(inv_theta * (0.0 + 0.0 + 0.1)); + + for (const auto &it : strings) { + EXPECT_EQ(1, probs.count(it)); +@@ -298,7 +299,7 @@ TEST(LatticeTest, NBestSampleTest) { + for (const auto num_samples : kNumSamples) { + std::map counts; + for (int i = 0; i < kTrials; i++) { +- auto nbests = lattice.NBest(num_samples, true, theta); ++ auto nbests = lattice.NBest(num_samples, true, inv_theta); + for (const auto &nbest : nbests) { + counts[GetTokenized(nbest.first)]++; + } +@@ -329,14 +330,14 @@ TEST(LatticeTest, CalculateEntropyTest) { + InsertWithScore(&lattice, 0, 3, 1.0); // ABC + + // Calculate expected probabilities of each path +- const std::vector kTheta = {0.0, 0.01, 0.5, 0.7, 1.0}; +- for (const auto theta : kTheta) { ++ const std::vector kInv_Theta = {0.0, 0.01, 0.5, 0.7, 1.0}; ++ for (const auto inv_theta : kInv_Theta) { + std::vector strings = {"ABC", "AB C", "A BC", "A B C"}; + std::map probs; +- probs["ABC"] = std::exp(theta * 1.0); +- probs["AB C"] = std::exp(theta * (0.2 + 0.1)); +- probs["A BC"] = std::exp(theta * (0.0 + 0.5)); +- probs["A B C"] = std::exp(theta * (0.0 + 0.0 + 0.1)); ++ probs["ABC"] = std::exp(inv_theta * 1.0); ++ probs["AB C"] = std::exp(inv_theta * (0.2 + 0.1)); ++ probs["A BC"] = std::exp(inv_theta * (0.0 + 0.5)); ++ probs["A B C"] = std::exp(inv_theta * (0.0 + 0.0 + 0.1)); + + double Z = 0.0; + 
for (const auto &it : probs) Z += it.second; +@@ -349,7 +350,7 @@ TEST(LatticeTest, CalculateEntropyTest) { + for (const auto &it : probs) { + entropy += (it.second * std::log(it.second)); + } +- EXPECT_NEAR(-entropy, lattice.CalculateEntropy(theta), 0.02); ++ EXPECT_NEAR(-entropy, lattice.CalculateEntropy(inv_theta), 0.02); + } + } + +@@ -364,9 +365,9 @@ TEST(LatticeTest, ForwardAlgorithmTest) { + InsertWithScore(&lattice, 1, 2, 0.5); // BC + InsertWithScore(&lattice, 0, 3, 1.0); // ABC + +- const std::vector kTheta = {0.0, 0.01, 0.5, 0.7, 1.0}; +- for (const auto theta : kTheta) { +- std::vector alpha = lattice.ForwardAlgorithm(theta); ++ const std::vector kInv_Theta = {0.0, 0.01, 0.5, 0.7, 1.0}; ++ for (const auto inv_theta : kInv_Theta) { ++ std::vector alpha = lattice.ForwardAlgorithm(inv_theta); + EXPECT_EQ(alpha.size(), 8); // 6 nodes, plus BOS, EOS + // only alpha[C], alpha[EOS] have non-zero alpha + for (int i : {0, 1, 2, 3}) { +@@ -374,14 +375,15 @@ TEST(LatticeTest, ForwardAlgorithmTest) { + if (i < 2) { + EXPECT_EQ(alpha[node->node_id], 0.0); + } else if (i == 2) { +- float Z = +- std::log(std::exp(theta * (0.0 + 0.0)) + std::exp(theta * 0.2)); ++ float Z = std::log(std::exp(inv_theta * (0.0 + 0.0)) + ++ std::exp(inv_theta * 0.2)); + EXPECT_EQ(alpha[node->node_id], Z); + } else if (i == 3) { +- float Z = std::log(std::exp(theta * (0.0 + 0.0 + 0.1)) + // A + B + C +- std::exp(theta * (0.2 + 0.1)) + // AB + C +- std::exp(theta * (0.0 + 0.5)) + // A + BC +- std::exp(theta * 1.0)); // ABC ++ float Z = ++ std::log(std::exp(inv_theta * (0.0 + 0.0 + 0.1)) + // A + B + C ++ std::exp(inv_theta * (0.2 + 0.1)) + // AB + C ++ std::exp(inv_theta * (0.0 + 0.5)) + // A + BC ++ std::exp(inv_theta * 1.0)); // ABC + EXPECT_EQ(Z, alpha[node->node_id]); + } + } +@@ -435,14 +437,14 @@ TEST(LatticeTest, SampleTest) { + InsertWithScoreAndId(&lattice, 1, 2, 1.7, 4); // BC + InsertWithScoreAndId(&lattice, 0, 3, 1.8, 5); // ABC + +- const std::vector kTheta = {0.0, 0.01, 0.5, 
0.7, 1.0}; +- for (int i = 0; i < kTheta.size(); ++i) { ++ const std::vector kInv_Theta = {0.0, 0.01, 0.5, 0.7, 1.0}; ++ for (int i = 0; i < kInv_Theta.size(); ++i) { + std::map probs; + // Expands all paths in the lattice. +- probs["A B C"] = exp(kTheta[i] * (1.0 + 1.2 + 1.5)); // A B C +- probs["AB C"] = exp(kTheta[i] * (1.6 + 1.5)); // AB C +- probs["A BC"] = exp(kTheta[i] * (1.0 + 1.7)); // A BC +- probs["ABC"] = exp(kTheta[i] * 1.8); // ABC ++ probs["A B C"] = exp(kInv_Theta[i] * (1.0 + 1.2 + 1.5)); // A B C ++ probs["AB C"] = exp(kInv_Theta[i] * (1.6 + 1.5)); // AB C ++ probs["A BC"] = exp(kInv_Theta[i] * (1.0 + 1.7)); // A BC ++ probs["ABC"] = exp(kInv_Theta[i] * 1.8); // ABC + + // Computes expected probabilities. + double Z = 0.0; +@@ -453,7 +455,7 @@ TEST(LatticeTest, SampleTest) { + constexpr int kTrial = 100000; + std::map freq; + for (int n = 0; n < kTrial; ++n) { +- freq[GetTokenized(lattice.Sample(kTheta[i]))]++; ++ freq[GetTokenized(lattice.Sample(kInv_Theta[i]))]++; + } + + EXPECT_EQ(probs.size(), freq.size()); +@@ -480,18 +482,18 @@ ModelProto MakeBaseModelProto() { + } + + // Returns model protos in parameterized tests. 
+-const std::vector &GetEncoderVersions() { +- static const std::vector &v = +- *new std::vector{EncoderVersion::kOptimized, +- EncoderVersion::kOriginal}; ++const std::vector &GetEncoderVersions() { ++ static const std::vector &v = ++ *new std::vector{Model::kOptimized, ++ Model::kOriginal}; + return v; + } + +-class UnigramModelTest : public test::TestWithParam { ++class UnigramModelTest : public test::TestWithParam { + protected: + void SetUp() override { encoder_version_ = GetParam(); } + void TearDown() override {} +- EncoderVersion encoder_version_; ++ Model::EncoderVersion encoder_version_; + }; + + void AddPiece(ModelProto *model_proto, const std::string &piece, +@@ -530,15 +532,15 @@ TEST(UnigramModelTest, SampleEncodeAndScoreTest) { + lattice.SetSentence("ABC"); + model.PopulateNodes(&lattice); + +- std::vector kTheta = {0.0, 1.0}; ++ std::vector kInv_Theta = {0.0, 1.0}; + +- for (const auto theta : kTheta) { ++ for (const auto inv_theta : kInv_Theta) { + std::vector strings = {"ABC", "AB C", "A BC", "A B C"}; + std::map probs; +- probs["ABC"] = std::exp(theta * 1.0); +- probs["AB C"] = std::exp(theta * (0.2 + 0.1)); +- probs["A BC"] = std::exp(theta * (0.0 + 0.5)); +- probs["A B C"] = std::exp(theta * (0.0 + 0.0 + 0.1)); ++ probs["ABC"] = std::exp(inv_theta * 1.0); ++ probs["AB C"] = std::exp(inv_theta * (0.2 + 0.1)); ++ probs["A BC"] = std::exp(inv_theta * (0.0 + 0.5)); ++ probs["A B C"] = std::exp(inv_theta * (0.0 + 0.0 + 0.1)); + + for (const auto &it : strings) { + EXPECT_EQ(1, probs.count(it)); +@@ -579,8 +581,8 @@ TEST(UnigramModelTest, SampleEncodeAndScoreTest) { + std::map scores; + int kTrials = 50000; + for (int i = 0; i < kTrials; i++) { +- NBestEncodeResult sample = +- model.SampleEncodeAndScore("ABC", theta, num_samples, true, false); ++ NBestEncodeResult sample = model.SampleEncodeAndScore( ++ "ABC", inv_theta, num_samples, true, false); + + for (const auto &it : sample) { + std::vector tokens; +@@ -619,7 +621,7 @@ TEST_P(UnigramModelTest, 
PieceToIdTest) { + AddPiece(&model_proto, "d", 0.4); + + Model model(model_proto); +- EXPECT_TRUE(model.SetEncoderVersion(encoder_version_).ok()); ++ model.SetEncoderVersion(encoder_version_); + + EXPECT_EQ(model_proto.SerializeAsString(), + model.model_proto().SerializeAsString()); +@@ -677,7 +679,7 @@ TEST_P(UnigramModelTest, PopulateNodesAllUnknownsTest) { + ModelProto model_proto = MakeBaseModelProto(); + AddPiece(&model_proto, "x"); + Model model(model_proto); +- EXPECT_TRUE(model.SetEncoderVersion(encoder_version_).ok()); ++ model.SetEncoderVersion(encoder_version_); + + Lattice lattice; + lattice.SetSentence("abc"); +@@ -701,7 +703,7 @@ TEST_P(UnigramModelTest, PopulateNodesTest) { + AddPiece(&model_proto, "bc", 0.4); // 6 + + Model model(model_proto); +- EXPECT_TRUE(model.SetEncoderVersion(encoder_version_).ok()); ++ model.SetEncoderVersion(encoder_version_); + + Lattice lattice; + lattice.SetSentence("abc"); +@@ -736,7 +738,7 @@ TEST_P(UnigramModelTest, PopulateNodesWithUnusedTest) { + model_proto.mutable_pieces(6)->set_type(ModelProto::SentencePiece::UNUSED); + + Model model(model_proto); +- EXPECT_TRUE(model.SetEncoderVersion(encoder_version_).ok()); ++ model.SetEncoderVersion(encoder_version_); + + Lattice lattice; + lattice.SetSentence("abc"); +@@ -761,7 +763,7 @@ TEST_P(UnigramModelTest, ModelNBestTest) { + AddPiece(&model_proto, "abc", 10.0); // 8 + + Model model(model_proto); +- EXPECT_TRUE(model.SetEncoderVersion(encoder_version_).ok()); ++ model.SetEncoderVersion(encoder_version_); + + auto nbest = model.NBestEncode("", 10); + EXPECT_EQ(1, nbest.size()); +@@ -800,7 +802,7 @@ TEST_P(UnigramModelTest, EncodeTest) { + ModelProto::SentencePiece::USER_DEFINED); + + Model model(model_proto); +- EXPECT_TRUE(model.SetEncoderVersion(encoder_version_).ok()); ++ model.SetEncoderVersion(encoder_version_); + + EncodeResult result; + +@@ -883,7 +885,7 @@ TEST_P(UnigramModelTest, EncodeWithUnusedTest) { + // No unused. 
+ { + Model model(model_proto); +- EXPECT_TRUE(model.SetEncoderVersion(encoder_version_).ok()); ++ model.SetEncoderVersion(encoder_version_); + const auto result = model.Encode("abcd"); + EXPECT_EQ(1, result.size()); + EXPECT_EQ("abcd", result[0].first); +@@ -892,7 +894,7 @@ TEST_P(UnigramModelTest, EncodeWithUnusedTest) { + { + model_proto.mutable_pieces(3)->set_type(ModelProto::SentencePiece::UNUSED); + Model model(model_proto); +- EXPECT_TRUE(model.SetEncoderVersion(encoder_version_).ok()); ++ model.SetEncoderVersion(encoder_version_); + const auto result = model.Encode("abcd"); + EXPECT_EQ(2, result.size()); + EXPECT_EQ("abc", result[0].first); +@@ -903,7 +905,7 @@ TEST_P(UnigramModelTest, EncodeWithUnusedTest) { + model_proto.mutable_pieces(3)->set_type(ModelProto::SentencePiece::UNUSED); + model_proto.mutable_pieces(5)->set_type(ModelProto::SentencePiece::UNUSED); + Model model(model_proto); +- EXPECT_TRUE(model.SetEncoderVersion(encoder_version_).ok()); ++ model.SetEncoderVersion(encoder_version_); + const auto result = model.Encode("abcd"); + EXPECT_EQ(2, result.size()); + EXPECT_EQ("abc", result[0].first); +@@ -917,7 +919,7 @@ TEST_P(UnigramModelTest, EncodeWithUnusedTest) { + model_proto.mutable_pieces(4)->set_type(ModelProto::SentencePiece::UNUSED); + model_proto.mutable_pieces(5)->set_type(ModelProto::SentencePiece::NORMAL); + Model model(model_proto); +- EXPECT_TRUE(model.SetEncoderVersion(encoder_version_).ok()); ++ model.SetEncoderVersion(encoder_version_); + const auto result = model.Encode("abcd"); + EXPECT_EQ(2, result.size()); + EXPECT_EQ("ab", result[0].first); +@@ -937,7 +939,7 @@ TEST_P(UnigramModelTest, VerifyOutputsEquivalent) { + AddPiece(&model_proto, "c", 2.0); // 9 + AddPiece(&model_proto, "d", 1.0); // 10 + Model model(model_proto); +- EXPECT_TRUE(model.SetEncoderVersion(encoder_version_).ok()); ++ model.SetEncoderVersion(encoder_version_); + // Equivalent outputs. 
+ EXPECT_TRUE(model.VerifyOutputsEquivalent("", "")); + EXPECT_TRUE(model.VerifyOutputsEquivalent("a b", "a b")); +diff --git a/src/util.h b/src/util.h +index 285676d..fb312f1 100644 +--- a/src/util.h ++++ b/src/util.h +@@ -60,17 +60,6 @@ uint32 GetRandomGeneratorSeed(); + // String utilities + namespace string_util { + +-struct string_view_hash { +- // DJB hash function. +- inline size_t operator()(const absl::string_view &sp) const { +- size_t hash = 5381; +- for (size_t i = 0; i < sp.size(); ++i) { +- hash = ((hash << 5) + hash) + sp[i]; +- } +- return hash; +- } +-}; +- + template + inline bool lexical_cast(absl::string_view arg, Target *result) { + std::stringstream ss; diff --git a/patches/0011-add-verbose-option.patch b/patches/0011-add-verbose-option.patch new file mode 100644 index 0000000..35f8025 --- /dev/null +++ b/patches/0011-add-verbose-option.patch @@ -0,0 +1,172 @@ +From: Taku Kudo +Date: Mon, 20 Jun 2022 01:35:11 +0900 +Subject: add verbose option + +Signed-off-by: Kentaro Hayashi +--- + .github/workflows/cmake.yml | 2 +- + src/common.h | 13 ------------- + src/normalizer.cc | 7 +++---- + src/sentencepiece_processor.cc | 10 ++++++---- + src/sentencepiece_processor.h | 9 +++------ + src/util.h | 1 - + 6 files changed, 13 insertions(+), 29 deletions(-) + +diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml +index 7f19083..5108074 100644 +--- a/.github/workflows/cmake.yml ++++ b/.github/workflows/cmake.yml +@@ -45,7 +45,7 @@ jobs: + + - name: Test + working-directory: ${{github.workspace}}/build +- run: ctest -C Release ++ run: ctest -C Release --output-on-failure + + - name: Package + working-directory: ${{github.workspace}}/build +diff --git a/src/common.h b/src/common.h +index c27c352..ba951d6 100644 +--- a/src/common.h ++++ b/src/common.h +@@ -98,15 +98,6 @@ class Die { + private: + bool die_; + }; +- +-template +-T &&CheckNotNull(const char *file, int line, const char *exprtext, T &&t) { +- if (t == nullptr) { +- std::cerr << 
file << "(" << line << ") " << exprtext; +- Abort(); +- } +- return std::forward(t); +-} + } // namespace error + + namespace logging { +@@ -158,10 +149,6 @@ inline const char *BaseName(const char *path) { + #define CHECK_LE(a, b) CHECK((a) <= (b)) + #define CHECK_GT(a, b) CHECK((a) > (b)) + #define CHECK_LT(a, b) CHECK((a) < (b)) +-#define CHECK_NOTNULL(val) \ +- ::sentencepiece::error::CheckNotNull( \ +- ::sentencepiece::logging::BaseName(__FILE__), __LINE__, \ +- "'" #val "' Must be non NULL", (val)) + + #define FRIEND_TEST(a, b) friend class a##_Test_##b; + +diff --git a/src/normalizer.cc b/src/normalizer.cc +index d87f89b..2ab8084 100644 +--- a/src/normalizer.cc ++++ b/src/normalizer.cc +@@ -12,11 +12,12 @@ + // See the License for the specific language governing permissions and + // limitations under the License.! + ++#include "normalizer.h" ++ + #include + #include + + #include "common.h" +-#include "normalizer.h" + #include "third_party/absl/memory/memory.h" + #include "third_party/absl/strings/match.h" + #include "third_party/absl/strings/string_view.h" +@@ -46,9 +47,7 @@ Normalizer::~Normalizer() {} + + void Normalizer::Init() { + absl::string_view index = spec_->precompiled_charsmap(); +- if (index.empty()) { +- LOG(INFO) << "precompiled_charsmap is empty. 
use identity normalization."; +- } else { ++ if (!index.empty()) { + absl::string_view trie_blob, normalized; + #ifdef IS_BIG_ENDIAN + status_ = DecodePrecompiledCharsMap(index, &trie_blob, &normalized, +diff --git a/src/sentencepiece_processor.cc b/src/sentencepiece_processor.cc +index a6f5395..805e0f9 100644 +--- a/src/sentencepiece_processor.cc ++++ b/src/sentencepiece_processor.cc +@@ -67,12 +67,12 @@ ImmutableSentencePieceText::ImmutableSentencePiece::ImmutableSentencePiece( + const SentencePieceText_SentencePiece &sp) + : sp_(&sp) {} + +-absl::string_view ImmutableSentencePieceText::ImmutableSentencePiece::piece() ++const std::string &ImmutableSentencePieceText::ImmutableSentencePiece::piece() + const { + return sp_->piece(); + } + +-absl::string_view ImmutableSentencePieceText::ImmutableSentencePiece::surface() ++const std::string &ImmutableSentencePieceText::ImmutableSentencePiece::surface() + const { + return sp_->surface(); + } +@@ -109,8 +109,10 @@ ImmutableSentencePieceText::pieces(int index) const { + spt_->pieces(index)); + } + +-absl::string_view ImmutableSentencePieceText::text() const { +- return spt_ ? 
spt_->text() : ""; ++const std::string &ImmutableSentencePieceText::text() const { ++ if (spt_) return spt_->text(); ++ static std::string *kEmptyString = new std::string(); ++ return *kEmptyString; + } + + float ImmutableSentencePieceText::score() const { +diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h +index 51c5b3b..8124c59 100644 +--- a/src/sentencepiece_processor.h ++++ b/src/sentencepiece_processor.h +@@ -165,8 +165,8 @@ class ImmutableSentencePieceText { + class ImmutableSentencePiece { + public: + ~ImmutableSentencePiece() = default; +- absl::string_view piece() const; +- absl::string_view surface() const; ++ const std::string &piece() const; ++ const std::string &surface() const; + uint32_t id() const; + uint32_t begin() const; + uint32_t end() const; +@@ -182,7 +182,7 @@ class ImmutableSentencePieceText { + std::vector pieces() const; + size_t pieces_size() const; + ImmutableSentencePiece pieces(int index) const; +- absl::string_view text() const; ++ const std::string &text() const; + float score() const; + + std::string SerializeAsString() const; +@@ -193,7 +193,6 @@ class ImmutableSentencePieceText { + SentencePieceText *mutable_proto(); + + friend class ImmutableNBestSentencePieceText; +- friend class SentencePieceProcessor; + + private: + explicit ImmutableSentencePieceText(const SentencePieceText &spt); +@@ -222,8 +221,6 @@ class ImmutableNBestSentencePieceText { + // it returns the raw pointer managed by the shared_ptr. 
+ NBestSentencePieceText *mutable_proto(); + +- friend class SentencePieceProcessor; +- + private: + std::shared_ptr rep_; + }; +diff --git a/src/util.h b/src/util.h +index fb312f1..01a561f 100644 +--- a/src/util.h ++++ b/src/util.h +@@ -94,7 +94,6 @@ inline bool lexical_cast(absl::string_view arg, std::string *result) { + + template + inline bool DecodePOD(absl::string_view str, T *result) { +- CHECK_NOTNULL(result); + if (sizeof(*result) != str.size()) { + return false; + } diff --git a/patches/0012-Supports-ImmutableSentencePieceText-from-python-modu.patch b/patches/0012-Supports-ImmutableSentencePieceText-from-python-modu.patch new file mode 100644 index 0000000..009b039 --- /dev/null +++ b/patches/0012-Supports-ImmutableSentencePieceText-from-python-modu.patch @@ -0,0 +1,4346 @@ +From: Taku Kudo +Date: Mon, 1 Aug 2022 17:19:09 +0900 +Subject: Supports ImmutableSentencePieceText from python module + +Signed-off-by: Kentaro Hayashi +--- + python/src/sentencepiece/__init__.py | 228 ++- + python/src/sentencepiece/sentencepiece.i | 310 ++- + python/src/sentencepiece/sentencepiece_wrap.cxx | 2310 ++++++++++++++++++----- + python/test/sentencepiece_test.py | 62 +- + src/sentencepiece_processor.cc | 87 +- + src/sentencepiece_processor.h | 61 +- + src/sentencepiece_processor_test.cc | 137 +- + 7 files changed, 2524 insertions(+), 671 deletions(-) + +diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py +index 1543d32..69a9825 100644 +--- a/python/src/sentencepiece/__init__.py ++++ b/python/src/sentencepiece/__init__.py +@@ -61,6 +61,98 @@ class _SwigNonDynamicMeta(type): + __setattr__ = _swig_setattr_nondynamic_class_variable(type.__setattr__) + + ++class ImmutableSentencePieceText_ImmutableSentencePiece(object): ++ thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") ++ __repr__ = _swig_repr ++ ++ def __init__(self): ++ 
_sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText_ImmutableSentencePiece()) ++ __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText_ImmutableSentencePiece ++ ++ def piece(self): ++ return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_piece(self) ++ ++ def surface(self): ++ return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_surface(self) ++ ++ def id(self): ++ return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_id(self) ++ ++ def begin(self): ++ return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_begin(self) ++ ++ def end(self): ++ return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_end(self) ++ ++# Register ImmutableSentencePieceText_ImmutableSentencePiece in _sentencepiece: ++_sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swigregister(ImmutableSentencePieceText_ImmutableSentencePiece) ++ ++class ImmutableSentencePieceText(object): ++ thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") ++ __repr__ = _swig_repr ++ ++ def __init__(self): ++ _sentencepiece.ImmutableSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText()) ++ __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText ++ ++ def pieces_size(self): ++ return _sentencepiece.ImmutableSentencePieceText_pieces_size(self) ++ ++ def text(self): ++ return _sentencepiece.ImmutableSentencePieceText_text(self) ++ ++ def score(self): ++ return _sentencepiece.ImmutableSentencePieceText_score(self) ++ ++ def SerializeAsString(self): ++ return _sentencepiece.ImmutableSentencePieceText_SerializeAsString(self) ++ ++ def pieces(self, index): ++ return _sentencepiece.ImmutableSentencePieceText_pieces(self, index) ++ ++ def __len__(self): ++ return self.pieces_size() ++ ++ def __getitem__(self, i): ++ return self.pieces(i) ++ ++ def 
__eq__(self, other): ++ return self.SerializeAsString() == other.SerializeAsString() ++ ++ ++# Register ImmutableSentencePieceText in _sentencepiece: ++_sentencepiece.ImmutableSentencePieceText_swigregister(ImmutableSentencePieceText) ++ ++class ImmutableNBestSentencePieceText(object): ++ thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") ++ __repr__ = _swig_repr ++ ++ def __init__(self): ++ _sentencepiece.ImmutableNBestSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableNBestSentencePieceText()) ++ __swig_destroy__ = _sentencepiece.delete_ImmutableNBestSentencePieceText ++ ++ def nbests_size(self): ++ return _sentencepiece.ImmutableNBestSentencePieceText_nbests_size(self) ++ ++ def SerializeAsString(self): ++ return _sentencepiece.ImmutableNBestSentencePieceText_SerializeAsString(self) ++ ++ def nbests(self, index): ++ return _sentencepiece.ImmutableNBestSentencePieceText_nbests(self, index) ++ ++ def __len__(self): ++ return self.nbests_size() ++ ++ def __getitem__(self, i): ++ return self.nbests(i) ++ ++ def __eq__(self, other): ++ return self.SerializeAsString() == other.SerializeAsString() ++ ++ ++# Register ImmutableNBestSentencePieceText in _sentencepiece: ++_sentencepiece.ImmutableNBestSentencePieceText_swigregister(ImmutableNBestSentencePieceText) ++ + class SentencePieceProcessor(object): + thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") + __repr__ = _swig_repr +@@ -87,12 +179,6 @@ class SentencePieceProcessor(object): + def LoadVocabulary(self, filename, threshold): + return _sentencepiece.SentencePieceProcessor_LoadVocabulary(self, filename, threshold) + +- def SampleEncodeAndScoreAsPieces(self, input, num_samples, theta, wor, include_best): +- return _sentencepiece.SentencePieceProcessor_SampleEncodeAndScoreAsPieces(self, input, num_samples, theta, wor, include_best) +- +- def SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, 
include_best): +- return _sentencepiece.SentencePieceProcessor_SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, include_best) +- + def CalculateEntropy(self, *args): + return _sentencepiece.SentencePieceProcessor_CalculateEntropy(self, *args) + +@@ -147,6 +233,9 @@ class SentencePieceProcessor(object): + def _EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + ++ def _EncodeAsImmutableProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__EncodeAsImmutableProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ + def _EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + +@@ -156,6 +245,9 @@ class SentencePieceProcessor(object): + def _EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + ++ def _EncodeAsImmutableProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__EncodeAsImmutableProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ + def _DecodeIds(self, ids): + return _sentencepiece.SentencePieceProcessor__DecodeIds(self, ids) + 
+@@ -168,6 +260,12 @@ class SentencePieceProcessor(object): + def _DecodePiecesAsSerializedProto(self, pieces): + return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProto(self, pieces) + ++ def _DecodeIdsAsImmutableProto(self, ids): ++ return _sentencepiece.SentencePieceProcessor__DecodeIdsAsImmutableProto(self, ids) ++ ++ def _DecodePiecesAsImmutableProto(self, pieces): ++ return _sentencepiece.SentencePieceProcessor__DecodePiecesAsImmutableProto(self, pieces) ++ + def _DecodeIdsBatch(self, ins, num_threads): + return _sentencepiece.SentencePieceProcessor__DecodeIdsBatch(self, ins, num_threads) + +@@ -180,6 +278,9 @@ class SentencePieceProcessor(object): + def _DecodePiecesAsSerializedProtoBatch(self, ins, num_threads): + return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(self, ins, num_threads) + ++ def _DecodePiecesAsImmutableProtoBatch(self, ins, num_threads): ++ return _sentencepiece.SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch(self, ins, num_threads) ++ + def _NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + +@@ -189,17 +290,26 @@ class SentencePieceProcessor(object): + def _NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + +- def _SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): +- return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) ++ def _NBestEncodeAsImmutableProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): ++ return 
_sentencepiece.SentencePieceProcessor__NBestEncodeAsImmutableProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ def _SampleEncodeAndScoreAsIds(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + +- def _SampleEncodeAndScoreAsPieces(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): +- return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsPieces(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) ++ def _SampleEncodeAndScoreAsPieces(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsPieces(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + +- def _CalculateEntropy(self, text, theta): +- return _sentencepiece.SentencePieceProcessor__CalculateEntropy(self, text, theta) ++ def _SampleEncodeAndScoreAsSerializedProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + +- def _CalculateEntropyBatch(self, ins, theta, num_threads): +- return _sentencepiece.SentencePieceProcessor__CalculateEntropyBatch(self, ins, theta, num_threads) ++ def _SampleEncodeAndScoreAsImmutableProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): ++ return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) ++ ++ def 
_CalculateEntropy(self, text, alpha): ++ return _sentencepiece.SentencePieceProcessor__CalculateEntropy(self, text, alpha) ++ ++ def _CalculateEntropyBatch(self, ins, alpha, num_threads): ++ return _sentencepiece.SentencePieceProcessor__CalculateEntropyBatch(self, ins, alpha, num_threads) + + def Init(self, + model_file=None, +@@ -319,9 +429,12 @@ class SentencePieceProcessor(object): + if out_type is str: + return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) +- if out_type == 'proto': ++ if out_type == 'serialized_proto' or out_type == 'proto': + return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'immutable_proto': ++ return self._EncodeAsImmutableProtoBatch(input, num_threads, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) + + if out_type is int: + return self._EncodeAsIds(input, enable_sampling, nbest_size, +@@ -329,9 +442,12 @@ class SentencePieceProcessor(object): + if out_type is str: + return self._EncodeAsPieces(input, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) +- if out_type == 'proto': ++ if out_type == 'serialized_proto' or out_type == 'proto': + return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'immutable_proto': ++ return self._EncodeAsImmutableProto(input, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) + + raise RuntimeError('unknown out_type={}'.format(out_type)) + return None +@@ -346,7 +462,11 @@ class SentencePieceProcessor(object): + + + def EncodeAsSerializedProto(self, input, **kwargs): +- return self.Encode(input=input, out_type='proto', **kwargs) ++ return self.Encode(input=input, out_type='serialized_proto', **kwargs) ++ ++ ++ def 
EncodeAsImmutableProto(self, input, **kwargs): ++ return self.Encode(input=input, out_type='immutable_proto', **kwargs) + + + def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs): +@@ -361,7 +481,12 @@ class SentencePieceProcessor(object): + + def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs): + return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, +- out_type='proto', enable_sampling=True, **kwargs) ++ out_type='serialized_proto', enable_sampling=True, **kwargs) ++ ++ ++ def SampleEncodeAsImmutableProto(self, input, nbest_size=None, alpha=None, **kwargs): ++ return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, ++ out_type='immutable_proto', enable_sampling=True, **kwargs) + + + def NBestEncode(self, +@@ -407,9 +532,12 @@ class SentencePieceProcessor(object): + if out_type is str: + return self._NBestEncodeAsPieces(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) +- if out_type == 'proto': ++ if out_type == 'serialized_proto' or out_type == 'proto': + return self._NBestEncodeAsSerializedProto(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'immutable_proto': ++ return self._NBestEncodeAsImmutableProto(text, nbest_size, ++ add_bos, add_eos, reverse, emit_unk_piece) + + if type(input) is list: + return [_encode(n) for n in input] +@@ -429,7 +557,12 @@ class SentencePieceProcessor(object): + + def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, +- out_type='proto', **kwargs) ++ out_type='serialized_proto', **kwargs) ++ ++ ++ def NBestEncodeAsImmutableProto(self, input, nbest_size=None, **kwargs): ++ return self.NBestEncode(input=input, nbest_size=nbest_size, ++ out_type='immutable_proto', **kwargs) + + + def SampleEncodeAndScore(self, +@@ -440,20 +573,20 @@ class SentencePieceProcessor(object): + reverse=None, + emit_unk_piece=None, + num_samples=None, 
+- theta=None, ++ alpha=None, + wor=None, + include_best=None): + """SampleEncodeAndScore text input to segmented ids or tokens. + + Args: + input: input string. accepsts list of string. +- out_type: output type. int or str or 'proto'. ++ out_type: output type. int or str or 'serialized_proto' or 'immutable_proto' + add_bos: Add to the result (Default = false) + add_eos: Add to the result (Default = false) / is added after reversing (if enabled). + reverse: Reverses the tokenized sequence (Default = false) + emit_unk_piece: Emits the unk literal string (Default = false) + num_samples: How many samples to return (Default = 1) +- theta: inverse temperature for sampling ++ alpha: inverse temperature for sampling + wor: whether to sample without replacement (Default = false) + include_best: whether to include the best tokenization, requires wor=True (Default = false) + """ +@@ -470,8 +603,8 @@ class SentencePieceProcessor(object): + emit_unk_piece = self._emit_unk_piece + if num_samples is None: + num_samples = 1 +- if theta is None: +- theta = 1. ++ if alpha is None: ++ alpha = 1. + if wor is None: + wor = False + if include_best is None: +@@ -486,10 +619,10 @@ class SentencePieceProcessor(object): + + def _encode(text): + if out_type is int: +- return self._SampleEncodeAndScoreAsIds(text, num_samples, theta, wor, include_best, ++ return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best, + add_bos, add_eos, reverse, emit_unk_piece) + else: +- return self._SampleEncodeAndScoreAsPieces(text, num_samples, theta, wor, include_best, ++ return self._SampleEncodeAndScoreAsPieces(text, num_samples, alpha, wor, include_best, + add_bos, add_eos, reverse, emit_unk_piece) + + if type(input) is list: +@@ -502,7 +635,7 @@ class SentencePieceProcessor(object): + """Decode processed id or token sequences. + + Args: +- out_type: output type. str or 'proto' (Default = str) ++ out_type: output type. 
str or 'serialized_proto' or 'immutable_proto' (Default = str) + num_threads: the number of threads used in the batch processin (Default = 1). + """ + +@@ -533,7 +666,7 @@ class SentencePieceProcessor(object): + if type(input[0][0]) is str: + return self._DecodePiecesBatch(input, num_threads) + +- if out_type == 'proto': ++ if out_type == 'serialized_proto': + if type(input) is int: + return self._DecodeIdsAsSerializedProto([input]) + if type(input) is str: +@@ -552,6 +685,25 @@ class SentencePieceProcessor(object): + return self._DecodePiecesAsSerializedProtoBatch(input, num_threads) + + ++ if out_type == 'immutable_proto': ++ if type(input) is int: ++ return self._DecodeIdsAsImmutableProto([input]) ++ if type(input) is str: ++ return self._DecodePiecesAsImmutableProto([input]) ++ ++ if type(input) is list: ++ if len(input) == 0 or type(input[0]) is int: ++ return self._DecodeIdsAsImmutableProto(input) ++ if type(input[0]) is str: ++ return self._DecodePiecesAsImmutableProto(input) ++ ++ if type(input[0]) is list: ++ if len(input[0]) == 0 or type(input[0][0]) is int: ++ return self._DecodeIdsAsImmutableProtoBatch(input, num_threads) ++ if type(input[0][0]) is str: ++ return self._DecodePiecesAsImmutableProtoBatch(input, num_threads) ++ ++ + raise RuntimeError('unknown output or input type') + return None + +@@ -564,24 +716,32 @@ class SentencePieceProcessor(object): + return self.Decode(input=input, out_type=out_type, **kwargs) + + +- def DecodePiecesAsSerializedProto(self, input, out_type='proto', **kwargs): ++ def DecodePiecesAsSerializedProto(self, input, out_type='serialized_proto', **kwargs): ++ return self.Decode(input=input, out_type=out_type, **kwargs) ++ ++ ++ def DecodeIdsAsSerializedProto(self, input, out_type='serialized_proto', **kwargs): ++ return self.Decode(input=input, out_type=out_type, **kwargs) ++ ++ ++ def DecodePiecesAsImmutableProto(self, input, out_type='immutable_proto', **kwargs): + return self.Decode(input=input, out_type=out_type, 
**kwargs) + + +- def DecodeIdsAsSerializedProto(self, input, out_type='proto', **kwargs): ++ def DecodeIdsAsImmutableProto(self, input, out_type='immutable_proto', **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) + + +- def CalculateEntropy(self, input, theta, num_threads=None): ++ def CalculateEntropy(self, input, alpha, num_threads=None): + """Calculate sentence entropy""" + if type(input) is list: + if num_threads is None: + num_threads = self._num_threads + if num_threads is None or type(num_threads) is not int: + raise RuntimeError('num_threads must be int') +- return self._CalculateEntropyBatch(input, theta, num_threads) ++ return self._CalculateEntropyBatch(input, alpha, num_threads) + +- return self._CalculateEntropy(input, theta) ++ return self._CalculateEntropy(input, alpha) + + + def piece_size(self): +diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i +index 40373ce..1e2e1e0 100644 +--- a/python/src/sentencepiece/sentencepiece.i ++++ b/python/src/sentencepiece/sentencepiece.i +@@ -166,7 +166,17 @@ inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, + if (add_bos || add_eos || reverse || emit_unk_piece) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kUnimplemented, +- "add_bos, add_eos, reverse, and emit_unk_piece is not supported in AsSerialize API"); ++ "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API"); ++ } ++} ++ ++inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, ++ sentencepiece::ImmutableSentencePieceText *proto, ++ bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { ++ if (add_bos || add_eos || reverse || emit_unk_piece) { ++ throw sentencepiece::util::Status( ++ sentencepiece::util::StatusCode::kUnimplemented, ++ "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API"); + } + } + +@@ -216,7 +226,7 @@ inline void InitNumThreads(const std::vector 
&ins, int *num_threads) { + + #define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ + std::vector outs(ins.size()); \ +- InitNumThreads(ins, &num_threads); \ ++ InitNumThreads(ins, &num_threads); \ + { \ + ThreadPool pool(ins.size()); \ + for (int n = 0; n < num_threads; ++n) { \ +@@ -237,7 +247,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + + #define DEFINE_DECODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ + std::vector outs(ins.size()); \ +- InitNumThreads(ins, &num_threads); \ ++ InitNumThreads(ins, &num_threads); \ + { \ + ThreadPool pool(ins.size()); \ + for (int n = 0; n < num_threads; ++n) { \ +@@ -264,6 +274,8 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + } + } + ++%apply unsigned int { uint32_t } ++ + %ignore sentencepiece::util::Status; + %ignore sentencepiece::util::StatusCode; + %ignore absl::string_view; +@@ -272,32 +284,48 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + %ignore sentencepiece::NormalizerSpec; + %ignore sentencepiece::TrainerSpec; + %ignore sentencepiece::SentencePieceProcessor::status; ++%ignore sentencepiece::ImmutableSentencePieceText::mutable_proto; ++%ignore sentencepiece::ImmutableSentencePieceText::pieces() const; ++%ignore sentencepiece::ImmutableNBestSentencePieceText::mutable_proto; ++%ignore sentencepiece::ImmutableNBestSentencePieceText::nbests() const; + + %ignore sentencepiece::SentencePieceProcessor::Encode; ++%ignore sentencepiece::SentencePieceProcessor::SampleEncode; ++%ignore sentencepiece::SentencePieceProcessor::NBestEncode; ++%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScore; ++%ignore sentencepiece::SentencePieceProcessor::Decode; ++ + %ignore sentencepiece::SentencePieceProcessor::EncodeAsPieces; + %ignore sentencepiece::SentencePieceProcessor::EncodeAsIds; +-%ignore sentencepiece::SentencePieceProcessor::EncodeAsSerializedProto; +-%ignore sentencepiece::SentencePieceProcessor::SampleEncode; + 
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsIds; + %ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsPieces; +-%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsSerializedProto; +-%ignore sentencepiece::SentencePieceProcessor::NBestEncode; +-%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsPieces; + %ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsIds; +-%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsSerializedProto; +-%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScore; +- +-%ignore sentencepiece::SentencePieceProcessor::Decode; ++%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsPieces; ++%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsIds; ++%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsPieces; + %ignore sentencepiece::SentencePieceProcessor::DecodeIds; + %ignore sentencepiece::SentencePieceProcessor::DecodePieces; ++ ++%ignore sentencepiece::SentencePieceProcessor::EncodeAsSerializedProto; ++%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsSerializedProto; ++%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsSerializedProto; ++%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsSerializedProto; + %ignore sentencepiece::SentencePieceProcessor::DecodePiecesAsSerializedProto; + %ignore sentencepiece::SentencePieceProcessor::DecodeIdsAsSerializedProto; + ++%ignore sentencepiece::SentencePieceProcessor::EncodeAsImmutableProto; ++%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsImmutableProto; ++%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsImmutableProto; ++%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsImmutableProto; ++%ignore sentencepiece::SentencePieceProcessor::DecodePiecesAsImmutableProto; ++%ignore sentencepiece::SentencePieceProcessor::DecodeIdsAsImmutableProto; ++ + %ignore sentencepiece::SentencePieceProcessor::model_proto; + 
%ignore sentencepiece::SentencePieceProcessor::Load; + %ignore sentencepiece::SentencePieceProcessor::LoadOrDie; + %ignore sentencepiece::pretokenizer::PretokenizerForTrainingInterface; + %ignore sentencepiece::SentenceIterator; ++%ignore sentencepiece::ConvertToUnicodeSpans; + %ignore sentencepiece::SentencePieceTrainer::Train; + %ignore sentencepiece::SentencePieceTrainer::GetNormalizerSpec; + %ignore sentencepiece::SentencePieceTrainer::PopulateNormalizerSpec; +@@ -351,6 +379,19 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + return proto; + } + ++ sentencepiece::ImmutableSentencePieceText ++ _EncodeAsImmutableProto(absl::string_view text, ++ bool enable_sampling, ++ int nbest_size, float alpha, ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { ++ auto proto = enable_sampling ? ++ $self->SampleEncodeAsImmutableProto(text, nbest_size, alpha) : ++ $self->EncodeAsImmutableProto(text); ++ RewriteIds(*$self, &proto, add_bos, add_eos, reverse, emit_unk_piece); ++ return proto; ++ } ++ + ///////////////////////////////////////////////////////////////////////////// + // EncodeAs* (Batch request) + std::vector> _EncodeAsIdsBatch( +@@ -381,6 +422,17 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + sentencepiece::util::bytes); + } + ++ std::vector ++ _EncodeAsImmutableProtoBatch( ++ const std::vector &ins, int num_threads, ++ bool enable_sampling, int nbest_size, float alpha, ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { ++ DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsImmutableProto, ++ absl::string_view, ++ sentencepiece::ImmutableSentencePieceText); ++ } ++ + ///////////////////////////////////////////////////////////////////////////// + // DecodeAs* (Single request) + std::string _DecodeIds(const std::vector &ids) const { +@@ -404,6 +456,18 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + return 
$self->DecodePiecesAsSerializedProto(pieces); + } + ++ sentencepiece::ImmutableSentencePieceText _DecodeIdsAsImmutableProto( ++ const std::vector &ids) const { ++ CheckIds(ids, $self->GetPieceSize()); ++ return $self->DecodeIdsAsImmutableProto(ids); ++ } ++ ++ sentencepiece::ImmutableSentencePieceText _DecodePiecesAsImmutableProto( ++ const std::vector &pieces) const { ++ CheckIds(pieces, $self->GetPieceSize()); ++ return $self->DecodePiecesAsImmutableProto(pieces); ++ } ++ + ///////////////////////////////////////////////////////////////////////////// + // DecodeAs* (Batch request) + std::vector _DecodeIdsBatch( +@@ -428,6 +492,13 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + sentencepiece::util::bytes); + } + ++ std::vector ++ _DecodePiecesAsImmutableProtoBatch( ++ const std::vector> &ins, int num_threads) const { ++ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsImmutableProto, std::string, ++ sentencepiece::ImmutableSentencePieceText); ++ } ++ + //////////////////////////////////////////////////////////////////////////// + // NBestEncodeAs* (Single request) + std::vector> +@@ -454,25 +525,37 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + return piecess; + } + +- sentencepiece::util::bytes _NBestEncodeAsSerializedProto(absl::string_view text, +- int nbest_size, +- bool add_bos, bool add_eos, bool reverse, +- bool emit_unk_piece) const { ++ sentencepiece::util::bytes ++ _NBestEncodeAsSerializedProto(absl::string_view text, ++ int nbest_size, ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { + RewriteIds(*$self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); + return $self->NBestEncodeAsSerializedProto(text, nbest_size); + } + ++ sentencepiece::ImmutableNBestSentencePieceText ++ _NBestEncodeAsImmutableProto(absl::string_view text, ++ int nbest_size, ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { ++ RewriteIds(*$self, 
static_cast(nullptr), ++ add_bos, add_eos, reverse, emit_unk_piece); ++ return $self->NBestEncodeAsImmutableProto(text, nbest_size); ++ } ++ ++ + ///////////////////////////////////////////////////////////////////////////// + // SampleEncodeAndScoreAs* (Single request) + std::vector, float>> + _SampleEncodeAndScoreAsIds(absl::string_view text, +- int num_samples, float theta, bool wor, ++ int num_samples, float alpha, bool wor, + bool include_best, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + auto idss = $self->SampleEncodeAndScoreAsIds(text, num_samples, +- theta, wor, include_best); ++ alpha, wor, include_best); + for (auto &ids : idss) { + RewriteIds(*$self, &ids.first, add_bos, add_eos, reverse, emit_unk_piece); + } +@@ -481,25 +564,50 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + + std::vector, float>> + _SampleEncodeAndScoreAsPieces(absl::string_view text, +- int num_samples, float theta, bool wor, ++ int num_samples, float alpha, bool wor, + bool include_best, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + auto piecess = $self->SampleEncodeAndScoreAsPieces(text, num_samples, +- theta, wor, include_best); ++ alpha, wor, include_best); + for (auto &pieces : piecess) { + RewriteIds(*$self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); + } + return piecess; + } + ++ sentencepiece::util::bytes ++ _SampleEncodeAndScoreAsSerializedProto(absl::string_view text, ++ int num_samples, float alpha, bool wor, ++ bool include_best, ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { ++ RewriteIds(*$self, static_cast(nullptr), ++ add_bos, add_eos, reverse, emit_unk_piece); ++ return $self->SampleEncodeAndScoreAsSerializedProto(text, num_samples, ++ alpha, wor, include_best); ++ } ++ ++ sentencepiece::ImmutableNBestSentencePieceText ++ _SampleEncodeAndScoreAsImmutableProto(absl::string_view text, ++ int num_samples, float alpha, bool wor, ++ bool 
include_best, ++ bool add_bos, bool add_eos, bool reverse, ++ bool emit_unk_piece) const { ++ RewriteIds(*$self, static_cast(nullptr), ++ add_bos, add_eos, reverse, emit_unk_piece); ++ return $self->SampleEncodeAndScoreAsImmutableProto(text, num_samples, ++ alpha, wor, include_best); ++ } ++ ++ + // Calculate Entropy +- float _CalculateEntropy(absl::string_view text, float theta) { +- return $self->CalculateEntropy(text, theta); ++ float _CalculateEntropy(absl::string_view text, float alpha) { ++ return $self->CalculateEntropy(text, alpha); + } + + std::vector _CalculateEntropyBatch(const std::vector &ins, +- float theta, int num_threads) { ++ float alpha, int num_threads) { + std::vector outs(ins.size()); + InitNumThreads(ins, &num_threads); + { +@@ -507,7 +615,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + for (int n = 0; n < num_threads; ++n) { + pool.Schedule([&, n]() { + for (size_t i = n; i < ins.size(); i += num_threads) { +- outs[i] = self->CalculateEntropy(ins[i], theta); ++ outs[i] = self->CalculateEntropy(ins[i], alpha); + } + }); + } +@@ -634,9 +742,12 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + if out_type is str: + return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) +- if out_type == 'proto': ++ if out_type == 'serialized_proto' or out_type == 'proto': + return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'immutable_proto': ++ return self._EncodeAsImmutableProtoBatch(input, num_threads, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) + + if out_type is int: + return self._EncodeAsIds(input, enable_sampling, nbest_size, +@@ -644,9 +755,12 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + if out_type is str: + return self._EncodeAsPieces(input, 
enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) +- if out_type == 'proto': ++ if out_type == 'serialized_proto' or out_type == 'proto': + return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'immutable_proto': ++ return self._EncodeAsImmutableProto(input, enable_sampling, nbest_size, ++ alpha, add_bos, add_eos, reverse, emit_unk_piece) + + raise RuntimeError('unknown out_type={}'.format(out_type)) + return None +@@ -661,7 +775,11 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + + + def EncodeAsSerializedProto(self, input, **kwargs): +- return self.Encode(input=input, out_type='proto', **kwargs) ++ return self.Encode(input=input, out_type='serialized_proto', **kwargs) ++ ++ ++ def EncodeAsImmutableProto(self, input, **kwargs): ++ return self.Encode(input=input, out_type='immutable_proto', **kwargs) + + + def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs): +@@ -676,7 +794,12 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + + def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs): + return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, +- out_type='proto', enable_sampling=True, **kwargs) ++ out_type='serialized_proto', enable_sampling=True, **kwargs) ++ ++ ++ def SampleEncodeAsImmutableProto(self, input, nbest_size=None, alpha=None, **kwargs): ++ return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, ++ out_type='immutable_proto', enable_sampling=True, **kwargs) + + + def NBestEncode(self, +@@ -722,9 +845,12 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + if out_type is str: + return self._NBestEncodeAsPieces(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) +- if out_type == 'proto': ++ if out_type == 'serialized_proto' or out_type == 'proto': + return 
self._NBestEncodeAsSerializedProto(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) ++ if out_type == 'immutable_proto': ++ return self._NBestEncodeAsImmutableProto(text, nbest_size, ++ add_bos, add_eos, reverse, emit_unk_piece) + + if type(input) is list: + return [_encode(n) for n in input] +@@ -744,7 +870,12 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + + def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, +- out_type='proto', **kwargs) ++ out_type='serialized_proto', **kwargs) ++ ++ ++ def NBestEncodeAsImmutableProto(self, input, nbest_size=None, **kwargs): ++ return self.NBestEncode(input=input, nbest_size=nbest_size, ++ out_type='immutable_proto', **kwargs) + + + def SampleEncodeAndScore(self, +@@ -755,20 +886,20 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + reverse=None, + emit_unk_piece=None, + num_samples=None, +- theta=None, ++ alpha=None, + wor=None, + include_best=None): + """SampleEncodeAndScore text input to segmented ids or tokens. + + Args: + input: input string. accepsts list of string. +- out_type: output type. int or str or 'proto'. ++ out_type: output type. int or str or 'serialized_proto' or 'immutable_proto' + add_bos: Add to the result (Default = false) + add_eos: Add to the result (Default = false) / is added after reversing (if enabled). 
+ reverse: Reverses the tokenized sequence (Default = false) + emit_unk_piece: Emits the unk literal string (Default = false) + num_samples: How many samples to return (Default = 1) +- theta: inverse temperature for sampling ++ alpha: inverse temperature for sampling + wor: whether to sample without replacement (Default = false) + include_best: whether to include the best tokenization, requires wor=True (Default = false) + """ +@@ -785,8 +916,8 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + emit_unk_piece = self._emit_unk_piece + if num_samples is None: + num_samples = 1 +- if theta is None: +- theta = 1. ++ if alpha is None: ++ alpha = 1. + if wor is None: + wor = False + if include_best is None: +@@ -801,10 +932,10 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + + def _encode(text): + if out_type is int: +- return self._SampleEncodeAndScoreAsIds(text, num_samples, theta, wor, include_best, ++ return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best, + add_bos, add_eos, reverse, emit_unk_piece) + else: +- return self._SampleEncodeAndScoreAsPieces(text, num_samples, theta, wor, include_best, ++ return self._SampleEncodeAndScoreAsPieces(text, num_samples, alpha, wor, include_best, + add_bos, add_eos, reverse, emit_unk_piece) + + if type(input) is list: +@@ -817,7 +948,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + """Decode processed id or token sequences. + + Args: +- out_type: output type. str or 'proto' (Default = str) ++ out_type: output type. str or 'serialized_proto' or 'immutable_proto' (Default = str) + num_threads: the number of threads used in the batch processin (Default = 1). 
+ """ + +@@ -848,7 +979,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + if type(input[0][0]) is str: + return self._DecodePiecesBatch(input, num_threads) + +- if out_type == 'proto': ++ if out_type == 'serialized_proto': + if type(input) is int: + return self._DecodeIdsAsSerializedProto([input]) + if type(input) is str: +@@ -867,6 +998,25 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + return self._DecodePiecesAsSerializedProtoBatch(input, num_threads) + + ++ if out_type == 'immutable_proto': ++ if type(input) is int: ++ return self._DecodeIdsAsImmutableProto([input]) ++ if type(input) is str: ++ return self._DecodePiecesAsImmutableProto([input]) ++ ++ if type(input) is list: ++ if len(input) == 0 or type(input[0]) is int: ++ return self._DecodeIdsAsImmutableProto(input) ++ if type(input[0]) is str: ++ return self._DecodePiecesAsImmutableProto(input) ++ ++ if type(input[0]) is list: ++ if len(input[0]) == 0 or type(input[0][0]) is int: ++ return self._DecodeIdsAsImmutableProtoBatch(input, num_threads) ++ if type(input[0][0]) is str: ++ return self._DecodePiecesAsImmutableProtoBatch(input, num_threads) ++ ++ + raise RuntimeError('unknown output or input type') + return None + +@@ -879,24 +1029,32 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + return self.Decode(input=input, out_type=out_type, **kwargs) + + +- def DecodePiecesAsSerializedProto(self, input, out_type='proto', **kwargs): ++ def DecodePiecesAsSerializedProto(self, input, out_type='serialized_proto', **kwargs): ++ return self.Decode(input=input, out_type=out_type, **kwargs) ++ ++ ++ def DecodeIdsAsSerializedProto(self, input, out_type='serialized_proto', **kwargs): ++ return self.Decode(input=input, out_type=out_type, **kwargs) ++ ++ ++ def DecodePiecesAsImmutableProto(self, input, out_type='immutable_proto', **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) + + +- def 
DecodeIdsAsSerializedProto(self, input, out_type='proto', **kwargs): ++ def DecodeIdsAsImmutableProto(self, input, out_type='immutable_proto', **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) + + +- def CalculateEntropy(self, input, theta, num_threads=None): ++ def CalculateEntropy(self, input, alpha, num_threads=None): + """Calculate sentence entropy""" + if type(input) is list: + if num_threads is None: + num_threads = self._num_threads + if num_threads is None or type(num_threads) is not int: + raise RuntimeError('num_threads must be int') +- return self._CalculateEntropyBatch(input, theta, num_threads) ++ return self._CalculateEntropyBatch(input, alpha, num_threads) + +- return self._CalculateEntropy(input, theta) ++ return self._CalculateEntropy(input, alpha) + + + def piece_size(self): +@@ -1028,6 +1186,50 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + } + } + ++%extend sentencepiece::ImmutableSentencePieceText { ++ ImmutableSentencePieceText_ImmutableSentencePiece pieces(int index) const { ++ if (index < 0 || index >= static_cast($self->pieces_size())) { ++ throw sentencepiece::util::Status( ++ sentencepiece::util::StatusCode::kOutOfRange, ++ "piece index is out of range."); ++ } ++ return $self->pieces(index); ++ } ++ ++%pythoncode { ++ def __len__(self): ++ return self.pieces_size() ++ ++ def __getitem__(self, i): ++ return self.pieces(i) ++ ++ def __eq__(self, other): ++ return self.SerializeAsString() == other.SerializeAsString() ++} ++} ++ ++%extend sentencepiece::ImmutableNBestSentencePieceText { ++ ImmutableSentencePieceText nbests(int index) const { ++ if (index < 0 || index >= static_cast($self->nbests_size())) { ++ throw sentencepiece::util::Status( ++ sentencepiece::util::StatusCode::kOutOfRange, ++ "nbest index is out of range."); ++ } ++ return $self->nbests(index); ++ } ++ ++%pythoncode { ++ def __len__(self): ++ return self.nbests_size() ++ ++ def __getitem__(self, i): ++ return 
self.nbests(i) ++ ++ def __eq__(self, other): ++ return self.SerializeAsString() == other.SerializeAsString() ++} ++} ++ + %typemap(out) std::vector { + $result = PyList_New($1.size()); + for (size_t i = 0; i < $1.size(); ++i) { +@@ -1277,6 +1479,14 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + } + } + ++%typemap(out) std::vector { ++ $result = PyList_New($1.size()); ++ for (size_t i = 0; i < $1.size(); ++i) { ++ PyObject *obj = SWIG_NewPointerObj(new sentencepiece::ImmutableSentencePieceText($1.at(i)), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0); ++ PyList_SET_ITEM($result, i, obj); ++ } ++} ++ + %typemap(in) sentencepiece::SentenceIterator * { + sentencepiece::SentenceIterator *out = nullptr; + if (PyIter_Check($input)) { +@@ -1324,6 +1534,18 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + delete $1; + } + ++%typemap(freearg) sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece { ++ delete $1; ++} ++ ++%typemap(freearg) sentencepiece::ImmutableSentencePieceText { ++ delete $1; ++} ++ ++%typemap(freearg) sentencepiece::ImmutableNBestSentencePieceText { ++ delete $1; ++} ++ + %include + %include + +diff --git a/python/src/sentencepiece/sentencepiece_wrap.cxx b/python/src/sentencepiece/sentencepiece_wrap.cxx +index 36ce38c..9776b0f 100644 +--- a/python/src/sentencepiece/sentencepiece_wrap.cxx ++++ b/python/src/sentencepiece/sentencepiece_wrap.cxx +@@ -2694,17 +2694,20 @@ SWIGINTERN PyObject *SWIG_PyStaticMethod_New(PyObject *SWIGUNUSEDPARM(self), PyO + + #define SWIGTYPE_p_char swig_types[0] + #define SWIGTYPE_p_float swig_types[1] +-#define SWIGTYPE_p_sentencepiece__SentenceIterator swig_types[2] +-#define SWIGTYPE_p_sentencepiece__SentencePieceProcessor swig_types[3] +-#define SWIGTYPE_p_sentencepiece__SentencePieceTrainer swig_types[4] +-#define SWIGTYPE_p_std__string swig_types[5] +-#define SWIGTYPE_p_std__unordered_mapT_std__string_std__string_t swig_types[6] 
+-#define SWIGTYPE_p_std__vectorT_absl__string_view_t swig_types[7] +-#define SWIGTYPE_p_std__vectorT_int_t swig_types[8] +-#define SWIGTYPE_p_std__vectorT_std__vectorT_absl__string_view_t_t swig_types[9] +-#define SWIGTYPE_p_std__vectorT_std__vectorT_int_t_t swig_types[10] +-static swig_type_info *swig_types[12]; +-static swig_module_info swig_module = {swig_types, 11, 0, 0, 0, 0}; ++#define SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText swig_types[2] ++#define SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText swig_types[3] ++#define SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece swig_types[4] ++#define SWIGTYPE_p_sentencepiece__SentenceIterator swig_types[5] ++#define SWIGTYPE_p_sentencepiece__SentencePieceProcessor swig_types[6] ++#define SWIGTYPE_p_sentencepiece__SentencePieceTrainer swig_types[7] ++#define SWIGTYPE_p_std__string swig_types[8] ++#define SWIGTYPE_p_std__unordered_mapT_std__string_std__string_t swig_types[9] ++#define SWIGTYPE_p_std__vectorT_absl__string_view_t swig_types[10] ++#define SWIGTYPE_p_std__vectorT_int_t swig_types[11] ++#define SWIGTYPE_p_std__vectorT_std__vectorT_absl__string_view_t_t swig_types[12] ++#define SWIGTYPE_p_std__vectorT_std__vectorT_int_t_t swig_types[13] ++static swig_type_info *swig_types[15]; ++static swig_module_info swig_module = {swig_types, 14, 0, 0, 0, 0}; + #define SWIG_TypeQuery(name) SWIG_TypeQueryModule(&swig_module, &swig_module, name) + #define SWIG_MangledTypeQuery(name) SWIG_MangledTypeQueryModule(&swig_module, &swig_module, name) + +@@ -2972,7 +2975,17 @@ inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, + if (add_bos || add_eos || reverse || emit_unk_piece) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kUnimplemented, +- "add_bos, add_eos, reverse, and emit_unk_piece is not supported in AsSerialize API"); ++ "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API"); ++ } ++} ++ ++inline void 
RewriteIds(const sentencepiece::SentencePieceProcessor &sp, ++ sentencepiece::ImmutableSentencePieceText *proto, ++ bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { ++ if (add_bos || add_eos || reverse || emit_unk_piece) { ++ throw sentencepiece::util::Status( ++ sentencepiece::util::StatusCode::kUnimplemented, ++ "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API"); + } + } + +@@ -3022,7 +3035,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + + #define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ + std::vector outs(ins.size()); \ +- InitNumThreads(ins, &num_threads); \ ++ InitNumThreads(ins, &num_threads); \ + { \ + ThreadPool pool(ins.size()); \ + for (int n = 0; n < num_threads; ++n) { \ +@@ -3043,7 +3056,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + + #define DEFINE_DECODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ + std::vector outs(ins.size()); \ +- InitNumThreads(ins, &num_threads); \ ++ InitNumThreads(ins, &num_threads); \ + { \ + ThreadPool pool(ins.size()); \ + for (int n = 0; n < num_threads; ++n) { \ +@@ -3060,131 +3073,24 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + } // namespace + + +-SWIGINTERN swig_type_info* +-SWIG_pchar_descriptor(void) ++SWIGINTERNINLINE PyObject* ++ SWIG_From_unsigned_SS_int (unsigned int value) + { +- static int init = 0; +- static swig_type_info* info = 0; +- if (!init) { +- info = SWIG_TypeQuery("_p_char"); +- init = 1; +- } +- return info; ++ return PyInt_FromSize_t((size_t) value); + } + + +-SWIGINTERN int +-SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc) +-{ +-#if PY_VERSION_HEX>=0x03000000 +-#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) +- if (PyBytes_Check(obj)) +-#else +- if (PyUnicode_Check(obj)) +-#endif +-#else +- if (PyString_Check(obj)) +-#endif +- { +- char *cstr; Py_ssize_t len; +- int ret = SWIG_OK; +-#if PY_VERSION_HEX>=0x03000000 +-#if 
!defined(SWIG_PYTHON_STRICT_BYTE_CHAR) +- if (!alloc && cptr) { +- /* We can't allow converting without allocation, since the internal +- representation of string in Python 3 is UCS-2/UCS-4 but we require +- a UTF-8 representation. +- TODO(bhy) More detailed explanation */ +- return SWIG_RuntimeError; +- } +- obj = PyUnicode_AsUTF8String(obj); +- if (!obj) +- return SWIG_TypeError; +- if (alloc) +- *alloc = SWIG_NEWOBJ; +-#endif +- if (PyBytes_AsStringAndSize(obj, &cstr, &len) == -1) +- return SWIG_TypeError; +-#else +- if (PyString_AsStringAndSize(obj, &cstr, &len) == -1) +- return SWIG_TypeError; +-#endif +- if (cptr) { +- if (alloc) { +- if (*alloc == SWIG_NEWOBJ) { +- *cptr = reinterpret_cast< char* >(memcpy(new char[len + 1], cstr, sizeof(char)*(len + 1))); +- *alloc = SWIG_NEWOBJ; +- } else { +- *cptr = cstr; +- *alloc = SWIG_OLDOBJ; +- } +- } else { +-#if PY_VERSION_HEX>=0x03000000 +-#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) +- *cptr = PyBytes_AsString(obj); +-#else +- assert(0); /* Should never reach here with Unicode strings in Python 3 */ +-#endif +-#else +- *cptr = SWIG_Python_str_AsChar(obj); +- if (!*cptr) +- ret = SWIG_TypeError; +-#endif +- } +- } +- if (psize) *psize = len + 1; +-#if PY_VERSION_HEX>=0x03000000 && !defined(SWIG_PYTHON_STRICT_BYTE_CHAR) +- Py_XDECREF(obj); +-#endif +- return ret; +- } else { +-#if defined(SWIG_PYTHON_2_UNICODE) +-#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) +-#error "Cannot use both SWIG_PYTHON_2_UNICODE and SWIG_PYTHON_STRICT_BYTE_CHAR at once" +-#endif +-#if PY_VERSION_HEX<0x03000000 +- if (PyUnicode_Check(obj)) { +- char *cstr; Py_ssize_t len; +- if (!alloc && cptr) { +- return SWIG_RuntimeError; +- } +- obj = PyUnicode_AsUTF8String(obj); +- if (!obj) +- return SWIG_TypeError; +- if (PyString_AsStringAndSize(obj, &cstr, &len) != -1) { +- if (cptr) { +- if (alloc) *alloc = SWIG_NEWOBJ; +- *cptr = reinterpret_cast< char* >(memcpy(new char[len + 1], cstr, sizeof(char)*(len + 1))); +- } +- if (psize) *psize = len + 1; ++ 
#define SWIG_From_long PyInt_FromLong + +- Py_XDECREF(obj); +- return SWIG_OK; +- } else { +- Py_XDECREF(obj); +- } +- } +-#endif +-#endif + +- swig_type_info* pchar_descriptor = SWIG_pchar_descriptor(); +- if (pchar_descriptor) { +- void* vptr = 0; +- if (SWIG_ConvertPtr(obj, &vptr, pchar_descriptor, 0) == SWIG_OK) { +- if (cptr) *cptr = (char *) vptr; +- if (psize) *psize = vptr ? (strlen((char *)vptr) + 1) : 0; +- if (alloc) *alloc = SWIG_OLDOBJ; +- return SWIG_OK; +- } +- } +- } +- return SWIG_TypeError; ++SWIGINTERNINLINE PyObject* ++SWIG_From_unsigned_SS_long (unsigned long value) ++{ ++ return (value > LONG_MAX) ? ++ PyLong_FromUnsignedLong(value) : PyInt_FromLong(static_cast< long >(value)); + } + + +- +- +- + #include + #if !defined(SWIG_NO_LLONG_MAX) + # if !defined(LLONG_MAX) && defined(__GNUC__) && defined (__LONG_LONG_MAX__) +@@ -3195,6 +3101,47 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc) + #endif + + ++#if defined(LLONG_MAX) && !defined(SWIG_LONG_LONG_AVAILABLE) ++# define SWIG_LONG_LONG_AVAILABLE ++#endif ++ ++ ++#ifdef SWIG_LONG_LONG_AVAILABLE ++SWIGINTERNINLINE PyObject* ++SWIG_From_unsigned_SS_long_SS_long (unsigned long long value) ++{ ++ return (value > LONG_MAX) ? 
++ PyLong_FromUnsignedLongLong(value) : PyInt_FromLong(static_cast< long >(value)); ++} ++#endif ++ ++ ++SWIGINTERNINLINE PyObject * ++SWIG_From_size_t (size_t value) ++{ ++#ifdef SWIG_LONG_LONG_AVAILABLE ++ if (sizeof(size_t) <= sizeof(unsigned long)) { ++#endif ++ return SWIG_From_unsigned_SS_long (static_cast< unsigned long >(value)); ++#ifdef SWIG_LONG_LONG_AVAILABLE ++ } else { ++ /* assume sizeof(size_t) <= sizeof(unsigned long long) */ ++ return SWIG_From_unsigned_SS_long_SS_long (static_cast< unsigned long long >(value)); ++ } ++#endif ++} ++ ++ ++ #define SWIG_From_double PyFloat_FromDouble ++ ++ ++SWIGINTERNINLINE PyObject * ++SWIG_From_float (float value) ++{ ++ return SWIG_From_double (value); ++} ++ ++ + SWIGINTERN int + SWIG_AsVal_double (PyObject *obj, double *val) + { +@@ -3335,98 +3282,215 @@ SWIG_AsVal_int (PyObject * obj, int *val) + return res; + } + +- +-/* Getting isfinite working pre C99 across multiple platforms is non-trivial. Users can provide SWIG_isfinite on older platforms. */ +-#ifndef SWIG_isfinite +-/* isfinite() is a macro for C99 */ +-# if defined(isfinite) +-# define SWIG_isfinite(X) (isfinite(X)) +-# elif defined(__cplusplus) && __cplusplus >= 201103L +-/* Use a template so that this works whether isfinite() is std::isfinite() or +- * in the global namespace. The reality seems to vary between compiler +- * versions. +- * +- * Make sure namespace std exists to avoid compiler warnings. 
+- * +- * extern "C++" is required as this fragment can end up inside an extern "C" { } block +- */ +-namespace std { } +-extern "C++" template +-inline int SWIG_isfinite_func(T x) { +- using namespace std; +- return isfinite(x); +-} +-# define SWIG_isfinite(X) (SWIG_isfinite_func(X)) +-# elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +-# define SWIG_isfinite(X) (__builtin_isfinite(X)) +-# elif defined(__clang__) && defined(__has_builtin) +-# if __has_builtin(__builtin_isfinite) +-# define SWIG_isfinite(X) (__builtin_isfinite(X)) +-# endif +-# elif defined(_MSC_VER) +-# define SWIG_isfinite(X) (_finite(X)) +-# elif defined(__sun) && defined(__SVR4) +-# include +-# define SWIG_isfinite(X) (finite(X)) +-# endif +-#endif +- +- +-/* Accept infinite as a valid float value unless we are unable to check if a value is finite */ +-#ifdef SWIG_isfinite +-# define SWIG_Float_Overflow_Check(X) ((X < -FLT_MAX || X > FLT_MAX) && SWIG_isfinite(X)) +-#else +-# define SWIG_Float_Overflow_Check(X) ((X < -FLT_MAX || X > FLT_MAX)) +-#endif +- +- +-SWIGINTERN int +-SWIG_AsVal_float (PyObject * obj, float *val) +-{ +- double v; +- int res = SWIG_AsVal_double (obj, &v); +- if (SWIG_IsOK(res)) { +- if (SWIG_Float_Overflow_Check(v)) { +- return SWIG_OverflowError; +- } else { +- if (val) *val = static_cast< float >(v); ++SWIGINTERN sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece sentencepiece_ImmutableSentencePieceText_pieces(sentencepiece::ImmutableSentencePieceText const *self,int index){ ++ if (index < 0 || index >= static_cast(self->pieces_size())) { ++ throw sentencepiece::util::Status( ++ sentencepiece::util::StatusCode::kOutOfRange, ++ "piece index is out of range."); + } +- } +- return res; +-} +- ++ return self->pieces(index); ++ } ++SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_ImmutableNBestSentencePieceText_nbests(sentencepiece::ImmutableNBestSentencePieceText const *self,int index){ ++ if (index < 0 || 
index >= static_cast(self->nbests_size())) { ++ throw sentencepiece::util::Status( ++ sentencepiece::util::StatusCode::kOutOfRange, ++ "nbest index is out of range."); ++ } ++ return self->nbests(index); ++ } + +-SWIGINTERN int +-SWIG_AsVal_bool (PyObject *obj, bool *val) ++SWIGINTERN swig_type_info* ++SWIG_pchar_descriptor(void) + { +- int r; +- if (!PyBool_Check(obj)) +- return SWIG_ERROR; +- r = PyObject_IsTrue(obj); +- if (r == -1) +- return SWIG_ERROR; +- if (val) *val = r ? true : false; +- return SWIG_OK; +-} +- +- +- #define SWIG_From_double PyFloat_FromDouble +- +- +-SWIGINTERNINLINE PyObject * +-SWIG_From_float (float value) +-{ +- return SWIG_From_double (value); ++ static int init = 0; ++ static swig_type_info* info = 0; ++ if (!init) { ++ info = SWIG_TypeQuery("_p_char"); ++ init = 1; ++ } ++ return info; + } + + +-SWIGINTERNINLINE PyObject* +- SWIG_From_int (int value) ++SWIGINTERN int ++SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc) + { +- return PyInt_FromLong((long) value); +-} +- +- ++#if PY_VERSION_HEX>=0x03000000 ++#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) ++ if (PyBytes_Check(obj)) ++#else ++ if (PyUnicode_Check(obj)) ++#endif ++#else ++ if (PyString_Check(obj)) ++#endif ++ { ++ char *cstr; Py_ssize_t len; ++ int ret = SWIG_OK; ++#if PY_VERSION_HEX>=0x03000000 ++#if !defined(SWIG_PYTHON_STRICT_BYTE_CHAR) ++ if (!alloc && cptr) { ++ /* We can't allow converting without allocation, since the internal ++ representation of string in Python 3 is UCS-2/UCS-4 but we require ++ a UTF-8 representation. 
++ TODO(bhy) More detailed explanation */ ++ return SWIG_RuntimeError; ++ } ++ obj = PyUnicode_AsUTF8String(obj); ++ if (!obj) ++ return SWIG_TypeError; ++ if (alloc) ++ *alloc = SWIG_NEWOBJ; ++#endif ++ if (PyBytes_AsStringAndSize(obj, &cstr, &len) == -1) ++ return SWIG_TypeError; ++#else ++ if (PyString_AsStringAndSize(obj, &cstr, &len) == -1) ++ return SWIG_TypeError; ++#endif ++ if (cptr) { ++ if (alloc) { ++ if (*alloc == SWIG_NEWOBJ) { ++ *cptr = reinterpret_cast< char* >(memcpy(new char[len + 1], cstr, sizeof(char)*(len + 1))); ++ *alloc = SWIG_NEWOBJ; ++ } else { ++ *cptr = cstr; ++ *alloc = SWIG_OLDOBJ; ++ } ++ } else { ++#if PY_VERSION_HEX>=0x03000000 ++#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) ++ *cptr = PyBytes_AsString(obj); ++#else ++ assert(0); /* Should never reach here with Unicode strings in Python 3 */ ++#endif ++#else ++ *cptr = SWIG_Python_str_AsChar(obj); ++ if (!*cptr) ++ ret = SWIG_TypeError; ++#endif ++ } ++ } ++ if (psize) *psize = len + 1; ++#if PY_VERSION_HEX>=0x03000000 && !defined(SWIG_PYTHON_STRICT_BYTE_CHAR) ++ Py_XDECREF(obj); ++#endif ++ return ret; ++ } else { ++#if defined(SWIG_PYTHON_2_UNICODE) ++#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) ++#error "Cannot use both SWIG_PYTHON_2_UNICODE and SWIG_PYTHON_STRICT_BYTE_CHAR at once" ++#endif ++#if PY_VERSION_HEX<0x03000000 ++ if (PyUnicode_Check(obj)) { ++ char *cstr; Py_ssize_t len; ++ if (!alloc && cptr) { ++ return SWIG_RuntimeError; ++ } ++ obj = PyUnicode_AsUTF8String(obj); ++ if (!obj) ++ return SWIG_TypeError; ++ if (PyString_AsStringAndSize(obj, &cstr, &len) != -1) { ++ if (cptr) { ++ if (alloc) *alloc = SWIG_NEWOBJ; ++ *cptr = reinterpret_cast< char* >(memcpy(new char[len + 1], cstr, sizeof(char)*(len + 1))); ++ } ++ if (psize) *psize = len + 1; ++ ++ Py_XDECREF(obj); ++ return SWIG_OK; ++ } else { ++ Py_XDECREF(obj); ++ } ++ } ++#endif ++#endif ++ ++ swig_type_info* pchar_descriptor = SWIG_pchar_descriptor(); ++ if (pchar_descriptor) { ++ void* vptr = 0; ++ if 
(SWIG_ConvertPtr(obj, &vptr, pchar_descriptor, 0) == SWIG_OK) { ++ if (cptr) *cptr = (char *) vptr; ++ if (psize) *psize = vptr ? (strlen((char *)vptr) + 1) : 0; ++ if (alloc) *alloc = SWIG_OLDOBJ; ++ return SWIG_OK; ++ } ++ } ++ } ++ return SWIG_TypeError; ++} ++ ++ ++ ++ ++ ++/* Getting isfinite working pre C99 across multiple platforms is non-trivial. Users can provide SWIG_isfinite on older platforms. */ ++#ifndef SWIG_isfinite ++/* isfinite() is a macro for C99 */ ++# if defined(isfinite) ++# define SWIG_isfinite(X) (isfinite(X)) ++# elif defined(__cplusplus) && __cplusplus >= 201103L ++/* Use a template so that this works whether isfinite() is std::isfinite() or ++ * in the global namespace. The reality seems to vary between compiler ++ * versions. ++ * ++ * Make sure namespace std exists to avoid compiler warnings. ++ * ++ * extern "C++" is required as this fragment can end up inside an extern "C" { } block ++ */ ++namespace std { } ++extern "C++" template ++inline int SWIG_isfinite_func(T x) { ++ using namespace std; ++ return isfinite(x); ++} ++# define SWIG_isfinite(X) (SWIG_isfinite_func(X)) ++# elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) ++# define SWIG_isfinite(X) (__builtin_isfinite(X)) ++# elif defined(__clang__) && defined(__has_builtin) ++# if __has_builtin(__builtin_isfinite) ++# define SWIG_isfinite(X) (__builtin_isfinite(X)) ++# endif ++# elif defined(_MSC_VER) ++# define SWIG_isfinite(X) (_finite(X)) ++# elif defined(__sun) && defined(__SVR4) ++# include ++# define SWIG_isfinite(X) (finite(X)) ++# endif ++#endif ++ ++ ++/* Accept infinite as a valid float value unless we are unable to check if a value is finite */ ++#ifdef SWIG_isfinite ++# define SWIG_Float_Overflow_Check(X) ((X < -FLT_MAX || X > FLT_MAX) && SWIG_isfinite(X)) ++#else ++# define SWIG_Float_Overflow_Check(X) ((X < -FLT_MAX || X > FLT_MAX)) ++#endif ++ ++ ++SWIGINTERN int ++SWIG_AsVal_float (PyObject * obj, float *val) ++{ ++ double v; ++ 
int res = SWIG_AsVal_double (obj, &v); ++ if (SWIG_IsOK(res)) { ++ if (SWIG_Float_Overflow_Check(v)) { ++ return SWIG_OverflowError; ++ } else { ++ if (val) *val = static_cast< float >(v); ++ } ++ } ++ return res; ++} ++ ++ ++SWIGINTERNINLINE PyObject* ++ SWIG_From_int (int value) ++{ ++ return PyInt_FromLong((long) value); ++} ++ ++ + SWIGINTERNINLINE PyObject* + SWIG_From_bool (bool value) + { +@@ -3436,6 +3500,20 @@ SWIGINTERNINLINE PyObject* + SWIGINTERN sentencepiece::util::Status sentencepiece_SentencePieceProcessor_LoadFromFile(sentencepiece::SentencePieceProcessor *self,absl::string_view arg){ + return self->Load(arg); + } ++ ++SWIGINTERN int ++SWIG_AsVal_bool (PyObject *obj, bool *val) ++{ ++ int r; ++ if (!PyBool_Check(obj)) ++ return SWIG_ERROR; ++ r = PyObject_IsTrue(obj); ++ if (r == -1) ++ return SWIG_ERROR; ++ if (val) *val = r ? true : false; ++ return SWIG_OK; ++} ++ + SWIGINTERN std::vector< int > sentencepiece_SentencePieceProcessor__EncodeAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto ids = enable_sampling ? + self->SampleEncodeAsIds(text, nbest_size, alpha) : +@@ -3457,6 +3535,13 @@ SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__Enco + RewriteIds(*self, &proto, add_bos, add_eos, reverse, emit_unk_piece); + return proto; + } ++SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_SentencePieceProcessor__EncodeAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ auto proto = enable_sampling ? 
++ self->SampleEncodeAsImmutableProto(text, nbest_size, alpha) : ++ self->EncodeAsImmutableProto(text); ++ RewriteIds(*self, &proto, add_bos, add_eos, reverse, emit_unk_piece); ++ return proto; ++ } + SWIGINTERN std::vector< std::vector< int > > sentencepiece_SentencePieceProcessor__EncodeAsIdsBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &ins,int num_threads,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsIds, + absl::string_view, std::vector); +@@ -3470,6 +3555,11 @@ SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__EncodeAsSerializedPr + absl::string_view, + sentencepiece::util::bytes); + } ++SWIGINTERN std::vector< sentencepiece::ImmutableSentencePieceText > sentencepiece_SentencePieceProcessor__EncodeAsImmutableProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &ins,int num_threads,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsImmutableProto, ++ absl::string_view, ++ sentencepiece::ImmutableSentencePieceText); ++ } + SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodeIds(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ + CheckIds(ids, self->GetPieceSize()); + return self->DecodeIds(ids); +@@ -3485,6 +3575,14 @@ SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__Deco + CheckIds(pieces, self->GetPieceSize()); + return self->DecodePiecesAsSerializedProto(pieces); + } ++SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_SentencePieceProcessor__DecodeIdsAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ ++ CheckIds(ids, self->GetPieceSize()); ++ return self->DecodeIdsAsImmutableProto(ids); ++ } ++SWIGINTERN 
sentencepiece::ImmutableSentencePieceText sentencepiece_SentencePieceProcessor__DecodePiecesAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &pieces){ ++ CheckIds(pieces, self->GetPieceSize()); ++ return self->DecodePiecesAsImmutableProto(pieces); ++ } + SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodeIdsBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< int > > const &ins,int num_threads){ + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string); + } +@@ -3499,6 +3597,10 @@ SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodePiecesAsSerial + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string, + sentencepiece::util::bytes); + } ++SWIGINTERN std::vector< sentencepiece::ImmutableSentencePieceText > sentencepiece_SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< absl::string_view > > const &ins,int num_threads){ ++ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsImmutableProto, std::string, ++ sentencepiece::ImmutableSentencePieceText); ++ } + SWIGINTERN std::vector< std::vector< int > > sentencepiece_SentencePieceProcessor__NBestEncodeAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto idss = self->NBestEncodeAsIds(text, nbest_size); + for (auto &ids : idss) { +@@ -3518,26 +3620,43 @@ SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__NBes + add_bos, add_eos, reverse, emit_unk_piece); + return self->NBestEncodeAsSerializedProto(text, nbest_size); + } +-SWIGINTERN std::vector< std::pair< std::vector< int >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float theta,bool wor,bool 
include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++SWIGINTERN sentencepiece::ImmutableNBestSentencePieceText sentencepiece_SentencePieceProcessor__NBestEncodeAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ RewriteIds(*self, static_cast(nullptr), ++ add_bos, add_eos, reverse, emit_unk_piece); ++ return self->NBestEncodeAsImmutableProto(text, nbest_size); ++ } ++SWIGINTERN std::vector< std::pair< std::vector< int >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto idss = self->SampleEncodeAndScoreAsIds(text, num_samples, +- theta, wor, include_best); ++ alpha, wor, include_best); + for (auto &ids : idss) { + RewriteIds(*self, &ids.first, add_bos, add_eos, reverse, emit_unk_piece); + } + return idss; + } +-SWIGINTERN std::vector< std::pair< std::vector< std::string >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float theta,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++SWIGINTERN std::vector< std::pair< std::vector< std::string >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto piecess = self->SampleEncodeAndScoreAsPieces(text, num_samples, +- theta, wor, include_best); ++ alpha, wor, include_best); + for (auto &pieces : piecess) { + RewriteIds(*self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); + } + return 
piecess; + } +-SWIGINTERN float sentencepiece_SentencePieceProcessor__CalculateEntropy(sentencepiece::SentencePieceProcessor *self,absl::string_view text,float theta){ +- return self->CalculateEntropy(text, theta); ++SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ RewriteIds(*self, static_cast(nullptr), ++ add_bos, add_eos, reverse, emit_unk_piece); ++ return self->SampleEncodeAndScoreAsSerializedProto(text, num_samples, ++ alpha, wor, include_best); ++ } ++SWIGINTERN sentencepiece::ImmutableNBestSentencePieceText sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ ++ RewriteIds(*self, static_cast(nullptr), ++ add_bos, add_eos, reverse, emit_unk_piece); ++ return self->SampleEncodeAndScoreAsImmutableProto(text, num_samples, ++ alpha, wor, include_best); ++ } ++SWIGINTERN float sentencepiece_SentencePieceProcessor__CalculateEntropy(sentencepiece::SentencePieceProcessor *self,absl::string_view text,float alpha){ ++ return self->CalculateEntropy(text, alpha); + } +-SWIGINTERN std::vector< float > sentencepiece_SentencePieceProcessor__CalculateEntropyBatch(sentencepiece::SentencePieceProcessor *self,std::vector< absl::string_view > const &ins,float theta,int num_threads){ ++SWIGINTERN std::vector< float > sentencepiece_SentencePieceProcessor__CalculateEntropyBatch(sentencepiece::SentencePieceProcessor *self,std::vector< absl::string_view > const &ins,float alpha,int num_threads){ + std::vector outs(ins.size()); + InitNumThreads(ins, &num_threads); + { +@@ -3545,7 +3664,7 @@ SWIGINTERN std::vector< float > 
sentencepiece_SentencePieceProcessor__CalculateE + for (int n = 0; n < num_threads; ++n) { + pool.Schedule([&, n]() { + for (size_t i = n; i < ins.size(); i += num_threads) { +- outs[i] = self->CalculateEntropy(ins[i], theta); ++ outs[i] = self->CalculateEntropy(ins[i], alpha); + } + }); + } +@@ -3596,56 +3715,672 @@ SWIG_AsVal_unsigned_SS_long (PyObject *obj, unsigned long *val) + } + } + } +-#endif +- return SWIG_TypeError; ++#endif ++ return SWIG_TypeError; ++} ++ ++ ++SWIGINTERN int ++SWIG_AsVal_unsigned_SS_int (PyObject * obj, unsigned int *val) ++{ ++ unsigned long v; ++ int res = SWIG_AsVal_unsigned_SS_long (obj, &v); ++ if (SWIG_IsOK(res)) { ++ if ((v > UINT_MAX)) { ++ return SWIG_OverflowError; ++ } else { ++ if (val) *val = static_cast< unsigned int >(v); ++ } ++ } ++ return res; ++} ++ ++SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromString(absl::string_view arg){ ++ const auto _status = sentencepiece::SentencePieceTrainer::Train(arg); ++ if (!_status.ok()) throw _status; ++ return; ++ } ++SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromMap(std::unordered_map< std::string,std::string > const &args){ ++ const auto _status = sentencepiece::SentencePieceTrainer::Train(args); ++ if (!_status.ok()) throw _status; ++ return; ++ } ++SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromMap2(std::unordered_map< std::string,std::string > const &args,sentencepiece::SentenceIterator *iter){ ++ const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter); ++ if (!_status.ok()) throw _status; ++ return; ++ } ++SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceTrainer__TrainFromMap3(std::unordered_map< std::string,std::string > const &args){ ++ sentencepiece::util::bytes model_proto; ++ const auto _status = sentencepiece::SentencePieceTrainer::Train(args, nullptr, &model_proto); ++ if (!_status.ok()) throw _status; ++ return model_proto; ++ } ++SWIGINTERN sentencepiece::util::bytes 
sentencepiece_SentencePieceTrainer__TrainFromMap4(std::unordered_map< std::string,std::string > const &args,sentencepiece::SentenceIterator *iter){ ++ sentencepiece::util::bytes model_proto; ++ const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter, &model_proto); ++ if (!_status.ok()) throw _status; ++ return model_proto; ++ } ++#ifdef __cplusplus ++extern "C" { ++#endif ++SWIGINTERN PyObject *_wrap_new_ImmutableSentencePieceText_ImmutableSentencePiece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *result = 0 ; ++ ++ if (!SWIG_Python_UnpackTuple(args, "new_ImmutableSentencePieceText_ImmutableSentencePiece", 0, 0, 0)) SWIG_fail; ++ { ++ try { ++ result = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *)new sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, SWIG_POINTER_NEW | 0 ); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_delete_ImmutableSentencePieceText_ImmutableSentencePiece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, SWIG_POINTER_DISOWN | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" 
"delete_ImmutableSentencePieceText_ImmutableSentencePiece" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); ++ { ++ try { ++ delete arg1; ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_Py_Void(); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_piece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ std::string *result = 0 ; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_piece" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); ++ { ++ try { ++ result = (std::string *) &((sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *)arg1)->piece(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ { ++ PyObject *input_type = resultobj; ++ resultobj = MakePyOutputString(*result, input_type); ++ } ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ 
++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_surface(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ std::string *result = 0 ; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_surface" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); ++ { ++ try { ++ result = (std::string *) &((sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *)arg1)->surface(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ { ++ PyObject *input_type = resultobj; ++ resultobj = MakePyOutputString(*result, input_type); ++ } ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ uint32_t result; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 
0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_id" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); ++ { ++ try { ++ result = ((sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *)arg1)->id(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_From_unsigned_SS_int(static_cast< unsigned int >(result)); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ uint32_t result; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_begin" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); ++ { ++ try { ++ result = ((sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *)arg1)->begin(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), 
status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_From_unsigned_SS_int(static_cast< unsigned int >(result)); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ uint32_t result; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_end" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); ++ { ++ try { ++ result = ((sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *)arg1)->end(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_From_unsigned_SS_int(static_cast< unsigned int >(result)); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *ImmutableSentencePieceText_ImmutableSentencePiece_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *obj; ++ if (!SWIG_Python_UnpackTuple(args, "swigregister", 1, 1, &obj)) return NULL; ++ SWIG_TypeNewClientData(SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, SWIG_NewClientData(obj)); ++ return SWIG_Py_Void(); ++} ++ ++SWIGINTERN PyObject 
*ImmutableSentencePieceText_ImmutableSentencePiece_swiginit(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ return SWIG_Python_InitShadowInstance(args); ++} ++ ++SWIGINTERN PyObject *_wrap_new_ImmutableSentencePieceText(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText *result = 0 ; ++ ++ if (!SWIG_Python_UnpackTuple(args, "new_ImmutableSentencePieceText", 0, 0, 0)) SWIG_fail; ++ { ++ try { ++ result = (sentencepiece::ImmutableSentencePieceText *)new sentencepiece::ImmutableSentencePieceText(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_NEW | 0 ); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_delete_ImmutableSentencePieceText(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_DISOWN | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_ImmutableSentencePieceText" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); ++ { ++ try { ++ delete arg1; ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_Py_Void(); ++ return resultobj; ++fail: ++ 
return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_pieces_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ size_t result; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_pieces_size" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); ++ { ++ try { ++ result = ((sentencepiece::ImmutableSentencePieceText const *)arg1)->pieces_size(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_From_size_t(static_cast< size_t >(result)); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_text(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ std::string *result = 0 ; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_text" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * 
>(argp1); ++ { ++ try { ++ result = (std::string *) &((sentencepiece::ImmutableSentencePieceText const *)arg1)->text(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ { ++ PyObject *input_type = resultobj; ++ resultobj = MakePyOutputString(*result, input_type); ++ } ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_score(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ float result; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_score" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); ++ { ++ try { ++ result = (float)((sentencepiece::ImmutableSentencePieceText const *)arg1)->score(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_From_float(static_cast< float >(result)); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_SerializeAsString(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ sentencepiece::util::bytes result; ++ ++ 
if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_SerializeAsString" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); ++ { ++ try { ++ result = ((sentencepiece::ImmutableSentencePieceText const *)arg1)->SerializeAsString(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ { ++ resultobj = MakePyOutputBytes(result); ++ } ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_pieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; ++ int arg2 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ int val2 ; ++ int ecode2 = 0 ; ++ PyObject *swig_obj[2] ; ++ sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "ImmutableSentencePieceText_pieces", 2, 2, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_pieces" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); ++ ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); ++ if (!SWIG_IsOK(ecode2)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableSentencePieceText_pieces" "', argument " "2"" of type 
'" "int""'"); ++ } ++ arg2 = static_cast< int >(val2); ++ { ++ try { ++ result = sentencepiece_ImmutableSentencePieceText_pieces((sentencepiece::ImmutableSentencePieceText const *)arg1,arg2); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece(static_cast< const sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, SWIG_POINTER_OWN | 0 ); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *ImmutableSentencePieceText_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *obj; ++ if (!SWIG_Python_UnpackTuple(args, "swigregister", 1, 1, &obj)) return NULL; ++ SWIG_TypeNewClientData(SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_NewClientData(obj)); ++ return SWIG_Py_Void(); ++} ++ ++SWIGINTERN PyObject *ImmutableSentencePieceText_swiginit(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ return SWIG_Python_InitShadowInstance(args); ++} ++ ++SWIGINTERN PyObject *_wrap_new_ImmutableNBestSentencePieceText(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableNBestSentencePieceText *result = 0 ; ++ ++ if (!SWIG_Python_UnpackTuple(args, "new_ImmutableNBestSentencePieceText", 0, 0, 0)) SWIG_fail; ++ { ++ try { ++ result = (sentencepiece::ImmutableNBestSentencePieceText *)new sentencepiece::ImmutableNBestSentencePieceText(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, SWIG_POINTER_NEW | 0 
); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_delete_ImmutableNBestSentencePieceText(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, SWIG_POINTER_DISOWN | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_ImmutableNBestSentencePieceText" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); ++ { ++ try { ++ delete arg1; ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_Py_Void(); ++ return resultobj; ++fail: ++ return NULL; + } + + +-SWIGINTERN int +-SWIG_AsVal_unsigned_SS_int (PyObject * obj, unsigned int *val) +-{ +- unsigned long v; +- int res = SWIG_AsVal_unsigned_SS_long (obj, &v); +- if (SWIG_IsOK(res)) { +- if ((v > UINT_MAX)) { +- return SWIG_OverflowError; +- } else { +- if (val) *val = static_cast< unsigned int >(v); ++SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_nbests_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ size_t result; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, 0 | 0 ); ++ if 
(!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText_nbests_size" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); ++ { ++ try { ++ result = ((sentencepiece::ImmutableNBestSentencePieceText const *)arg1)->nbests_size(); ++ ReleaseResultObject(resultobj); + } +- } +- return res; ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_From_size_t(static_cast< size_t >(result)); ++ return resultobj; ++fail: ++ return NULL; + } + +-SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromString(absl::string_view arg){ +- const auto _status = sentencepiece::SentencePieceTrainer::Train(arg); +- if (!_status.ok()) throw _status; +- return; ++ ++SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_SerializeAsString(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[1] ; ++ sentencepiece::util::bytes result; ++ ++ if (!args) SWIG_fail; ++ swig_obj[0] = args; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText_SerializeAsString" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); + } +-SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromMap(std::unordered_map< std::string,std::string > const &args){ +- const auto _status = sentencepiece::SentencePieceTrainer::Train(args); +- if (!_status.ok()) throw _status; +- return; ++ arg1 = reinterpret_cast< 
sentencepiece::ImmutableNBestSentencePieceText * >(argp1); ++ { ++ try { ++ result = ((sentencepiece::ImmutableNBestSentencePieceText const *)arg1)->SerializeAsString(); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } + } +-SWIGINTERN void sentencepiece_SentencePieceTrainer__TrainFromMap2(std::unordered_map< std::string,std::string > const &args,sentencepiece::SentenceIterator *iter){ +- const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter); +- if (!_status.ok()) throw _status; +- return; ++ { ++ resultobj = MakePyOutputBytes(result); + } +-SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceTrainer__TrainFromMap3(std::unordered_map< std::string,std::string > const &args){ +- sentencepiece::util::bytes model_proto; +- const auto _status = sentencepiece::SentencePieceTrainer::Train(args, nullptr, &model_proto); +- if (!_status.ok()) throw _status; +- return model_proto; ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_nbests(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; ++ int arg2 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ int val2 ; ++ int ecode2 = 0 ; ++ PyObject *swig_obj[2] ; ++ sentencepiece::ImmutableSentencePieceText result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "ImmutableNBestSentencePieceText_nbests", 2, 2, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText_nbests" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); + } +-SWIGINTERN 
sentencepiece::util::bytes sentencepiece_SentencePieceTrainer__TrainFromMap4(std::unordered_map< std::string,std::string > const &args,sentencepiece::SentenceIterator *iter){ +- sentencepiece::util::bytes model_proto; +- const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter, &model_proto); +- if (!_status.ok()) throw _status; +- return model_proto; ++ arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); ++ ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); ++ if (!SWIG_IsOK(ecode2)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableNBestSentencePieceText_nbests" "', argument " "2"" of type '" "int""'"); ++ } ++ arg2 = static_cast< int >(val2); ++ { ++ try { ++ result = sentencepiece_ImmutableNBestSentencePieceText_nbests((sentencepiece::ImmutableNBestSentencePieceText const *)arg1,arg2); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } + } +-#ifdef __cplusplus +-extern "C" { +-#endif ++ resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText(static_cast< const sentencepiece::ImmutableSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0 ); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *ImmutableNBestSentencePieceText_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *obj; ++ if (!SWIG_Python_UnpackTuple(args, "swigregister", 1, 1, &obj)) return NULL; ++ SWIG_TypeNewClientData(SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, SWIG_NewClientData(obj)); ++ return SWIG_Py_Void(); ++} ++ ++SWIGINTERN PyObject *ImmutableNBestSentencePieceText_swiginit(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ return SWIG_Python_InitShadowInstance(args); ++} ++ + SWIGINTERN PyObject *_wrap_new_SentencePieceProcessor(PyObject *SWIGUNUSEDPARM(self), 
PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *result = 0 ; +@@ -3992,165 +4727,16 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy__SWIG_0(PyObj + float *arg4 = (float *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; +- float val3 ; +- int ecode3 = 0 ; +- void *argp4 = 0 ; +- int res4 = 0 ; +- sentencepiece::util::Status result; +- +- if ((nobjs < 4) || (nobjs > 4)) SWIG_fail; +- res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); +- if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); +- } +- arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- arg2 = ustring.str(); +- } +- ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); +- if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "3"" of type '" "float""'"); +- } +- arg3 = static_cast< float >(val3); +- res4 = SWIG_ConvertPtr(swig_obj[3], &argp4,SWIGTYPE_p_float, 0 | 0 ); +- if (!SWIG_IsOK(res4)) { +- SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "4"" of type '" "float *""'"); +- } +- arg4 = reinterpret_cast< float * >(argp4); +- { +- try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->CalculateEntropy(arg2,arg3,arg4); +- ReleaseResultObject(resultobj); +- } +- catch (const sentencepiece::util::Status &status) { +- SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); +- } +- } +- { +- if (!(&result)->ok()) { +- SWIG_exception(ToSwigError((&result)->code()), 
(&result)->ToString().c_str()); +- } +- resultobj = SWIG_From_bool((&result)->ok()); +- } +- return resultobj; +-fail: +- return NULL; +-} +- +- +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +- PyObject *resultobj = 0; +- sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; +- int arg3 ; +- float arg4 ; +- bool arg5 ; +- bool arg6 ; +- void *argp1 = 0 ; +- int res1 = 0 ; +- int val3 ; +- int ecode3 = 0 ; +- float val4 ; +- int ecode4 = 0 ; +- bool val5 ; +- int ecode5 = 0 ; +- bool val6 ; +- int ecode6 = 0 ; +- PyObject *swig_obj[6] ; +- std::vector< std::pair< std::vector< std::string >,float > > result; +- +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsPieces", 6, 6, swig_obj)) SWIG_fail; +- res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); +- if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); +- } +- arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); +- { +- const PyInputString ustring(swig_obj[1]); +- if (!ustring.IsAvalable()) { +- PyErr_SetString(PyExc_TypeError, "not a string"); +- SWIG_fail; +- } +- resultobj = ustring.input_type(); +- arg2 = ustring.str(); +- } +- ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); +- if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "3"" of type '" "int""'"); +- } +- arg3 = static_cast< int >(val3); +- ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); +- if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', 
argument " "4"" of type '" "float""'"); +- } +- arg4 = static_cast< float >(val4); +- ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); +- if (!SWIG_IsOK(ecode5)) { +- SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "5"" of type '" "bool""'"); +- } +- arg5 = static_cast< bool >(val5); +- ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); +- if (!SWIG_IsOK(ecode6)) { +- SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "6"" of type '" "bool""'"); +- } +- arg6 = static_cast< bool >(val6); +- { +- try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsPieces(arg2,arg3,arg4,arg5,arg6); +- ReleaseResultObject(resultobj); +- } +- catch (const sentencepiece::util::Status &status) { +- SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); +- } +- } +- { +- PyObject *input_type = resultobj; +- resultobj = PyList_New((&result)->size()); +- for (size_t i = 0; i < (&result)->size(); ++i) { +- PyObject *obj = PyList_New(result[i].first.size()); +- for (size_t j = 0; j < result[i].first.size(); ++j) { +- PyList_SET_ITEM(obj, j, MakePyOutputString(result[i].first[j], input_type)); +- } +- PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); +- } +- } +- return resultobj; +-fail: +- return NULL; +-} +- +- +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +- PyObject *resultobj = 0; +- sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +- absl::string_view arg2 ; +- int arg3 ; +- float arg4 ; +- bool arg5 ; +- bool arg6 ; +- void *argp1 = 0 ; +- int res1 = 0 ; +- int val3 ; ++ float val3 ; + int ecode3 = 0 ; +- float val4 ; +- int ecode4 = 0 ; +- bool val5 ; +- int ecode5 = 0 ; +- bool val6 ; +- int ecode6 = 0 ; +- 
PyObject *swig_obj[6] ; +- std::vector< std::pair< std::vector< int >,float > > result; ++ void *argp4 = 0 ; ++ int res4 = 0 ; ++ sentencepiece::util::Status result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsIds", 6, 6, swig_obj)) SWIG_fail; ++ if ((nobjs < 4) || (nobjs > 4)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -4162,29 +4748,19 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyOb + resultobj = ustring.input_type(); + arg2 = ustring.str(); + } +- ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "3"" of type '" "int""'"); +- } +- arg3 = static_cast< int >(val3); +- ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); +- if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "4"" of type '" "float""'"); +- } +- arg4 = static_cast< float >(val4); +- ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); +- if (!SWIG_IsOK(ecode5)) { +- SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "5"" of type '" "bool""'"); +- } +- arg5 = static_cast< bool >(val5); +- ecode6 = 
SWIG_AsVal_bool(swig_obj[5], &val6); +- if (!SWIG_IsOK(ecode6)) { +- SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "6"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "3"" of type '" "float""'"); + } +- arg6 = static_cast< bool >(val6); ++ arg3 = static_cast< float >(val3); ++ res4 = SWIG_ConvertPtr(swig_obj[3], &argp4,SWIGTYPE_p_float, 0 | 0 ); ++ if (!SWIG_IsOK(res4)) { ++ SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "4"" of type '" "float *""'"); ++ } ++ arg4 = reinterpret_cast< float * >(argp4); + { + try { +- result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsIds(arg2,arg3,arg4,arg5,arg6); ++ result = ((sentencepiece::SentencePieceProcessor const *)arg1)->CalculateEntropy(arg2,arg3,arg4); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -4192,14 +4768,10 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyOb + } + } + { +- resultobj = PyList_New((&result)->size()); +- for (size_t i = 0; i < (&result)->size(); ++i) { +- PyObject *obj = PyList_New(result[i].first.size()); +- for (size_t j = 0; j < result[i].first.size(); ++j) { +- PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); +- } +- PyList_SET_ITEM(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); ++ if (!(&result)->ok()) { ++ SWIG_exception(ToSwigError((&result)->code()), (&result)->ToString().c_str()); + } ++ resultobj = SWIG_From_bool((&result)->ok()); + } + return resultobj; + fail: +@@ -5112,15 +5684,242 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProto(PyObj + } + } + { +- resultobj = MakePyOutputBytes(result); ++ resultobj = MakePyOutputBytes(result); ++ } ++ return 
resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsImmutableProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ absl::string_view arg2 ; ++ bool arg3 ; ++ int arg4 ; ++ float arg5 ; ++ bool arg6 ; ++ bool arg7 ; ++ bool arg8 ; ++ bool arg9 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ bool val3 ; ++ int ecode3 = 0 ; ++ int val4 ; ++ int ecode4 = 0 ; ++ float val5 ; ++ int ecode5 = 0 ; ++ bool val6 ; ++ int ecode6 = 0 ; ++ bool val7 ; ++ int ecode7 = 0 ; ++ bool val8 ; ++ int ecode8 = 0 ; ++ bool val9 ; ++ int ecode9 = 0 ; ++ PyObject *swig_obj[9] ; ++ sentencepiece::ImmutableSentencePieceText result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsImmutableProto", 9, 9, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ const PyInputString ustring(swig_obj[1]); ++ if (!ustring.IsAvalable()) { ++ PyErr_SetString(PyExc_TypeError, "not a string"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ arg2 = ustring.str(); ++ } ++ ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "3"" of type '" "bool""'"); ++ } ++ arg3 = static_cast< bool >(val3); ++ ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', 
argument " "4"" of type '" "int""'"); ++ } ++ arg4 = static_cast< int >(val4); ++ ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "5"" of type '" "float""'"); ++ } ++ arg5 = static_cast< float >(val5); ++ ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "6"" of type '" "bool""'"); ++ } ++ arg6 = static_cast< bool >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); ++ ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); ++ if (!SWIG_IsOK(ecode8)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "8"" of type '" "bool""'"); ++ } ++ arg8 = static_cast< bool >(val8); ++ ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); ++ if (!SWIG_IsOK(ecode9)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsImmutableProto" "', argument " "9"" of type '" "bool""'"); ++ } ++ arg9 = static_cast< bool >(val9); ++ { ++ try { ++ result = sentencepiece_SentencePieceProcessor__EncodeAsImmutableProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText(static_cast< const sentencepiece::ImmutableSentencePieceText& >(result))), 
SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0 ); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ std::vector< absl::string_view > *arg2 = 0 ; ++ int arg3 ; ++ bool arg4 ; ++ int arg5 ; ++ float arg6 ; ++ bool arg7 ; ++ bool arg8 ; ++ bool arg9 ; ++ bool arg10 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ int val3 ; ++ int ecode3 = 0 ; ++ bool val4 ; ++ int ecode4 = 0 ; ++ int val5 ; ++ int ecode5 = 0 ; ++ float val6 ; ++ int ecode6 = 0 ; ++ bool val7 ; ++ int ecode7 = 0 ; ++ bool val8 ; ++ int ecode8 = 0 ; ++ bool val9 ; ++ int ecode9 = 0 ; ++ bool val10 ; ++ int ecode10 = 0 ; ++ PyObject *swig_obj[10] ; ++ std::vector< std::vector< int > > result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsIdsBatch", 10, 10, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); ++ if (ustring.IsAvalable()) { ++ (*out)[i] = ustring.str(); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ arg2 = 
out; ++ } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "3"" of type '" "int""'"); ++ } ++ arg3 = static_cast< int >(val3); ++ ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "4"" of type '" "bool""'"); ++ } ++ arg4 = static_cast< bool >(val4); ++ ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "5"" of type '" "int""'"); ++ } ++ arg5 = static_cast< int >(val5); ++ ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "6"" of type '" "float""'"); ++ } ++ arg6 = static_cast< float >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); ++ ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); ++ if (!SWIG_IsOK(ecode8)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "8"" of type '" "bool""'"); ++ } ++ arg8 = static_cast< bool >(val8); ++ ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); ++ if (!SWIG_IsOK(ecode9)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "9"" of type '" "bool""'"); ++ } ++ arg9 = static_cast< bool >(val9); ++ ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); ++ if (!SWIG_IsOK(ecode10)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode10), 
"in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "10"" of type '" "bool""'"); ++ } ++ arg10 = static_cast< bool >(val10); ++ { ++ try { ++ result = sentencepiece_SentencePieceProcessor__EncodeAsIdsBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ { ++ resultobj = PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyObject *obj = PyList_New(result[i].size()); ++ for (size_t j = 0; j < result[i].size(); ++j) { ++ PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i][j]))); ++ } ++ PyList_SET_ITEM(resultobj, i, obj); ++ } ++ } ++ { ++ delete arg2; + } + return resultobj; + fail: ++ { ++ delete arg2; ++ } + return NULL; + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< absl::string_view > *arg2 = 0 ; +@@ -5151,12 +5950,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SW + bool val10 ; + int ecode10 = 0 ; + PyObject *swig_obj[10] ; +- std::vector< std::vector< int > > result; ++ std::vector< std::vector< std::string > > result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsIdsBatch", 10, 10, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsPiecesBatch", 10, 10, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if 
(!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -5182,47 +5981,47 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SW + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "3"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "4"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "4"" of type '" "bool""'"); + } + arg4 = static_cast< bool >(val4); + ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { +- SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "5"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "5"" of type '" "int""'"); + } + arg5 = static_cast< int >(val5); + ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { +- SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', 
argument " "6"" of type '" "float""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "6"" of type '" "float""'"); + } + arg6 = static_cast< float >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { +- SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "7"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { +- SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "8"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { +- SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "9"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); + ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); + if (!SWIG_IsOK(ecode10)) { +- SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "10"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "10"" of type '" "bool""'"); + } + arg10 = static_cast< bool >(val10); + { + try { +- result = 
sentencepiece_SentencePieceProcessor__EncodeAsIdsBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ++ result = sentencepiece_SentencePieceProcessor__EncodeAsPiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5230,11 +6029,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SW + } + } + { ++ PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { +- PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast(result[i][j]))); ++ PyList_SET_ITEM(obj, j, MakePyOutputString(result[i][j], input_type)); + } + PyList_SET_ITEM(resultobj, i, obj); + } +@@ -5251,7 +6051,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< absl::string_view > *arg2 = 0 ; +@@ -5282,12 +6082,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject + bool val10 ; + int ecode10 = 0 ; + PyObject *swig_obj[10] ; +- std::vector< std::vector< std::string > > result; ++ BytesArray result; + +- if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsPiecesBatch", 10, 10, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsSerializedProtoBatch", 10, 10, swig_obj)) SWIG_fail; + res1 = 
SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -5313,47 +6113,47 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "3"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "4"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "4"" of type '" "bool""'"); + } + arg4 = static_cast< bool >(val4); + ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { +- SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "5"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "5"" of type '" "int""'"); + } + arg5 = static_cast< int >(val5); + ecode6 = 
SWIG_AsVal_float(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { +- SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "6"" of type '" "float""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "6"" of type '" "float""'"); + } + arg6 = static_cast< float >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { +- SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "7"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { +- SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "8"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { +- SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "9"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); + ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); + if (!SWIG_IsOK(ecode10)) { +- SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "10"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" 
"SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "10"" of type '" "bool""'"); + } + arg10 = static_cast< bool >(val10); + { + try { +- result = sentencepiece_SentencePieceProcessor__EncodeAsPiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ++ result = sentencepiece_SentencePieceProcessor__EncodeAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5361,14 +6161,9 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject + } + } + { +- PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyObject *obj = PyList_New(result[i].size()); +- for (size_t j = 0; j < result[i].size(); ++j) { +- PyList_SET_ITEM(obj, j, MakePyOutputString(result[i][j], input_type)); +- } +- PyList_SET_ITEM(resultobj, i, obj); ++ PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); + } + } + { +@@ -5383,7 +6178,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsImmutableProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< absl::string_view > *arg2 = 0 ; +@@ -5414,12 +6209,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch( + bool val10 ; + int ecode10 = 0 ; + PyObject *swig_obj[10] ; +- BytesArray result; ++ SwigValueWrapper< std::vector< sentencepiece::ImmutableSentencePieceText > > result; + +- if 
(!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsSerializedProtoBatch", 10, 10, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsImmutableProtoBatch", 10, 10, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { +@@ -5445,47 +6240,47 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch( + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { +- SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "3"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { +- SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "4"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "4"" of type '" "bool""'"); + } + arg4 = static_cast< bool >(val4); + ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { +- SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" 
"SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "5"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "5"" of type '" "int""'"); + } + arg5 = static_cast< int >(val5); + ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { +- SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "6"" of type '" "float""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "6"" of type '" "float""'"); + } + arg6 = static_cast< float >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { +- SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "7"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { +- SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "8"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { +- SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "9"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "9"" of type '" "bool""'"); + } + arg9 = 
static_cast< bool >(val9); + ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); + if (!SWIG_IsOK(ecode10)) { +- SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "10"" of type '" "bool""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsImmutableProtoBatch" "', argument " "10"" of type '" "bool""'"); + } + arg10 = static_cast< bool >(val10); + { + try { +- result = sentencepiece_SentencePieceProcessor__EncodeAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ++ result = sentencepiece_SentencePieceProcessor__EncodeAsImmutableProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -5495,7 +6290,8 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch( + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); ++ PyObject *obj = SWIG_NewPointerObj(new sentencepiece::ImmutableSentencePieceText((&result)->at(i)), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0); ++ PyList_SET_ITEM(resultobj, i, obj); + } + } + { +@@ -5750,6 +6546,121 @@ fail: + } + + ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsAsImmutableProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ std::vector< int > *arg2 = 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[2] ; ++ sentencepiece::ImmutableSentencePieceText result; ++ ++ if (!SWIG_Python_UnpackTuple(args, 
"SentencePieceProcessor__DecodeIdsAsImmutableProto", 2, 2, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodeIdsAsImmutableProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ PyObject *o = PyList_GetItem(swig_obj[1], i); ++ if (PyInt_Check(o)) { ++ (*out)[i] = static_cast(PyInt_AsLong(o)); ++ } else { ++ PyErr_SetString(PyExc_TypeError,"list must contain integers"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } ++ { ++ try { ++ result = sentencepiece_SentencePieceProcessor__DecodeIdsAsImmutableProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< int > const &)*arg2); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText(static_cast< const sentencepiece::ImmutableSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0 ); ++ { ++ delete arg2; ++ } ++ return resultobj; ++fail: ++ { ++ delete arg2; ++ } ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsImmutableProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ std::vector< absl::string_view > *arg2 
= 0 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ PyObject *swig_obj[2] ; ++ sentencepiece::ImmutableSentencePieceText result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePiecesAsImmutableProto", 2, 2, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePiecesAsImmutableProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector(size); ++ for (size_t i = 0; i < size; ++i) { ++ const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); ++ if (ustring.IsAvalable()) { ++ (*out)[i] = ustring.str(); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } ++ { ++ try { ++ result = sentencepiece_SentencePieceProcessor__DecodePiecesAsImmutableProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText(static_cast< const sentencepiece::ImmutableSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0 ); ++ { ++ delete arg2; ++ } ++ return resultobj; ++fail: ++ { ++ delete arg2; ++ } ++ return NULL; ++} ++ ++ + SWIGINTERN PyObject 
*_wrap_SentencePieceProcessor__DecodeIdsBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +@@ -6043,7 +6954,82 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto + arg3 = static_cast< int >(val3); + { + try { +- result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< absl::string_view > > const &)*arg2,arg3); ++ result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< absl::string_view > > const &)*arg2,arg3); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ { ++ resultobj = PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); ++ } ++ } ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ std::vector< std::vector< absl::string_view > > *arg2 = 0 ; ++ int arg3 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ int val3 ; ++ int ecode3 = 0 ; ++ PyObject *swig_obj[3] ; ++ SwigValueWrapper< std::vector< sentencepiece::ImmutableSentencePieceText > > result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch", 3, 3, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector> *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector>(size); ++ for (size_t i = 0; i < size; ++i) { ++ PyObject *o = PyList_GetItem(swig_obj[1], i); ++ if (PyList_Check(o)) { ++ const size_t size2 = PyList_Size(o); ++ (*out)[i].resize(size2); ++ for (size_t j = 0; j < size2; ++j) { ++ const PyInputString ustring(PyList_GetItem(o, j)); ++ if (ustring.IsAvalable()) { ++ (*out)[i][j] = ustring.str(); ++ } else { ++ PyErr_SetString(PyExc_TypeError,"list must contain integers"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch" "', argument " "3"" of type '" "int""'"); ++ } ++ arg3 = static_cast< int >(val3); ++ { ++ try { ++ result = sentencepiece_SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< absl::string_view > > const &)*arg2,arg3); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -6053,7 +7039,8 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { +- PyList_SET_ITEM(resultobj, i, MakePyOutputBytes(result[i])); ++ PyObject *obj = 
SWIG_NewPointerObj(new sentencepiece::ImmutableSentencePieceText((&result)->at(i)), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0); ++ PyList_SET_ITEM(resultobj, i, obj); + } + } + return resultobj; +@@ -6323,6 +7310,86 @@ fail: + } + + ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsImmutableProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ absl::string_view arg2 ; ++ int arg3 ; ++ bool arg4 ; ++ bool arg5 ; ++ bool arg6 ; ++ bool arg7 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ int val3 ; ++ int ecode3 = 0 ; ++ bool val4 ; ++ int ecode4 = 0 ; ++ bool val5 ; ++ int ecode5 = 0 ; ++ bool val6 ; ++ int ecode6 = 0 ; ++ bool val7 ; ++ int ecode7 = 0 ; ++ PyObject *swig_obj[7] ; ++ sentencepiece::ImmutableNBestSentencePieceText result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsImmutableProto", 7, 7, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ const PyInputString ustring(swig_obj[1]); ++ if (!ustring.IsAvalable()) { ++ PyErr_SetString(PyExc_TypeError, "not a string"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ arg2 = ustring.str(); ++ } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "3"" of type '" "int""'"); ++ } ++ arg3 = static_cast< int >(val3); ++ ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); 
++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "4"" of type '" "bool""'"); ++ } ++ arg4 = static_cast< bool >(val4); ++ ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "5"" of type '" "bool""'"); ++ } ++ arg5 = static_cast< bool >(val5); ++ ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "6"" of type '" "bool""'"); ++ } ++ arg6 = static_cast< bool >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__NBestEncodeAsImmutableProto" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); ++ { ++ try { ++ result = sentencepiece_SentencePieceProcessor__NBestEncodeAsImmutableProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableNBestSentencePieceText(static_cast< const sentencepiece::ImmutableNBestSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, SWIG_POINTER_OWN | 0 ); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ + SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +@@ -6550,6 +7617,216 @@ fail: 
+ } + + ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ absl::string_view arg2 ; ++ int arg3 ; ++ float arg4 ; ++ bool arg5 ; ++ bool arg6 ; ++ bool arg7 ; ++ bool arg8 ; ++ bool arg9 ; ++ bool arg10 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ int val3 ; ++ int ecode3 = 0 ; ++ float val4 ; ++ int ecode4 = 0 ; ++ bool val5 ; ++ int ecode5 = 0 ; ++ bool val6 ; ++ int ecode6 = 0 ; ++ bool val7 ; ++ int ecode7 = 0 ; ++ bool val8 ; ++ int ecode8 = 0 ; ++ bool val9 ; ++ int ecode9 = 0 ; ++ bool val10 ; ++ int ecode10 = 0 ; ++ PyObject *swig_obj[10] ; ++ sentencepiece::util::bytes result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto", 10, 10, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ const PyInputString ustring(swig_obj[1]); ++ if (!ustring.IsAvalable()) { ++ PyErr_SetString(PyExc_TypeError, "not a string"); ++ SWIG_fail; ++ } ++ resultobj = ustring.input_type(); ++ arg2 = ustring.str(); ++ } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "3"" of type '" "int""'"); ++ } ++ arg3 = static_cast< int >(val3); ++ ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in 
method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "4"" of type '" "float""'"); ++ } ++ arg4 = static_cast< float >(val4); ++ ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "5"" of type '" "bool""'"); ++ } ++ arg5 = static_cast< bool >(val5); ++ ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "6"" of type '" "bool""'"); ++ } ++ arg6 = static_cast< bool >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); ++ ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); ++ if (!SWIG_IsOK(ecode8)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "8"" of type '" "bool""'"); ++ } ++ arg8 = static_cast< bool >(val8); ++ ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); ++ if (!SWIG_IsOK(ecode9)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "9"" of type '" "bool""'"); ++ } ++ arg9 = static_cast< bool >(val9); ++ ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); ++ if (!SWIG_IsOK(ecode10)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto" "', argument " "10"" of type '" "bool""'"); ++ } ++ arg10 = static_cast< bool >(val10); ++ { ++ try { ++ result = 
sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ { ++ resultobj = MakePyOutputBytes(result); ++ } ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ absl::string_view arg2 ; ++ int arg3 ; ++ float arg4 ; ++ bool arg5 ; ++ bool arg6 ; ++ bool arg7 ; ++ bool arg8 ; ++ bool arg9 ; ++ bool arg10 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ int val3 ; ++ int ecode3 = 0 ; ++ float val4 ; ++ int ecode4 = 0 ; ++ bool val5 ; ++ int ecode5 = 0 ; ++ bool val6 ; ++ int ecode6 = 0 ; ++ bool val7 ; ++ int ecode7 = 0 ; ++ bool val8 ; ++ int ecode8 = 0 ; ++ bool val9 ; ++ int ecode9 = 0 ; ++ bool val10 ; ++ int ecode10 = 0 ; ++ PyObject *swig_obj[10] ; ++ sentencepiece::ImmutableNBestSentencePieceText result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto", 10, 10, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ const PyInputString ustring(swig_obj[1]); ++ if (!ustring.IsAvalable()) { ++ PyErr_SetString(PyExc_TypeError, "not a string"); ++ SWIG_fail; ++ } ++ 
resultobj = ustring.input_type(); ++ arg2 = ustring.str(); ++ } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "3"" of type '" "int""'"); ++ } ++ arg3 = static_cast< int >(val3); ++ ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); ++ if (!SWIG_IsOK(ecode4)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "4"" of type '" "float""'"); ++ } ++ arg4 = static_cast< float >(val4); ++ ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); ++ if (!SWIG_IsOK(ecode5)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "5"" of type '" "bool""'"); ++ } ++ arg5 = static_cast< bool >(val5); ++ ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); ++ if (!SWIG_IsOK(ecode6)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "6"" of type '" "bool""'"); ++ } ++ arg6 = static_cast< bool >(val6); ++ ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); ++ if (!SWIG_IsOK(ecode7)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "7"" of type '" "bool""'"); ++ } ++ arg7 = static_cast< bool >(val7); ++ ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); ++ if (!SWIG_IsOK(ecode8)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "8"" of type '" "bool""'"); ++ } ++ arg8 = static_cast< bool >(val8); ++ ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); ++ if (!SWIG_IsOK(ecode9)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " 
"9"" of type '" "bool""'"); ++ } ++ arg9 = static_cast< bool >(val9); ++ ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); ++ if (!SWIG_IsOK(ecode10)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto" "', argument " "10"" of type '" "bool""'"); ++ } ++ arg10 = static_cast< bool >(val10); ++ { ++ try { ++ result = sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableNBestSentencePieceText(static_cast< const sentencepiece::ImmutableNBestSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, SWIG_POINTER_OWN | 0 ); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ + SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropy(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +@@ -7009,6 +8286,31 @@ SWIGINTERN PyObject *SentencePieceTrainer_swigregister(PyObject *SWIGUNUSEDPARM( + + static PyMethodDef SwigMethods[] = { + { "SWIG_PyInstanceMethod_New", SWIG_PyInstanceMethod_New, METH_O, NULL}, ++ { "new_ImmutableSentencePieceText_ImmutableSentencePiece", _wrap_new_ImmutableSentencePieceText_ImmutableSentencePiece, METH_NOARGS, NULL}, ++ { "delete_ImmutableSentencePieceText_ImmutableSentencePiece", _wrap_delete_ImmutableSentencePieceText_ImmutableSentencePiece, METH_O, NULL}, ++ { "ImmutableSentencePieceText_ImmutableSentencePiece_piece", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_piece, METH_O, NULL}, ++ { 
"ImmutableSentencePieceText_ImmutableSentencePiece_surface", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_surface, METH_O, NULL}, ++ { "ImmutableSentencePieceText_ImmutableSentencePiece_id", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_id, METH_O, NULL}, ++ { "ImmutableSentencePieceText_ImmutableSentencePiece_begin", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_begin, METH_O, NULL}, ++ { "ImmutableSentencePieceText_ImmutableSentencePiece_end", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_end, METH_O, NULL}, ++ { "ImmutableSentencePieceText_ImmutableSentencePiece_swigregister", ImmutableSentencePieceText_ImmutableSentencePiece_swigregister, METH_O, NULL}, ++ { "ImmutableSentencePieceText_ImmutableSentencePiece_swiginit", ImmutableSentencePieceText_ImmutableSentencePiece_swiginit, METH_VARARGS, NULL}, ++ { "new_ImmutableSentencePieceText", _wrap_new_ImmutableSentencePieceText, METH_NOARGS, NULL}, ++ { "delete_ImmutableSentencePieceText", _wrap_delete_ImmutableSentencePieceText, METH_O, NULL}, ++ { "ImmutableSentencePieceText_pieces_size", _wrap_ImmutableSentencePieceText_pieces_size, METH_O, NULL}, ++ { "ImmutableSentencePieceText_text", _wrap_ImmutableSentencePieceText_text, METH_O, NULL}, ++ { "ImmutableSentencePieceText_score", _wrap_ImmutableSentencePieceText_score, METH_O, NULL}, ++ { "ImmutableSentencePieceText_SerializeAsString", _wrap_ImmutableSentencePieceText_SerializeAsString, METH_O, NULL}, ++ { "ImmutableSentencePieceText_pieces", _wrap_ImmutableSentencePieceText_pieces, METH_VARARGS, NULL}, ++ { "ImmutableSentencePieceText_swigregister", ImmutableSentencePieceText_swigregister, METH_O, NULL}, ++ { "ImmutableSentencePieceText_swiginit", ImmutableSentencePieceText_swiginit, METH_VARARGS, NULL}, ++ { "new_ImmutableNBestSentencePieceText", _wrap_new_ImmutableNBestSentencePieceText, METH_NOARGS, NULL}, ++ { "delete_ImmutableNBestSentencePieceText", _wrap_delete_ImmutableNBestSentencePieceText, METH_O, NULL}, ++ { 
"ImmutableNBestSentencePieceText_nbests_size", _wrap_ImmutableNBestSentencePieceText_nbests_size, METH_O, NULL}, ++ { "ImmutableNBestSentencePieceText_SerializeAsString", _wrap_ImmutableNBestSentencePieceText_SerializeAsString, METH_O, NULL}, ++ { "ImmutableNBestSentencePieceText_nbests", _wrap_ImmutableNBestSentencePieceText_nbests, METH_VARARGS, NULL}, ++ { "ImmutableNBestSentencePieceText_swigregister", ImmutableNBestSentencePieceText_swigregister, METH_O, NULL}, ++ { "ImmutableNBestSentencePieceText_swiginit", ImmutableNBestSentencePieceText_swiginit, METH_VARARGS, NULL}, + { "new_SentencePieceProcessor", _wrap_new_SentencePieceProcessor, METH_NOARGS, NULL}, + { "delete_SentencePieceProcessor", _wrap_delete_SentencePieceProcessor, METH_O, NULL}, + { "SentencePieceProcessor_LoadFromSerializedProto", _wrap_SentencePieceProcessor_LoadFromSerializedProto, METH_VARARGS, NULL}, +@@ -7017,8 +8319,6 @@ static PyMethodDef SwigMethods[] = { + { "SentencePieceProcessor_SetVocabulary", _wrap_SentencePieceProcessor_SetVocabulary, METH_VARARGS, NULL}, + { "SentencePieceProcessor_ResetVocabulary", _wrap_SentencePieceProcessor_ResetVocabulary, METH_O, NULL}, + { "SentencePieceProcessor_LoadVocabulary", _wrap_SentencePieceProcessor_LoadVocabulary, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_SampleEncodeAndScoreAsPieces", _wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces, METH_VARARGS, NULL}, +- { "SentencePieceProcessor_SampleEncodeAndScoreAsIds", _wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds, METH_VARARGS, NULL}, + { "SentencePieceProcessor_CalculateEntropy", _wrap_SentencePieceProcessor_CalculateEntropy, METH_VARARGS, NULL}, + { "SentencePieceProcessor_GetPieceSize", _wrap_SentencePieceProcessor_GetPieceSize, METH_O, NULL}, + { "SentencePieceProcessor_PieceToId", _wrap_SentencePieceProcessor_PieceToId, METH_VARARGS, NULL}, +@@ -7037,22 +8337,30 @@ static PyMethodDef SwigMethods[] = { + { "SentencePieceProcessor__EncodeAsIds", 
_wrap_SentencePieceProcessor__EncodeAsIds, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsPieces", _wrap_SentencePieceProcessor__EncodeAsPieces, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsSerializedProto", _wrap_SentencePieceProcessor__EncodeAsSerializedProto, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__EncodeAsImmutableProto", _wrap_SentencePieceProcessor__EncodeAsImmutableProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsIdsBatch", _wrap_SentencePieceProcessor__EncodeAsIdsBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsPiecesBatch", _wrap_SentencePieceProcessor__EncodeAsPiecesBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsSerializedProtoBatch", _wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__EncodeAsImmutableProtoBatch", _wrap_SentencePieceProcessor__EncodeAsImmutableProtoBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodeIds", _wrap_SentencePieceProcessor__DecodeIds, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePieces", _wrap_SentencePieceProcessor__DecodePieces, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodeIdsAsSerializedProto", _wrap_SentencePieceProcessor__DecodeIdsAsSerializedProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePiecesAsSerializedProto", _wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__DecodeIdsAsImmutableProto", _wrap_SentencePieceProcessor__DecodeIdsAsImmutableProto, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__DecodePiecesAsImmutableProto", _wrap_SentencePieceProcessor__DecodePiecesAsImmutableProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodeIdsBatch", _wrap_SentencePieceProcessor__DecodeIdsBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch", _wrap_SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch, METH_VARARGS, NULL}, + { 
"SentencePieceProcessor__DecodePiecesBatch", _wrap_SentencePieceProcessor__DecodePiecesBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch", _wrap_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch", _wrap_SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__NBestEncodeAsIds", _wrap_SentencePieceProcessor__NBestEncodeAsIds, METH_VARARGS, NULL}, + { "SentencePieceProcessor__NBestEncodeAsPieces", _wrap_SentencePieceProcessor__NBestEncodeAsPieces, METH_VARARGS, NULL}, + { "SentencePieceProcessor__NBestEncodeAsSerializedProto", _wrap_SentencePieceProcessor__NBestEncodeAsSerializedProto, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__NBestEncodeAsImmutableProto", _wrap_SentencePieceProcessor__NBestEncodeAsImmutableProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__SampleEncodeAndScoreAsIds", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds, METH_VARARGS, NULL}, + { "SentencePieceProcessor__SampleEncodeAndScoreAsPieces", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__CalculateEntropy", _wrap_SentencePieceProcessor__CalculateEntropy, METH_VARARGS, NULL}, + { "SentencePieceProcessor__CalculateEntropyBatch", _wrap_SentencePieceProcessor__CalculateEntropyBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor_swigregister", SentencePieceProcessor_swigregister, METH_O, NULL}, +@@ -7076,6 +8384,9 @@ static PyMethodDef SwigMethods_proxydocs[] = { + + static swig_type_info _swigt__p_char = {"_p_char", 
"char *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_float = {"_p_float", "float *", 0, 0, (void*)0, 0}; ++static swig_type_info _swigt__p_sentencepiece__ImmutableNBestSentencePieceText = {"_p_sentencepiece__ImmutableNBestSentencePieceText", "sentencepiece::ImmutableNBestSentencePieceText *", 0, 0, (void*)0, 0}; ++static swig_type_info _swigt__p_sentencepiece__ImmutableSentencePieceText = {"_p_sentencepiece__ImmutableSentencePieceText", "sentencepiece::ImmutableSentencePieceText *", 0, 0, (void*)0, 0}; ++static swig_type_info _swigt__p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece = {"_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece", "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_sentencepiece__SentenceIterator = {"_p_sentencepiece__SentenceIterator", "sentencepiece::SentenceIterator *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_sentencepiece__SentencePieceProcessor = {"_p_sentencepiece__SentencePieceProcessor", "sentencepiece::SentencePieceProcessor *", 0, 0, (void*)0, 0}; + static swig_type_info _swigt__p_sentencepiece__SentencePieceTrainer = {"_p_sentencepiece__SentencePieceTrainer", "sentencepiece::SentencePieceTrainer *", 0, 0, (void*)0, 0}; +@@ -7089,6 +8400,9 @@ static swig_type_info _swigt__p_std__vectorT_std__vectorT_int_t_t = {"_p_std__ve + static swig_type_info *swig_type_initial[] = { + &_swigt__p_char, + &_swigt__p_float, ++ &_swigt__p_sentencepiece__ImmutableNBestSentencePieceText, ++ &_swigt__p_sentencepiece__ImmutableSentencePieceText, ++ &_swigt__p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, + &_swigt__p_sentencepiece__SentenceIterator, + &_swigt__p_sentencepiece__SentencePieceProcessor, + &_swigt__p_sentencepiece__SentencePieceTrainer, +@@ -7102,6 +8416,9 @@ static swig_type_info *swig_type_initial[] = { + + static swig_cast_info _swigc__p_char[] = { {&_swigt__p_char, 0, 0, 0},{0, 0, 0, 
0}}; + static swig_cast_info _swigc__p_float[] = { {&_swigt__p_float, 0, 0, 0},{0, 0, 0, 0}}; ++static swig_cast_info _swigc__p_sentencepiece__ImmutableNBestSentencePieceText[] = { {&_swigt__p_sentencepiece__ImmutableNBestSentencePieceText, 0, 0, 0},{0, 0, 0, 0}}; ++static swig_cast_info _swigc__p_sentencepiece__ImmutableSentencePieceText[] = { {&_swigt__p_sentencepiece__ImmutableSentencePieceText, 0, 0, 0},{0, 0, 0, 0}}; ++static swig_cast_info _swigc__p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece[] = { {&_swigt__p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_sentencepiece__SentenceIterator[] = { {&_swigt__p_sentencepiece__SentenceIterator, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_sentencepiece__SentencePieceProcessor[] = { {&_swigt__p_sentencepiece__SentencePieceProcessor, 0, 0, 0},{0, 0, 0, 0}}; + static swig_cast_info _swigc__p_sentencepiece__SentencePieceTrainer[] = { {&_swigt__p_sentencepiece__SentencePieceTrainer, 0, 0, 0},{0, 0, 0, 0}}; +@@ -7115,6 +8432,9 @@ static swig_cast_info _swigc__p_std__vectorT_std__vectorT_int_t_t[] = { {&_swig + static swig_cast_info *swig_cast_initial[] = { + _swigc__p_char, + _swigc__p_float, ++ _swigc__p_sentencepiece__ImmutableNBestSentencePieceText, ++ _swigc__p_sentencepiece__ImmutableSentencePieceText, ++ _swigc__p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, + _swigc__p_sentencepiece__SentenceIterator, + _swigc__p_sentencepiece__SentencePieceProcessor, + _swigc__p_sentencepiece__SentencePieceTrainer, +diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py +index 6c48bcd..2f2c84a 100755 +--- a/python/test/sentencepiece_test.py ++++ b/python/test/sentencepiece_test.py +@@ -287,16 +287,44 @@ class TestSentencepieceProcessor(unittest.TestCase): + ids2 = self.sp_.EncodeAsIds(text2) + pieces = self.sp_.EncodeAsPieces(text) + pieces2 = self.sp_.EncodeAsPieces(text2) 
+- protos = self.sp_.EncodeAsSerializedProto(text) +- proto2 = self.sp_.EncodeAsSerializedProto(text2) ++ sprotos = self.sp_.EncodeAsSerializedProto(text) ++ sproto2 = self.sp_.EncodeAsSerializedProto(text2) ++ iprotos = self.sp_.EncodeAsImmutableProto(text) ++ iprotos2 = self.sp_.EncodeAsImmutableProto(text2) + + self.assertEqual(sp.encode(text, out_type=int), ids) + self.assertEqual(sp.encode(text, out_type=str), pieces) +- self.assertEqual(sp.encode(text, out_type='proto'), protos) ++ self.assertEqual(sp.encode(text, out_type='serialized_proto'), sprotos) ++ self.assertEqual(sp.encode(text, out_type='immutable_proto'), iprotos) + + self.assertEqual(sp.encode([text], out_type=int), [ids]) + self.assertEqual(sp.encode([text], out_type=str), [pieces]) +- self.assertEqual(sp.encode([text], out_type='proto'), [protos]) ++ self.assertEqual(sp.encode([text], out_type='serialized_proto'), [sprotos]) ++ self.assertEqual(sp.encode([text], out_type='immutable_proto'), [iprotos]) ++ ++ self.assertEqual(len(iprotos), len(pieces)) ++ self.assertEqual(len(iprotos), len(ids)) ++ self.assertEqual(iprotos.text(), text) ++ ++ self.assertEqual(len(iprotos2), len(pieces2)) ++ self.assertEqual(len(iprotos2), len(ids2)) ++ self.assertEqual(iprotos2.text(), text2) ++ ++ for i in range(len(iprotos)): ++ self.assertEqual(ids[i], iprotos.pieces(i).id()) ++ self.assertEqual(pieces[i], iprotos.pieces(i).piece()) ++ ++ for i, piece in enumerate(iprotos): ++ self.assertEqual(ids[i], piece.id()) ++ self.assertEqual(pieces[i], piece.piece()) ++ ++ for i in range(len(iprotos2)): ++ self.assertEqual(ids2[i], iprotos2.pieces(i).id()) ++ self.assertEqual(pieces2[i], iprotos2.pieces(i).piece()) ++ ++ for i, piece in enumerate(iprotos2): ++ self.assertEqual(ids2[i], piece.id()) ++ self.assertEqual(pieces2[i], piece.piece()) + + detok_ids = self.sp_.DecodeIds(ids) + detok_pieces = self.sp_.DecodePieces(pieces) +@@ -464,19 +492,29 @@ class TestSentencepieceProcessor(unittest.TestCase): + 
self.assertEqual(d1, d4) + self.assertEqual(d1, d5) + +- r1 = sp.encode(texts, out_type='proto', num_threads=None) +- r2 = sp.encode(texts, out_type='proto', num_threads=1) +- r3 = sp.encode(texts, out_type='proto', num_threads=-1) +- r4 = sp.encode(texts, out_type='proto', num_threads=8) +- r5 = [sp.encode(s, out_type='proto') for s in texts] ++ r1 = sp.encode(texts, out_type='serialized_proto', num_threads=None) ++ r2 = sp.encode(texts, out_type='serialized_proto', num_threads=1) ++ r3 = sp.encode(texts, out_type='serialized_proto', num_threads=-1) ++ r4 = sp.encode(texts, out_type='serialized_proto', num_threads=8) ++ r5 = [sp.encode(s, out_type='serialized_proto') for s in texts] ++ self.assertEqual(r1, r2) ++ self.assertEqual(r1, r3) ++ self.assertEqual(r1, r4) ++ self.assertEqual(r1, r5) ++ ++ r1 = sp.encode(texts, out_type='immutable_proto', num_threads=None) ++ r2 = sp.encode(texts, out_type='immutable_proto', num_threads=1) ++ r3 = sp.encode(texts, out_type='immutable_proto', num_threads=-1) ++ r4 = sp.encode(texts, out_type='immutable_proto', num_threads=8) ++ r5 = [sp.encode(s, out_type='immutable_proto') for s in texts] + self.assertEqual(r1, r2) + self.assertEqual(r1, r3) + self.assertEqual(r1, r4) + self.assertEqual(r1, r5) + +- e1 = sp.calculate_entropy(texts, theta=1.0, num_threads=10) +- e2 = sp.CalculateEntropy(texts, theta=1.0, num_threads=10) +- e3 = [sp.calculate_entropy(s, theta=1.0) for s in texts] ++ e1 = sp.calculate_entropy(texts, alpha=1.0, num_threads=10) ++ e2 = sp.CalculateEntropy(texts, alpha=1.0, num_threads=10) ++ e3 = [sp.calculate_entropy(s, alpha=1.0) for s in texts] + self.assertEqual(e1, e2) + self.assertEqual(e1, e3) + +diff --git a/src/sentencepiece_processor.cc b/src/sentencepiece_processor.cc +index 805e0f9..482a45b 100644 +--- a/src/sentencepiece_processor.cc ++++ b/src/sentencepiece_processor.cc +@@ -54,65 +54,70 @@ std::vector ToPieceArray(const std::vector &v) { + for (int i = 0; i < v.size(); ++i) out[i] = v[i]; + 
return out; + } ++ + } // namespace + +-ImmutableSentencePieceText::ImmutableSentencePieceText() {} +-ImmutableSentencePieceText::~ImmutableSentencePieceText() {} ++ImmutableSentencePieceText::ImmutableSentencePieceText() ++ : spt_(&SentencePieceText::default_instance()) {} + + ImmutableSentencePieceText::ImmutableSentencePieceText( + const SentencePieceText &spt) + : spt_(&spt) {} + +-ImmutableSentencePieceText::ImmutableSentencePiece::ImmutableSentencePiece( +- const SentencePieceText_SentencePiece &sp) ++ImmutableSentencePieceText::~ImmutableSentencePieceText() {} ++ ++ImmutableSentencePieceText_ImmutableSentencePiece:: ++ ImmutableSentencePieceText_ImmutableSentencePiece() ++ : sp_(&SentencePieceText_SentencePiece::default_instance()) {} ++ ++ImmutableSentencePieceText_ImmutableSentencePiece:: ++ ImmutableSentencePieceText_ImmutableSentencePiece( ++ const SentencePieceText_SentencePiece &sp) + : sp_(&sp) {} + +-const std::string &ImmutableSentencePieceText::ImmutableSentencePiece::piece() ++const std::string &ImmutableSentencePieceText_ImmutableSentencePiece::piece() + const { + return sp_->piece(); + } + +-const std::string &ImmutableSentencePieceText::ImmutableSentencePiece::surface() ++const std::string &ImmutableSentencePieceText_ImmutableSentencePiece::surface() + const { + return sp_->surface(); + } + +-uint32_t ImmutableSentencePieceText::ImmutableSentencePiece::id() const { ++uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::id() const { + return sp_->id(); + } + +-uint32_t ImmutableSentencePieceText::ImmutableSentencePiece::begin() const { ++uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::begin() const { + return sp_->begin(); + } + +-uint32_t ImmutableSentencePieceText::ImmutableSentencePiece::end() const { ++uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::end() const { + return sp_->end(); + } + +-std::vector ++std::vector + ImmutableSentencePieceText::pieces() const { +- std::vector pieces; +- if (spt_ == nullptr) 
return pieces; +- pieces.reserve(spt_->pieces_size()); ++ std::vector pieces( ++ spt_->pieces_size()); + for (int i = 0; i < spt_->pieces_size(); ++i) +- pieces[i] = ImmutableSentencePiece(spt_->pieces(i)); ++ pieces[i] = ++ ImmutableSentencePieceText_ImmutableSentencePiece(spt_->pieces(i)); + return pieces; + } + + size_t ImmutableSentencePieceText::pieces_size() const { +- return spt_ ? spt_->pieces_size() : 0; ++ return spt_->pieces_size(); + } + +-ImmutableSentencePieceText::ImmutableSentencePiece ++ImmutableSentencePieceText_ImmutableSentencePiece + ImmutableSentencePieceText::pieces(int index) const { +- return ImmutableSentencePieceText::ImmutableSentencePiece( +- spt_->pieces(index)); ++ return ImmutableSentencePieceText_ImmutableSentencePiece(spt_->pieces(index)); + } + + const std::string &ImmutableSentencePieceText::text() const { +- if (spt_) return spt_->text(); +- static std::string *kEmptyString = new std::string(); +- return *kEmptyString; ++ return spt_->text(); + } + + float ImmutableSentencePieceText::score() const { +@@ -127,8 +132,8 @@ SentencePieceText *ImmutableSentencePieceText::mutable_proto() { + return rep_.get(); + } + +-std::string ImmutableSentencePieceText::SerializeAsString() const { +- return spt_ ? 
spt_->SerializeAsString() : ""; ++util::bytes ImmutableSentencePieceText::SerializeAsString() const { ++ return spt_->SerializeAsString(); + } + + ImmutableNBestSentencePieceText::ImmutableNBestSentencePieceText() {} +@@ -145,9 +150,8 @@ ImmutableSentencePieceText ImmutableNBestSentencePieceText::nbests( + + std::vector + ImmutableNBestSentencePieceText::nbests() const { +- std::vector nbests; +- if (rep_ == nullptr) return nbests; +- nbests.reserve(rep_->nbests_size()); ++ if (rep_ == nullptr) return {}; ++ std::vector nbests(rep_->nbests_size()); + for (int i = 0; i < rep_->nbests_size(); ++i) + nbests[i] = ImmutableSentencePieceText(rep_->nbests(i)); + return nbests; +@@ -160,7 +164,7 @@ NBestSentencePieceText *ImmutableNBestSentencePieceText::mutable_proto() { + return rep_.get(); + } + +-std::string ImmutableNBestSentencePieceText::SerializeAsString() const { ++util::bytes ImmutableNBestSentencePieceText::SerializeAsString() const { + return rep_ ? rep_->SerializeAsString() : ""; + } + +@@ -1044,8 +1048,35 @@ std::string SentencePieceProcessor::serialized_model_proto() const { + // std::random_device. 
+ void SetRandomGeneratorSeed(unsigned int seed); + +-namespace io { ++void ConvertToUnicodeSpans(SentencePieceText *spt) { ++ if (spt == nullptr) return; ++ ++ std::vector utf8_to_unicode(spt->text().size() + 1, 0); ++ absl::string_view str = spt->text(); ++ size_t prev = 0; ++ int ulen = 0; ++ while (!str.empty()) { ++ const size_t mblen = string_util::OneCharLen(str.data()); ++ for (int i = prev; i < prev + mblen; ++i) { ++ utf8_to_unicode[i] = ulen; ++ } ++ ++ulen; ++ prev += mblen; ++ str.remove_prefix(mblen); ++ } ++ utf8_to_unicode[prev] = ulen; ++ ++ auto clip = [&](int s) { ++ return std::min(std::max(0, s), utf8_to_unicode.size() - 1); ++ }; + ++ for (auto &piece : *(spt->mutable_pieces())) { ++ piece.set_begin(utf8_to_unicode[clip(piece.begin())]); ++ piece.set_end(utf8_to_unicode[clip(piece.end())]); ++ } ++} ++ ++namespace io { + util::Status LoadModelProto(absl::string_view filename, + ModelProto *model_proto) { + if (filename.empty()) { +diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h +index 8124c59..b7fae6a 100644 +--- a/src/sentencepiece_processor.h ++++ b/src/sentencepiece_processor.h +@@ -157,35 +157,39 @@ class SentencePieceText_SentencePiece; + // This wrapper only allows an immutable access to the proto and + // hides the actual implementation of protobuf. + // See sentencepiece.proto for the details of this class. 
++class ImmutableSentencePieceText_ImmutableSentencePiece { ++ public: ++ ImmutableSentencePieceText_ImmutableSentencePiece(); ++ ~ImmutableSentencePieceText_ImmutableSentencePiece() = default; ++ ++ const std::string &piece() const; ++ const std::string &surface() const; ++ uint32_t id() const; ++ uint32_t begin() const; ++ uint32_t end() const; ++ ++ friend class ImmutableSentencePieceText; ++ ++ private: ++ explicit ImmutableSentencePieceText_ImmutableSentencePiece( ++ const SentencePieceText_SentencePiece &sp); ++ const SentencePieceText_SentencePiece *sp_ = nullptr; ++}; ++ + class ImmutableSentencePieceText { + public: + ImmutableSentencePieceText(); + virtual ~ImmutableSentencePieceText(); + +- class ImmutableSentencePiece { +- public: +- ~ImmutableSentencePiece() = default; +- const std::string &piece() const; +- const std::string &surface() const; +- uint32_t id() const; +- uint32_t begin() const; +- uint32_t end() const; ++ std::vector pieces() const; + +- friend class ImmutableSentencePieceText; +- +- private: +- ImmutableSentencePiece() = default; +- explicit ImmutableSentencePiece(const SentencePieceText_SentencePiece &sp); +- const SentencePieceText_SentencePiece *sp_ = nullptr; +- }; +- +- std::vector pieces() const; + size_t pieces_size() const; +- ImmutableSentencePiece pieces(int index) const; ++ ImmutableSentencePieceText_ImmutableSentencePiece pieces(int index) const; ++ + const std::string &text() const; + float score() const; + +- std::string SerializeAsString() const; ++ util::bytes SerializeAsString() const; + + // Returns the actual mutable proto. + // Do not use this outside of SentencePieceProcessor, as +@@ -214,7 +218,7 @@ class ImmutableNBestSentencePieceText { + size_t nbests_size() const; + ImmutableSentencePieceText nbests(int index) const; + +- std::string SerializeAsString() const; ++ util::bytes SerializeAsString() const; + + // Returns the actual mutable proto. 
+ // Do not use this outside of SentencePieceProcessor, as +@@ -398,7 +402,7 @@ class SentencePieceProcessor { + float alpha, SentencePieceText *spt) const; + + virtual util::Status SampleEncodeAndScore( +- absl::string_view input, int samples, float alpha, bool wor, ++ absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best, NBestSentencePieceText *samples_spt) const; + + // DEPRECATED: Remove this API and use std::vector +@@ -534,11 +538,11 @@ class SentencePieceProcessor { + } + + virtual util::bytes SampleEncodeAndScoreAsSerializedProto( +- absl::string_view input, int samples, float alpha, bool wor, +- bool include_best, int nbest_size) const { ++ absl::string_view input, int num_samples, float alpha, bool wor, ++ bool include_best) const { + DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncodeAndScore, + ImmutableNBestSentencePieceText, input, +- samples, alpha, wor, include_best); ++ num_samples, alpha, wor, include_best); + } + + // TODO(taku): Remove this API and use std::vector +@@ -579,11 +583,11 @@ class SentencePieceProcessor { + } + + virtual ImmutableNBestSentencePieceText SampleEncodeAndScoreAsImmutableProto( +- absl::string_view input, int samples, float alpha, bool wor, +- bool include_best, int nbest_size) const { ++ absl::string_view input, int num_samples, float alpha, bool wor, ++ bool include_best) const { + DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncodeAndScore, + ImmutableNBestSentencePieceText, input, +- samples, alpha, wor, include_best); ++ num_samples, alpha, wor, include_best); + } + + // TODO(taku): Remove this API and use std::vector +@@ -703,6 +707,9 @@ class SentencePieceProcessor { + // std::random_device. + void SetRandomGeneratorSeed(unsigned int seed); + ++// Converts the utf8 byte spans into Unicode char span. ++void ConvertToUnicodeSpans(SentencePieceText *spt); ++ + #ifndef SWIG + // IO related functions to absorb model formats. 
+ namespace io { +diff --git a/src/sentencepiece_processor_test.cc b/src/sentencepiece_processor_test.cc +index ed651f7..ff55aeb 100644 +--- a/src/sentencepiece_processor_test.cc ++++ b/src/sentencepiece_processor_test.cc +@@ -1564,6 +1564,10 @@ TEST(SentencePieceProcessorTest, VocabularyTest) { + + TEST(SentencePieceProcessorTest, ImmutableSentencePieceTextTest) { + ImmutableSentencePieceText spt; ++ EXPECT_TRUE(spt.text().empty()); ++ EXPECT_EQ(spt.score(), 0.0); ++ EXPECT_TRUE(spt.SerializeAsString().empty()); ++ + auto *v = spt.mutable_proto(); + + v->set_text("hello world"); +@@ -1586,52 +1590,123 @@ TEST(SentencePieceProcessorTest, ImmutableSentencePieceTextTest) { + EXPECT_EQ(v->pieces(i).end(), spt.pieces(i).end()); + } + +- int n = 0; +- for (auto &p : spt.pieces()) { +- EXPECT_EQ(v->pieces(n).surface(), p.surface()); +- EXPECT_EQ(v->pieces(n).piece(), p.piece()); +- EXPECT_EQ(v->pieces(n).id(), p.id()); +- EXPECT_EQ(v->pieces(n).begin(), p.begin()); +- EXPECT_EQ(v->pieces(n).end(), p.end()); +- ++n; +- } +- +- EXPECT_EQ(v->text(), spt.text()); +- EXPECT_EQ(v->score(), spt.score()); +- EXPECT_EQ(v->SerializeAsString(), spt.SerializeAsString()); ++ auto check_proto = [&v](const ImmutableSentencePieceText &s) { ++ int n = 0; ++ for (auto &p : s.pieces()) { ++ EXPECT_EQ(v->pieces(n).surface(), p.surface()); ++ EXPECT_EQ(v->pieces(n).piece(), p.piece()); ++ EXPECT_EQ(v->pieces(n).id(), p.id()); ++ EXPECT_EQ(v->pieces(n).begin(), p.begin()); ++ EXPECT_EQ(v->pieces(n).end(), p.end()); ++ ++n; ++ } ++ EXPECT_EQ(v->text(), s.text()); ++ EXPECT_EQ(v->score(), s.score()); ++ EXPECT_EQ(v->SerializeAsString(), s.SerializeAsString()); ++ }; + + // test copy. 
+- auto spt2 = spt; +- EXPECT_EQ(spt2.pieces_size(), spt.pieces_size()); +- for (int i = 0; i < spt.pieces_size(); ++i) { +- EXPECT_EQ(spt2.pieces(i).surface(), spt.pieces(i).surface()); +- EXPECT_EQ(spt2.pieces(i).piece(), spt.pieces(i).piece()); +- EXPECT_EQ(spt2.pieces(i).id(), spt.pieces(i).id()); +- EXPECT_EQ(spt2.pieces(i).begin(), spt.pieces(i).begin()); +- EXPECT_EQ(spt2.pieces(i).end(), spt.pieces(i).end()); +- } ++ const auto spt2 = spt; ++ check_proto(spt2); ++ ++ // test assign. ++ const ImmutableSentencePieceText spt3(spt); ++ check_proto(spt3); ++ ++ // default piece. ++ const ImmutableSentencePieceText_ImmutableSentencePiece piece; ++ EXPECT_TRUE(piece.surface().empty()); ++ EXPECT_TRUE(piece.piece().empty()); ++ EXPECT_EQ(piece.begin(), 0); ++ EXPECT_EQ(piece.end(), 0); ++ EXPECT_EQ(piece.id(), 0); + } + + TEST(SentencePieceProcessorTest, ImmutableNBestSentencePieceTextTest) { + ImmutableNBestSentencePieceText spt; ++ EXPECT_EQ(spt.nbests_size(), 0); ++ EXPECT_TRUE(spt.SerializeAsString().empty()); ++ + auto *v = spt.mutable_proto(); ++ + for (int i = 0; i < 10; ++i) { + auto *p = v->add_nbests(); + p->set_text(absl::StrCat("text_", i)); + p->set_score(2.0 * i); + } + +- EXPECT_EQ(v->nbests_size(), spt.nbests_size()); +- for (int i = 0; i < v->nbests_size(); ++i) { +- EXPECT_EQ(v->nbests(i).text(), spt.nbests(i).text()); +- EXPECT_EQ(v->nbests(i).score(), spt.nbests(i).score()); +- } +- EXPECT_EQ(v->SerializeAsString(), spt.SerializeAsString()); ++ auto check_proto = [&v](const ImmutableNBestSentencePieceText &s) { ++ EXPECT_EQ(v->nbests_size(), s.nbests_size()); ++ for (int i = 0; i < v->nbests_size(); ++i) { ++ EXPECT_EQ(v->nbests(i).text(), s.nbests(i).text()); ++ EXPECT_EQ(v->nbests(i).score(), s.nbests(i).score()); ++ } ++ EXPECT_EQ(v->SerializeAsString(), s.SerializeAsString()); ++ }; ++ ++ check_proto(spt); + + // test copy. 
+- auto spt2 = spt; +- EXPECT_EQ(spt2.nbests_size(), spt.nbests_size()); +- EXPECT_EQ(spt2.SerializeAsString(), spt.SerializeAsString()); ++ const auto spt2 = spt; ++ check_proto(spt2); ++ ++ // test assign. ++ const ImmutableNBestSentencePieceText spt3(spt); ++ check_proto(spt3); ++} ++ ++TEST(SentencePieceProcessorTest, ConvertToUnicodeSpansTest) { ++ auto make_spt = [&](const std::vector &tokens) { ++ SentencePieceText spt; ++ int prev = 0; ++ std::string text; ++ for (const auto &tok : tokens) { ++ auto *piece = spt.add_pieces(); ++ piece->set_surface(tok); ++ piece->set_piece(tok); ++ piece->set_begin(prev); ++ piece->set_end(prev + tok.size()); ++ prev += tok.size(); ++ text += tok; ++ } ++ spt.set_text(text); ++ ConvertToUnicodeSpans(&spt); ++ return spt; ++ }; ++ ++ { ++ const auto spt = make_spt({"hello", "_world", "."}); ++ EXPECT_EQ(spt.pieces_size(), 3); ++ EXPECT_EQ(spt.pieces(0).begin(), 0); ++ EXPECT_EQ(spt.pieces(0).end(), 5); ++ EXPECT_EQ(spt.pieces(1).begin(), 5); ++ EXPECT_EQ(spt.pieces(1).end(), 11); ++ EXPECT_EQ(spt.pieces(2).begin(), 11); ++ EXPECT_EQ(spt.pieces(2).end(), 12); ++ } ++ ++ { ++ const auto spt = make_spt({"これは", "test", "です"}); ++ EXPECT_EQ(spt.pieces_size(), 3); ++ EXPECT_EQ(spt.pieces(0).begin(), 0); ++ EXPECT_EQ(spt.pieces(0).end(), 3); ++ EXPECT_EQ(spt.pieces(1).begin(), 3); ++ EXPECT_EQ(spt.pieces(1).end(), 7); ++ ++ EXPECT_EQ(spt.pieces(2).begin(), 7); ++ EXPECT_EQ(spt.pieces(2).end(), 9); ++ } ++ ++ { ++ const auto spt = make_spt({"いABは", "にほCD", "へと"}); ++ EXPECT_EQ(spt.pieces_size(), 3); ++ EXPECT_EQ(spt.pieces(0).begin(), 0); ++ EXPECT_EQ(spt.pieces(0).end(), 4); ++ EXPECT_EQ(spt.pieces(1).begin(), 4); ++ EXPECT_EQ(spt.pieces(1).end(), 8); ++ EXPECT_EQ(spt.pieces(2).begin(), 8); ++ EXPECT_EQ(spt.pieces(2).end(), 10); ++ } + } + + } // namespace sentencepiece diff --git a/patches/0013-Adds-more-unittests.patch b/patches/0013-Adds-more-unittests.patch new file mode 100644 index 0000000..37cf13e --- /dev/null +++ 
b/patches/0013-Adds-more-unittests.patch @@ -0,0 +1,1146 @@ +From: Taku Kudo +Date: Wed, 3 Aug 2022 02:24:53 +0900 +Subject: Adds more unittests + +Signed-off-by: Kentaro Hayashi +--- + python/src/sentencepiece/__init__.py | 48 +++- + python/src/sentencepiece/sentencepiece.i | 45 +++- + python/src/sentencepiece/sentencepiece_wrap.cxx | 213 +++++++++++++++-- + python/test/sentencepiece_test.py | 301 ++++++++++++++++-------- + src/sentencepiece_processor.cc | 67 +++--- + src/sentencepiece_processor.h | 19 +- + src/sentencepiece_processor_test.cc | 11 +- + 7 files changed, 532 insertions(+), 172 deletions(-) + +diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py +index 69a9825..07acb94 100644 +--- a/python/src/sentencepiece/__init__.py ++++ b/python/src/sentencepiece/__init__.py +@@ -98,6 +98,9 @@ class ImmutableSentencePieceText(object): + def pieces_size(self): + return _sentencepiece.ImmutableSentencePieceText_pieces_size(self) + ++ def pieces(self, index): ++ return _sentencepiece.ImmutableSentencePieceText_pieces(self, index) ++ + def text(self): + return _sentencepiece.ImmutableSentencePieceText_text(self) + +@@ -107,18 +110,24 @@ class ImmutableSentencePieceText(object): + def SerializeAsString(self): + return _sentencepiece.ImmutableSentencePieceText_SerializeAsString(self) + +- def pieces(self, index): +- return _sentencepiece.ImmutableSentencePieceText_pieces(self, index) ++ def _pieces(self, index): ++ return _sentencepiece.ImmutableSentencePieceText__pieces(self, index) ++ ++ def pieces(self, i): ++ return self._pieces(i) + + def __len__(self): + return self.pieces_size() + + def __getitem__(self, i): +- return self.pieces(i) ++ return self._pieces(i) + + def __eq__(self, other): + return self.SerializeAsString() == other.SerializeAsString() + ++ def __hash__(self): ++ return hash(self.SerializeAsString()) ++ + + # Register ImmutableSentencePieceText in _sentencepiece: + 
_sentencepiece.ImmutableSentencePieceText_swigregister(ImmutableSentencePieceText) +@@ -134,21 +143,30 @@ class ImmutableNBestSentencePieceText(object): + def nbests_size(self): + return _sentencepiece.ImmutableNBestSentencePieceText_nbests_size(self) + ++ def nbests(self, index): ++ return _sentencepiece.ImmutableNBestSentencePieceText_nbests(self, index) ++ + def SerializeAsString(self): + return _sentencepiece.ImmutableNBestSentencePieceText_SerializeAsString(self) + +- def nbests(self, index): +- return _sentencepiece.ImmutableNBestSentencePieceText_nbests(self, index) ++ def _nbests(self, index): ++ return _sentencepiece.ImmutableNBestSentencePieceText__nbests(self, index) ++ ++ def __nbests__(self, i): ++ return self._nbests(i) + + def __len__(self): + return self.nbests_size() + + def __getitem__(self, i): +- return self.nbests(i) ++ return self._nbests(i) + + def __eq__(self, other): + return self.SerializeAsString() == other.SerializeAsString() + ++ def __hash__(self): ++ return hash(self.SerializeAsString()) ++ + + # Register ImmutableNBestSentencePieceText in _sentencepiece: + _sentencepiece.ImmutableNBestSentencePieceText_swigregister(ImmutableNBestSentencePieceText) +@@ -272,6 +290,9 @@ class SentencePieceProcessor(object): + def _DecodeIdsAsSerializedProtoBatch(self, ins, num_threads): + return _sentencepiece.SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch(self, ins, num_threads) + ++ def _DecodeIdsAsImmutableProtoBatch(self, ins, num_threads): ++ return _sentencepiece.SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch(self, ins, num_threads) ++ + def _DecodePiecesBatch(self, ins, num_threads): + return _sentencepiece.SentencePieceProcessor__DecodePiecesBatch(self, ins, num_threads) + +@@ -539,6 +560,8 @@ class SentencePieceProcessor(object): + return self._NBestEncodeAsImmutableProto(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) + ++ raise RuntimeError('unknown out_type') ++ + if type(input) is list: + return [_encode(n) 
for n in input] + +@@ -621,10 +644,21 @@ class SentencePieceProcessor(object): + if out_type is int: + return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best, + add_bos, add_eos, reverse, emit_unk_piece) +- else: ++ if out_type is str: + return self._SampleEncodeAndScoreAsPieces(text, num_samples, alpha, wor, include_best, + add_bos, add_eos, reverse, emit_unk_piece) + ++ if out_type == 'serialized_proto' or out_type == 'proto': ++ return self._SampleEncodeAndScoreAsSerializedProto(text, num_samples, alpha, wor, include_best, ++ add_bos, add_eos, reverse, emit_unk_piece) ++ ++ if out_type == 'immutable_proto': ++ return self._SampleEncodeAndScoreAsImmutableProto(text, num_samples, alpha, wor, include_best, ++ add_bos, add_eos, reverse, emit_unk_piece) ++ ++ raise RuntimeError('unknown output type') ++ ++ + if type(input) is list: + return [_encode(n) for n in input] + +diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i +index 1e2e1e0..f3a4f30 100644 +--- a/python/src/sentencepiece/sentencepiece.i ++++ b/python/src/sentencepiece/sentencepiece.i +@@ -2,6 +2,7 @@ + %include exception.i + + %{ ++ + #include + #include + #include +@@ -286,8 +287,10 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + %ignore sentencepiece::SentencePieceProcessor::status; + %ignore sentencepiece::ImmutableSentencePieceText::mutable_proto; + %ignore sentencepiece::ImmutableSentencePieceText::pieces() const; ++%ignore sentencepiece::ImmutableSentencePieceText::ConvertToUnicodeSpans; + %ignore sentencepiece::ImmutableNBestSentencePieceText::mutable_proto; + %ignore sentencepiece::ImmutableNBestSentencePieceText::nbests() const; ++%ignore sentencepiece::ImmutableNBestSentencePieceText::ConvertToUnicodeSpans; + + %ignore sentencepiece::SentencePieceProcessor::Encode; + %ignore sentencepiece::SentencePieceProcessor::SampleEncode; +@@ -481,6 +484,13 @@ inline void InitNumThreads(const std::vector 
&ins, int *num_threads) { + sentencepiece::util::bytes); + } + ++ std::vector ++ _DecodeIdsAsImmutableProtoBatch( ++ const std::vector> &ins, int num_threads) const { ++ DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsImmutableProto, int, ++ sentencepiece::ImmutableSentencePieceText); ++ } ++ + std::vector _DecodePiecesBatch( + const std::vector> &ins, int num_threads) const { + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string); +@@ -852,6 +862,8 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + return self._NBestEncodeAsImmutableProto(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) + ++ raise RuntimeError('unknown out_type') ++ + if type(input) is list: + return [_encode(n) for n in input] + +@@ -934,10 +946,21 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + if out_type is int: + return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best, + add_bos, add_eos, reverse, emit_unk_piece) +- else: ++ if out_type is str: + return self._SampleEncodeAndScoreAsPieces(text, num_samples, alpha, wor, include_best, + add_bos, add_eos, reverse, emit_unk_piece) + ++ if out_type == 'serialized_proto' or out_type == 'proto': ++ return self._SampleEncodeAndScoreAsSerializedProto(text, num_samples, alpha, wor, include_best, ++ add_bos, add_eos, reverse, emit_unk_piece) ++ ++ if out_type == 'immutable_proto': ++ return self._SampleEncodeAndScoreAsImmutableProto(text, num_samples, alpha, wor, include_best, ++ add_bos, add_eos, reverse, emit_unk_piece) ++ ++ raise RuntimeError('unknown output type') ++ ++ + if type(input) is list: + return [_encode(n) for n in input] + +@@ -1187,7 +1210,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + } + + %extend sentencepiece::ImmutableSentencePieceText { +- ImmutableSentencePieceText_ImmutableSentencePiece pieces(int index) const { ++ ImmutableSentencePieceText_ImmutableSentencePiece _pieces(int index) const { + if 
(index < 0 || index >= static_cast($self->pieces_size())) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kOutOfRange, +@@ -1197,19 +1220,25 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + } + + %pythoncode { ++ def pieces(self, i): ++ return self._pieces(i) ++ + def __len__(self): + return self.pieces_size() + + def __getitem__(self, i): +- return self.pieces(i) ++ return self._pieces(i) + + def __eq__(self, other): + return self.SerializeAsString() == other.SerializeAsString() ++ ++ def __hash__(self): ++ return hash(self.SerializeAsString()) + } + } + + %extend sentencepiece::ImmutableNBestSentencePieceText { +- ImmutableSentencePieceText nbests(int index) const { ++ ImmutableSentencePieceText _nbests(int index) const { + if (index < 0 || index >= static_cast($self->nbests_size())) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kOutOfRange, +@@ -1219,14 +1248,20 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + } + + %pythoncode { ++ def __nbests__(self, i): ++ return self._nbests(i) ++ + def __len__(self): + return self.nbests_size() + + def __getitem__(self, i): +- return self.nbests(i) ++ return self._nbests(i) + + def __eq__(self, other): + return self.SerializeAsString() == other.SerializeAsString() ++ ++ def __hash__(self): ++ return hash(self.SerializeAsString()) + } + } + +diff --git a/python/src/sentencepiece/sentencepiece_wrap.cxx b/python/src/sentencepiece/sentencepiece_wrap.cxx +index 9776b0f..22e0708 100644 +--- a/python/src/sentencepiece/sentencepiece_wrap.cxx ++++ b/python/src/sentencepiece/sentencepiece_wrap.cxx +@@ -2811,6 +2811,7 @@ namespace swig { + } + + ++ + #include + #include + #include +@@ -3132,16 +3133,6 @@ SWIG_From_size_t (size_t value) + } + + +- #define SWIG_From_double PyFloat_FromDouble +- +- +-SWIGINTERNINLINE PyObject * +-SWIG_From_float (float value) +-{ +- return SWIG_From_double (value); +-} +- +- + SWIGINTERN int + 
SWIG_AsVal_double (PyObject *obj, double *val) + { +@@ -3282,7 +3273,17 @@ SWIG_AsVal_int (PyObject * obj, int *val) + return res; + } + +-SWIGINTERN sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece sentencepiece_ImmutableSentencePieceText_pieces(sentencepiece::ImmutableSentencePieceText const *self,int index){ ++ ++ #define SWIG_From_double PyFloat_FromDouble ++ ++ ++SWIGINTERNINLINE PyObject * ++SWIG_From_float (float value) ++{ ++ return SWIG_From_double (value); ++} ++ ++SWIGINTERN sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece sentencepiece_ImmutableSentencePieceText__pieces(sentencepiece::ImmutableSentencePieceText const *self,int index){ + if (index < 0 || index >= static_cast(self->pieces_size())) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kOutOfRange, +@@ -3290,7 +3291,7 @@ SWIGINTERN sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece sent + } + return self->pieces(index); + } +-SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_ImmutableNBestSentencePieceText_nbests(sentencepiece::ImmutableNBestSentencePieceText const *self,int index){ ++SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_ImmutableNBestSentencePieceText__nbests(sentencepiece::ImmutableNBestSentencePieceText const *self,int index){ + if (index < 0 || index >= static_cast(self->nbests_size())) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kOutOfRange, +@@ -3590,6 +3591,10 @@ SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodeIdsAsSerialize + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsSerializedProto, int, + sentencepiece::util::bytes); + } ++SWIGINTERN std::vector< sentencepiece::ImmutableSentencePieceText > sentencepiece_SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< int > > const &ins,int num_threads){ ++ 
DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsImmutableProto, int, ++ sentencepiece::ImmutableSentencePieceText); ++ } + SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodePiecesBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< absl::string_view > > const &ins,int num_threads){ + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string); + } +@@ -4070,6 +4075,44 @@ fail: + } + + ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_pieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; ++ int arg2 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ int val2 ; ++ int ecode2 = 0 ; ++ PyObject *swig_obj[2] ; ++ sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "ImmutableSentencePieceText_pieces", 2, 2, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_pieces" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); ++ ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); ++ if (!SWIG_IsOK(ecode2)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableSentencePieceText_pieces" "', argument " "2"" of type '" "int""'"); ++ } ++ arg2 = static_cast< int >(val2); ++ { ++ try { ++ result = ((sentencepiece::ImmutableSentencePieceText const *)arg1)->pieces(arg2); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_NewPointerObj((new 
sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece(static_cast< const sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, SWIG_POINTER_OWN | 0 ); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ + SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_text(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; +@@ -4168,7 +4211,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_pieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText__pieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; + int arg2 ; +@@ -4179,20 +4222,20 @@ SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_pieces(PyObject *SWIGUNUSE + PyObject *swig_obj[2] ; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece result; + +- if (!SWIG_Python_UnpackTuple(args, "ImmutableSentencePieceText_pieces", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "ImmutableSentencePieceText__pieces", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_pieces" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText__pieces" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); + ecode2 = 
SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { +- SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableSentencePieceText_pieces" "', argument " "2"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableSentencePieceText__pieces" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + { + try { +- result = sentencepiece_ImmutableSentencePieceText_pieces((sentencepiece::ImmutableSentencePieceText const *)arg1,arg2); ++ result = sentencepiece_ImmutableSentencePieceText__pieces((sentencepiece::ImmutableSentencePieceText const *)arg1,arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -4299,6 +4342,44 @@ fail: + } + + ++SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_nbests(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; ++ int arg2 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ int val2 ; ++ int ecode2 = 0 ; ++ PyObject *swig_obj[2] ; ++ sentencepiece::ImmutableSentencePieceText result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "ImmutableNBestSentencePieceText_nbests", 2, 2, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText_nbests" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); ++ ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); ++ if (!SWIG_IsOK(ecode2)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableNBestSentencePieceText_nbests" "', argument " "2"" of type '" "int""'"); ++ } ++ arg2 = static_cast< int >(val2); ++ { ++ 
try { ++ result = ((sentencepiece::ImmutableNBestSentencePieceText const *)arg1)->nbests(arg2); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText(static_cast< const sentencepiece::ImmutableSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0 ); ++ return resultobj; ++fail: ++ return NULL; ++} ++ ++ + SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_SerializeAsString(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; +@@ -4332,7 +4413,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_nbests(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText__nbests(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; + int arg2 ; +@@ -4343,20 +4424,20 @@ SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_nbests(PyObject *SWIG + PyObject *swig_obj[2] ; + sentencepiece::ImmutableSentencePieceText result; + +- if (!SWIG_Python_UnpackTuple(args, "ImmutableNBestSentencePieceText_nbests", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "ImmutableNBestSentencePieceText__nbests", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText_nbests" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); ++ 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText__nbests" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); + ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { +- SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableNBestSentencePieceText_nbests" "', argument " "2"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableNBestSentencePieceText__nbests" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + { + try { +- result = sentencepiece_ImmutableNBestSentencePieceText_nbests((sentencepiece::ImmutableNBestSentencePieceText const *)arg1,arg2); ++ result = sentencepiece_ImmutableNBestSentencePieceText__nbests((sentencepiece::ImmutableNBestSentencePieceText const *)arg1,arg2); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { +@@ -6822,6 +6903,87 @@ fail: + } + + ++SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++ PyObject *resultobj = 0; ++ sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; ++ std::vector< std::vector< int > > *arg2 = 0 ; ++ int arg3 ; ++ void *argp1 = 0 ; ++ int res1 = 0 ; ++ int val3 ; ++ int ecode3 = 0 ; ++ PyObject *swig_obj[3] ; ++ SwigValueWrapper< std::vector< sentencepiece::ImmutableSentencePieceText > > result; ++ ++ if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch", 3, 3, swig_obj)) SWIG_fail; ++ res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); ++ if (!SWIG_IsOK(res1)) { ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch" "', argument " "1"" of type 
'" "sentencepiece::SentencePieceProcessor const *""'"); ++ } ++ arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); ++ { ++ std::vector> *out = nullptr; ++ if (PyList_Check(swig_obj[1])) { ++ const size_t size = PyList_Size(swig_obj[1]); ++ out = new std::vector>(size); ++ for (size_t i = 0; i < size; ++i) { ++ PyObject *o = PyList_GetItem(swig_obj[1], i); ++ if (PyList_Check(o)) { ++ const size_t size2 = PyList_Size(o); ++ (*out)[i].resize(size2); ++ for (size_t j = 0; j < size2; ++j) { ++ PyObject *o2 = PyList_GetItem(o, j); ++ if (PyInt_Check(o2)) { ++ (*out)[i][j] = static_cast(PyInt_AsLong(o2)); ++ } else { ++ PyErr_SetString(PyExc_TypeError, "list must contain strings"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError, "not a list"); ++ SWIG_fail; ++ } ++ } ++ } else { ++ PyErr_SetString(PyExc_TypeError,"not a list"); ++ SWIG_fail; ++ } ++ arg2 = out; ++ } ++ ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); ++ if (!SWIG_IsOK(ecode3)) { ++ SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch" "', argument " "3"" of type '" "int""'"); ++ } ++ arg3 = static_cast< int >(val3); ++ { ++ try { ++ result = sentencepiece_SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< int > > const &)*arg2,arg3); ++ ReleaseResultObject(resultobj); ++ } ++ catch (const sentencepiece::util::Status &status) { ++ SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); ++ } ++ } ++ { ++ resultobj = PyList_New((&result)->size()); ++ for (size_t i = 0; i < (&result)->size(); ++i) { ++ PyObject *obj = SWIG_NewPointerObj(new sentencepiece::ImmutableSentencePieceText((&result)->at(i)), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0); ++ PyList_SET_ITEM(resultobj, i, obj); ++ } ++ } ++ { ++ delete arg2; ++ } ++ return resultobj; ++fail: ++ { ++ delete arg2; ++ } ++ 
return NULL; ++} ++ ++ + SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; +@@ -8298,17 +8460,19 @@ static PyMethodDef SwigMethods[] = { + { "new_ImmutableSentencePieceText", _wrap_new_ImmutableSentencePieceText, METH_NOARGS, NULL}, + { "delete_ImmutableSentencePieceText", _wrap_delete_ImmutableSentencePieceText, METH_O, NULL}, + { "ImmutableSentencePieceText_pieces_size", _wrap_ImmutableSentencePieceText_pieces_size, METH_O, NULL}, ++ { "ImmutableSentencePieceText_pieces", _wrap_ImmutableSentencePieceText_pieces, METH_VARARGS, NULL}, + { "ImmutableSentencePieceText_text", _wrap_ImmutableSentencePieceText_text, METH_O, NULL}, + { "ImmutableSentencePieceText_score", _wrap_ImmutableSentencePieceText_score, METH_O, NULL}, + { "ImmutableSentencePieceText_SerializeAsString", _wrap_ImmutableSentencePieceText_SerializeAsString, METH_O, NULL}, +- { "ImmutableSentencePieceText_pieces", _wrap_ImmutableSentencePieceText_pieces, METH_VARARGS, NULL}, ++ { "ImmutableSentencePieceText__pieces", _wrap_ImmutableSentencePieceText__pieces, METH_VARARGS, NULL}, + { "ImmutableSentencePieceText_swigregister", ImmutableSentencePieceText_swigregister, METH_O, NULL}, + { "ImmutableSentencePieceText_swiginit", ImmutableSentencePieceText_swiginit, METH_VARARGS, NULL}, + { "new_ImmutableNBestSentencePieceText", _wrap_new_ImmutableNBestSentencePieceText, METH_NOARGS, NULL}, + { "delete_ImmutableNBestSentencePieceText", _wrap_delete_ImmutableNBestSentencePieceText, METH_O, NULL}, + { "ImmutableNBestSentencePieceText_nbests_size", _wrap_ImmutableNBestSentencePieceText_nbests_size, METH_O, NULL}, +- { "ImmutableNBestSentencePieceText_SerializeAsString", _wrap_ImmutableNBestSentencePieceText_SerializeAsString, METH_O, NULL}, + { "ImmutableNBestSentencePieceText_nbests", 
_wrap_ImmutableNBestSentencePieceText_nbests, METH_VARARGS, NULL}, ++ { "ImmutableNBestSentencePieceText_SerializeAsString", _wrap_ImmutableNBestSentencePieceText_SerializeAsString, METH_O, NULL}, ++ { "ImmutableNBestSentencePieceText__nbests", _wrap_ImmutableNBestSentencePieceText__nbests, METH_VARARGS, NULL}, + { "ImmutableNBestSentencePieceText_swigregister", ImmutableNBestSentencePieceText_swigregister, METH_O, NULL}, + { "ImmutableNBestSentencePieceText_swiginit", ImmutableNBestSentencePieceText_swiginit, METH_VARARGS, NULL}, + { "new_SentencePieceProcessor", _wrap_new_SentencePieceProcessor, METH_NOARGS, NULL}, +@@ -8350,6 +8514,7 @@ static PyMethodDef SwigMethods[] = { + { "SentencePieceProcessor__DecodePiecesAsImmutableProto", _wrap_SentencePieceProcessor__DecodePiecesAsImmutableProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodeIdsBatch", _wrap_SentencePieceProcessor__DecodeIdsBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch", _wrap_SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch, METH_VARARGS, NULL}, ++ { "SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch", _wrap_SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePiecesBatch", _wrap_SentencePieceProcessor__DecodePiecesBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch", _wrap_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch", _wrap_SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch, METH_VARARGS, NULL}, +diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py +index 2f2c84a..5e4af7f 100755 +--- a/python/test/sentencepiece_test.py ++++ b/python/test/sentencepiece_test.py +@@ -266,6 +266,13 @@ class TestSentencepieceProcessor(unittest.TestCase): + t4 = self.sp_.decode_pieces_as_serialized_proto(['foo', 'bar']) + t5 = 
self.sp_.decode_ids_as_serialized_proto([20, 30]) + ++ y1 = self.sp_.encode(text, out_type='serialized_proto') ++ y2 = self.sp_.encode( ++ text, enable_sampling=True, out_type='serialized_proto') ++ y3 = self.sp_.nbest_encode(text, out_type='serialized_proto', nbest_size=10) ++ y4 = self.sp_.decode(['foo', 'bar'], out_type='serialized_proto') ++ y5 = self.sp_.decode([20, 30], out_type='serialized_proto') ++ + self.assertEqual(type(s1), bytes) + self.assertEqual(type(s2), bytes) + self.assertEqual(type(t2), bytes) +@@ -277,6 +284,92 @@ class TestSentencepieceProcessor(unittest.TestCase): + self.assertEqual(s3, t3) + self.assertEqual(s4, t4) + self.assertEqual(s5, t5) ++ self.assertEqual(s1, y1) ++ self.assertEqual(s3, y3) ++ self.assertEqual(s4, y4) ++ self.assertEqual(s5, y5) ++ ++ ids = self.jasp_.EncodeAsIds(text) ++ pieces = self.jasp_.EncodeAsPieces(text) ++ s1 = self.jasp_.EncodeAsSerializedProto(text) ++ s2 = self.jasp_.DecodeIdsAsSerializedProto(ids) ++ s3 = self.jasp_.DecodePiecesAsSerializedProto(ids) ++ self.assertEqual(s2, s1) ++ self.assertEqual(s3, s1) ++ ++ def test_immutable_proto(self): ++ text = 'I saw a girl with a telescope.' 
++ s1 = self.sp_.EncodeAsImmutableProto(text) ++ s2 = self.sp_.SampleEncodeAsImmutableProto(text, 10, 0.2) ++ s3 = self.sp_.NBestEncodeAsImmutableProto(text, 10) ++ s4 = self.sp_.DecodePiecesAsImmutableProto(['foo', 'bar']) ++ s5 = self.sp_.DecodeIdsAsImmutableProto([20, 30]) ++ ++ t1 = self.sp_.encode_as_immutable_proto(text) ++ t2 = self.sp_.sample_encode_as_immutable_proto(text, 10, 0.2) ++ t3 = self.sp_.nbest_encode_as_immutable_proto(text, 10) ++ t4 = self.sp_.decode_pieces_as_immutable_proto(['foo', 'bar']) ++ t5 = self.sp_.decode_ids_as_immutable_proto([20, 30]) ++ ++ y1 = self.sp_.encode(text, out_type='immutable_proto') ++ y2 = self.sp_.encode(text, enable_sampling=True, out_type='immutable_proto') ++ y3 = self.sp_.nbest_encode(text, out_type='immutable_proto', nbest_size=10) ++ y4 = self.sp_.decode(['foo', 'bar'], out_type='immutable_proto') ++ y5 = self.sp_.decode([20, 30], out_type='immutable_proto') ++ ++ self.assertEqual(s1, t1) ++ self.assertEqual(s3, t3) ++ self.assertEqual(s4, t4) ++ self.assertEqual(s5, t5) ++ self.assertEqual(s1, y1) ++ self.assertEqual(s3, y3) ++ self.assertEqual(s4, y4) ++ self.assertEqual(s5, y5) ++ ++ x1 = self.sp_.encode_as_serialized_proto(text) ++ x2 = self.sp_.sample_encode_as_serialized_proto(text, 10, 0.2) ++ x3 = self.sp_.nbest_encode_as_serialized_proto(text, 10) ++ x4 = self.sp_.decode_pieces_as_serialized_proto(['foo', 'bar']) ++ x5 = self.sp_.decode_ids_as_serialized_proto([20, 30]) ++ ++ self.assertEqual(x1, t1.SerializeAsString()) ++ self.assertEqual(x3, t3.SerializeAsString()) ++ self.assertEqual(x4, t4.SerializeAsString()) ++ self.assertEqual(x5, t5.SerializeAsString()) ++ ++ v1 = self.sp_.EncodeAsIds(text) ++ v2 = self.sp_.EncodeAsPieces(text) ++ self.assertEqual([x.id() for x in s1], v1) ++ self.assertEqual([x.piece() for x in s1], v2) ++ self.assertEqual(text, s1.text()) ++ ++ surfaces1 = [s1.text()[x.begin():x.end()] for x in s1] ++ surfaces2 = [x.surface() for x in s1] ++ self.assertEqual(surfaces1, 
surfaces2) ++ ++ ids = [] ++ for i in range(s1.pieces_size()): ++ ids.append(s1.pieces(i).id()) ++ self.assertEqual(ids, v1) ++ ++ pieces = [] ++ for i in range(s1.pieces_size()): ++ pieces.append(s1.pieces(i).piece()) ++ self.assertEqual(pieces, v2) ++ ++ # Japanese offset ++ s1 = self.jasp_.EncodeAsImmutableProto('吾輩は猫である。Hello world. ABC 123') ++ surfaces1 = [s1.text()[x.begin():x.end()] for x in s1] ++ surfaces2 = [x.surface() for x in s1] ++ self.assertEqual(surfaces1, surfaces2) ++ ++ ids = [x.id() for x in s1] ++ s2 = self.jasp_.DecodeIdsAsImmutableProto(ids) ++ self.assertEqual(s2, s1) ++ ++ pieces = [x.piece() for x in s1] ++ s2 = self.jasp_.DecodePiecesAsImmutableProto(pieces) ++ self.assertEqual(s2, s1) + + def test_new_api(self): + sp = spm.SentencePieceProcessor( +@@ -386,49 +479,102 @@ class TestSentencepieceProcessor(unittest.TestCase): + self.assertEqual(pieces, sp.encode(text, add_bos=False, add_eos=True)) + + def test_sampling(self): +- sp = spm.SentencePieceProcessor( +- model_file=os.path.join('test', 'test_model.model'), +- out_type=str, +- enable_sampling=True) +- ids = defaultdict(int) +- for n in range(100): +- ++ids[' '.join(sp.encode('hello world'))] +- self.assertGreater(len(ids), 1) +- +- ids2 = defaultdict(int) +- for n in range(100): +- ++ids2[' '.join(sp.encode('hello world', enable_sampling=False))] +- self.assertEqual(len(ids2), 1) ++ sp = self.sp_ ++ ++ for out_type in [str, int, 'serialized_proto', 'immutable_proto']: ++ ids = defaultdict(int) ++ for n in range(100): ++ out = sp.encode('hello world', out_type=out_type, enable_sampling=True) ++ if type(out) is list: ++ out = tuple(out) ++ ++ids[out] ++ self.assertGreater(len(ids), 1) ++ ++ ids2 = defaultdict(int) ++ for n in range(100): ++ out = sp.encode('hello world', out_type=out_type, enable_sampling=False) ++ if type(out) is list: ++ out = tuple(out) ++ ++ids2[out] ++ self.assertEqual(len(ids2), 1) ++ ++ out = sp.encode(['hello world', 'this is a test'], ++ out_type=out_type, 
++ enable_sampling=True) ++ self.assertEqual(len(out), 2) ++ out = sp.encode(['hello world', 'this is a test'], ++ out_type=out_type, ++ enable_sampling=False) ++ self.assertEqual(len(out), 2) + + def test_nbest(self): +- sp = spm.SentencePieceProcessor( +- model_file=os.path.join('test', 'test_model.model')) ++ sp = self.sp_ + text = 'hello world' +- results = sp.nbest_encode(text, nbest_size=10, out_type=str) +- self.assertEqual(results, sp.NBestEncode(text, nbest_size=10, out_type=str)) +- for n in results: +- self.assertEqual(sp.decode(n), text) +- decoded = sp.decode(results) +- for n in decoded: +- self.assertEqual(n, text) +- results = sp.nbest_encode(text, nbest_size=10, out_type=int) +- self.assertEqual(results, sp.NBestEncode(text, nbest_size=10, out_type=int)) +- for n in results: +- self.assertEqual(sp.decode(n), text) +- decoded = sp.decode(results) +- for n in decoded: +- self.assertEqual(n, text) ++ text2 = 'I have a pen.' ++ ++ for out_type in [str, int, 'serialized_proto', 'immutable_proto']: ++ results = sp.nbest_encode(text, nbest_size=10, out_type=out_type) ++ self.assertEqual(results, ++ sp.NBestEncode(text, nbest_size=10, out_type=out_type)) ++ ++ if out_type in [str, int]: ++ for n in results: ++ self.assertEqual(sp.decode(n), text) ++ ++ for n in sp.decode(results): ++ self.assertEqual(n, text) ++ ++ # batch test ++ results = sp.nbest_encode([text, text2], nbest_size=10, out_type=out_type) ++ self.assertEqual( ++ results, ++ sp.NBestEncode([text, text2], nbest_size=10, out_type=out_type)) ++ self.assertEqual(len(results), 2) ++ ++ if out_type in [str, int]: ++ for n in results[0]: ++ self.assertEqual(sp.decode(n), text) ++ ++ for n in results[1]: ++ self.assertEqual(sp.decode(n), text2) ++ ++ decoded = sp.decode(results[0]) ++ self.assertEqual(len(decoded), 10) ++ for n in decoded: ++ self.assertEqual(n, text) ++ decoded = sp.decode(results[1]) ++ self.assertEqual(len(decoded), 10) ++ for n in decoded: ++ self.assertEqual(n, text2) + + def 
test_sample_and_score(self): +- sp = spm.SentencePieceProcessor( +- model_file=os.path.join('test', 'test_model.model')) ++ sp = self.sp_ + text = 'hello world' +- results = sp.sample_encode_and_score(text, wor=True, out_type=str) +- for n in results: +- self.assertEqual(sp.decode(n[0]), text) +- results = sp.sample_encode_and_score(text, wor=True, out_type=int) +- for n in results: +- self.assertEqual(sp.decode(n[0]), text) ++ text2 = 'I have a pen.' ++ for out_type in [str, int, 'serialized_proto', 'immutable_proto']: ++ results = sp.sample_encode_and_score( ++ text, wor=True, num_samples=10, out_type=out_type) ++ results = sp.SampleEncodeAndScore( ++ text, wor=False, num_samples=10, out_type=out_type) ++ ++ if out_type in [str, int]: ++ for n in results: ++ self.assertEqual(sp.decode(n[0]), text) ++ ++ results = sp.sample_encode_and_score([text, text2], ++ wor=True, ++ num_samples=10, ++ out_type=out_type) ++ results = sp.SampleEncodeAndScore([text, text2], ++ wor=True, ++ num_samples=10, ++ out_type=out_type) ++ ++ if out_type in [str, int]: ++ for n in results[0]: ++ self.assertEqual(sp.decode(n[0]), text) ++ for n in results[1]: ++ self.assertEqual(sp.decode(n[0]), text2) + + def test_valid_range(self): + size = self.sp_.piece_size() +@@ -452,65 +598,28 @@ class TestSentencepieceProcessor(unittest.TestCase): + with open(os.path.join(data_dir, 'botchan.txt'), 'r') as file: + texts = file.readlines() + +- r1 = sp.encode(texts, out_type=str, num_threads=None) +- r2 = sp.encode(texts, out_type=str, num_threads=1) +- r3 = sp.encode(texts, out_type=str, num_threads=-1) +- r4 = sp.encode(texts, out_type=str, num_threads=8) +- r5 = [sp.encode(s, out_type=str) for s in texts] +- self.assertEqual(r1, r2) +- self.assertEqual(r1, r3) +- self.assertEqual(r1, r4) +- self.assertEqual(r1, r5) +- +- d1 = sp.decode(r1, num_threads=None) +- d2 = sp.decode(r2, num_threads=1) +- d3 = sp.decode(r3, num_threads=-1) +- d4 = sp.decode(r4, num_threads=8) +- d5 = [sp.decode(s) for s in 
r5] +- self.assertEqual(d1, d2) +- self.assertEqual(d1, d3) +- self.assertEqual(d1, d4) +- self.assertEqual(d1, d5) +- +- r1 = sp.encode(texts, out_type=int, num_threads=None) +- r2 = sp.encode(texts, out_type=int, num_threads=1) +- r3 = sp.encode(texts, out_type=int, num_threads=-1) +- r4 = sp.encode(texts, out_type=int, num_threads=8) +- r5 = [sp.encode(s, out_type=int) for s in texts] +- self.assertEqual(r1, r2) +- self.assertEqual(r1, r3) +- self.assertEqual(r1, r4) +- self.assertEqual(r1, r5) +- +- d1 = sp.decode(r1, num_threads=None) +- d2 = sp.decode(r2, num_threads=1) +- d3 = sp.decode(r3, num_threads=-1) +- d4 = sp.decode(r4, num_threads=8) +- d5 = [sp.decode(s) for s in r5] +- self.assertEqual(d1, d2) +- self.assertEqual(d1, d3) +- self.assertEqual(d1, d4) +- self.assertEqual(d1, d5) +- +- r1 = sp.encode(texts, out_type='serialized_proto', num_threads=None) +- r2 = sp.encode(texts, out_type='serialized_proto', num_threads=1) +- r3 = sp.encode(texts, out_type='serialized_proto', num_threads=-1) +- r4 = sp.encode(texts, out_type='serialized_proto', num_threads=8) +- r5 = [sp.encode(s, out_type='serialized_proto') for s in texts] +- self.assertEqual(r1, r2) +- self.assertEqual(r1, r3) +- self.assertEqual(r1, r4) +- self.assertEqual(r1, r5) +- +- r1 = sp.encode(texts, out_type='immutable_proto', num_threads=None) +- r2 = sp.encode(texts, out_type='immutable_proto', num_threads=1) +- r3 = sp.encode(texts, out_type='immutable_proto', num_threads=-1) +- r4 = sp.encode(texts, out_type='immutable_proto', num_threads=8) +- r5 = [sp.encode(s, out_type='immutable_proto') for s in texts] +- self.assertEqual(r1, r2) +- self.assertEqual(r1, r3) +- self.assertEqual(r1, r4) +- self.assertEqual(r1, r5) ++ for out_type in [str, int, 'serialized_proto', 'immutable_proto']: ++ r1 = sp.encode(texts, out_type=out_type, num_threads=None) ++ r2 = sp.encode(texts, out_type=out_type, num_threads=1) ++ r3 = sp.encode(texts, out_type=out_type, num_threads=-1) ++ r4 = sp.encode(texts, 
out_type=out_type, num_threads=8) ++ r5 = [sp.encode(s, out_type=out_type) for s in texts] ++ self.assertEqual(r1, r2) ++ self.assertEqual(r1, r3) ++ self.assertEqual(r1, r4) ++ self.assertEqual(r1, r5) ++ ++ if out_type in [str, int]: ++ d1 = sp.decode(r1, num_threads=None) ++ d2 = sp.decode(r2, num_threads=1) ++ d3 = sp.decode(r3, num_threads=-1) ++ d4 = sp.decode(r4, num_threads=8) ++ d5 = [sp.decode(s) for s in r5] ++ ++ self.assertEqual(d1, d2) ++ self.assertEqual(d1, d3) ++ self.assertEqual(d1, d4) ++ self.assertEqual(d1, d5) + + e1 = sp.calculate_entropy(texts, alpha=1.0, num_threads=10) + e2 = sp.CalculateEntropy(texts, alpha=1.0, num_threads=10) +diff --git a/src/sentencepiece_processor.cc b/src/sentencepiece_processor.cc +index 482a45b..2a5c399 100644 +--- a/src/sentencepiece_processor.cc ++++ b/src/sentencepiece_processor.cc +@@ -55,6 +55,34 @@ std::vector ToPieceArray(const std::vector &v) { + return out; + } + ++void ConvertToUnicodeSpansInternal(SentencePieceText *spt) { ++ if (spt == nullptr) return; ++ ++ std::vector utf8_to_unicode(spt->text().size() + 1, 0); ++ absl::string_view str = spt->text(); ++ size_t prev = 0; ++ int ulen = 0; ++ while (!str.empty()) { ++ const size_t mblen = string_util::OneCharLen(str.data()); ++ for (int i = prev; i < prev + mblen; ++i) { ++ utf8_to_unicode[i] = ulen; ++ } ++ ++ulen; ++ prev += mblen; ++ str.remove_prefix(mblen); ++ } ++ utf8_to_unicode[prev] = ulen; ++ ++ auto clip = [&](int s) { ++ return std::min(std::max(0, s), utf8_to_unicode.size() - 1); ++ }; ++ ++ for (auto &piece : *(spt->mutable_pieces())) { ++ piece.set_begin(utf8_to_unicode[clip(piece.begin())]); ++ piece.set_end(utf8_to_unicode[clip(piece.end())]); ++ } ++} ++ + } // namespace + + ImmutableSentencePieceText::ImmutableSentencePieceText() +@@ -132,6 +160,10 @@ SentencePieceText *ImmutableSentencePieceText::mutable_proto() { + return rep_.get(); + } + ++void ImmutableSentencePieceText::ConvertToUnicodeSpans() { ++ 
ConvertToUnicodeSpansInternal(mutable_proto()); ++} ++ + util::bytes ImmutableSentencePieceText::SerializeAsString() const { + return spt_->SerializeAsString(); + } +@@ -164,6 +196,13 @@ NBestSentencePieceText *ImmutableNBestSentencePieceText::mutable_proto() { + return rep_.get(); + } + ++void ImmutableNBestSentencePieceText::ConvertToUnicodeSpans() { ++ if (!mutable_proto()) return; ++ for (auto &spt : *(mutable_proto()->mutable_nbests())) { ++ ConvertToUnicodeSpansInternal(&spt); ++ } ++} ++ + util::bytes ImmutableNBestSentencePieceText::SerializeAsString() const { + return rep_ ? rep_->SerializeAsString() : ""; + } +@@ -1048,34 +1087,6 @@ std::string SentencePieceProcessor::serialized_model_proto() const { + // std::random_device. + void SetRandomGeneratorSeed(unsigned int seed); + +-void ConvertToUnicodeSpans(SentencePieceText *spt) { +- if (spt == nullptr) return; +- +- std::vector utf8_to_unicode(spt->text().size() + 1, 0); +- absl::string_view str = spt->text(); +- size_t prev = 0; +- int ulen = 0; +- while (!str.empty()) { +- const size_t mblen = string_util::OneCharLen(str.data()); +- for (int i = prev; i < prev + mblen; ++i) { +- utf8_to_unicode[i] = ulen; +- } +- ++ulen; +- prev += mblen; +- str.remove_prefix(mblen); +- } +- utf8_to_unicode[prev] = ulen; +- +- auto clip = [&](int s) { +- return std::min(std::max(0, s), utf8_to_unicode.size() - 1); +- }; +- +- for (auto &piece : *(spt->mutable_pieces())) { +- piece.set_begin(utf8_to_unicode[clip(piece.begin())]); +- piece.set_end(utf8_to_unicode[clip(piece.end())]); +- } +-} +- + namespace io { + util::Status LoadModelProto(absl::string_view filename, + ModelProto *model_proto) { +diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h +index b7fae6a..d107a2a 100644 +--- a/src/sentencepiece_processor.h ++++ b/src/sentencepiece_processor.h +@@ -25,8 +25,8 @@ + #ifndef SWIG + namespace absl { + using std::string_view; +-} +-#endif // SWIG ++} // namespace absl ++#endif + + namespace 
sentencepiece { + namespace util { +@@ -196,6 +196,9 @@ class ImmutableSentencePieceText { + // it returns the raw pointer managed by the shared_ptr. + SentencePieceText *mutable_proto(); + ++ // Converts the utf8 byte spans into Unicode char span. ++ void ConvertToUnicodeSpans(); ++ + friend class ImmutableNBestSentencePieceText; + + private: +@@ -225,6 +228,8 @@ class ImmutableNBestSentencePieceText { + // it returns the raw pointer managed by the shared_ptr. + NBestSentencePieceText *mutable_proto(); + ++ void ConvertToUnicodeSpans(); ++ + private: + std::shared_ptr rep_; + }; +@@ -415,14 +420,16 @@ class SentencePieceProcessor { + virtual util::Status Decode(const std::vector &ids, + SentencePieceText *spt) const; + +-#ifdef SWIG ++#ifdef SWIGPYTHON ++#define CONVERT_TO_UNICODE_SPAN output.ConvertToUnicodeSpans(); + #define SPP_SWIG_CHECK_AND_THROW \ + if (!status.ok()) throw status; + #else ++#define CONVERT_TO_UNICODE_SPAN + #define SPP_SWIG_CHECK_AND_THROW \ + if (!status.ok()) { \ + } +-#endif // SWIG ++#endif // SWIGPYTHON + + #define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \ + OutType output; \ +@@ -439,6 +446,7 @@ class SentencePieceProcessor { + #define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...) \ + OutType output; \ + const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ ++ CONVERT_TO_UNICODE_SPAN; \ + SPP_SWIG_CHECK_AND_THROW; \ + return output; + +@@ -707,9 +715,6 @@ class SentencePieceProcessor { + // std::random_device. + void SetRandomGeneratorSeed(unsigned int seed); + +-// Converts the utf8 byte spans into Unicode char span. +-void ConvertToUnicodeSpans(SentencePieceText *spt); +- + #ifndef SWIG + // IO related functions to absorb model formats. 
+ namespace io { +diff --git a/src/sentencepiece_processor_test.cc b/src/sentencepiece_processor_test.cc +index ff55aeb..f05dc5d 100644 +--- a/src/sentencepiece_processor_test.cc ++++ b/src/sentencepiece_processor_test.cc +@@ -1657,11 +1657,12 @@ TEST(SentencePieceProcessorTest, ImmutableNBestSentencePieceTextTest) { + + TEST(SentencePieceProcessorTest, ConvertToUnicodeSpansTest) { + auto make_spt = [&](const std::vector &tokens) { +- SentencePieceText spt; ++ ImmutableSentencePieceText ispt; ++ auto *spt = ispt.mutable_proto(); + int prev = 0; + std::string text; + for (const auto &tok : tokens) { +- auto *piece = spt.add_pieces(); ++ auto *piece = spt->add_pieces(); + piece->set_surface(tok); + piece->set_piece(tok); + piece->set_begin(prev); +@@ -1669,9 +1670,9 @@ TEST(SentencePieceProcessorTest, ConvertToUnicodeSpansTest) { + prev += tok.size(); + text += tok; + } +- spt.set_text(text); +- ConvertToUnicodeSpans(&spt); +- return spt; ++ spt->set_text(text); ++ ispt.ConvertToUnicodeSpans(); ++ return ispt; + }; + + { diff --git a/patches/0014-Adds-SWIGPYTHON-flag.patch b/patches/0014-Adds-SWIGPYTHON-flag.patch new file mode 100644 index 0000000..db1001a --- /dev/null +++ b/patches/0014-Adds-SWIGPYTHON-flag.patch @@ -0,0 +1,44 @@ +From: Taku Kudo +Date: Wed, 3 Aug 2022 12:45:31 +0900 +Subject: Adds SWIGPYTHON flag + +Signed-off-by: Kentaro Hayashi +--- + python/setup.py | 3 ++- + python/src/sentencepiece/__init__.py | 2 +- + 2 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/python/setup.py b/python/setup.py +index fdf9394..3438ddd 100755 +--- a/python/setup.py ++++ b/python/setup.py +@@ -96,6 +96,7 @@ class build_ext(_build_ext): + else: + cflags.append('-Wl,-strip-all') + libs.append('-Wl,-strip-all') ++ cflags.append('-DSWIGPYTHON') + print('## cflags={}'.format(' '.join(cflags))) + print('## libs={}'.format(' '.join(libs))) + ext.extra_compile_args = cflags +@@ -115,7 +116,7 @@ if os.name == 'nt': + 
'..\\build\\root_{}\\lib\\sentencepiece_train.lib'.format(arch) + ] + else: +- cflags = ['/std:c++17', '/MT', '/I..\\build\\root\\include'] ++ cflags = ['/std:c++17', '/MT', '/I..\\build\\root\\include', '/DSWIGPYTHON'] + libs = [ + '..\\build\\root\\lib\\sentencepiece.lib', + '..\\build\\root\\lib\\sentencepiece_train.lib' +diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py +index 07acb94..2a91022 100644 +--- a/python/src/sentencepiece/__init__.py ++++ b/python/src/sentencepiece/__init__.py +@@ -126,7 +126,7 @@ class ImmutableSentencePieceText(object): + return self.SerializeAsString() == other.SerializeAsString() + + def __hash__(self): +- return hash(self.SerializeAsString()) ++ return hash(self.SerializeAsString()) + + + # Register ImmutableSentencePieceText in _sentencepiece: diff --git a/patches/0015-remove-unused-ifdef-SWIG-macro.patch b/patches/0015-remove-unused-ifdef-SWIG-macro.patch new file mode 100644 index 0000000..3f9fe91 --- /dev/null +++ b/patches/0015-remove-unused-ifdef-SWIG-macro.patch @@ -0,0 +1,137 @@ +From: Taku Kudo +Date: Wed, 3 Aug 2022 15:45:09 +0900 +Subject: remove unused ifdef SWIG macro + +Signed-off-by: Kentaro Hayashi +--- + python/src/sentencepiece/sentencepiece.i | 5 ++++ + src/sentencepiece_processor.h | 42 ++++++++++++++++++-------------- + 2 files changed, 29 insertions(+), 18 deletions(-) + +diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i +index f3a4f30..75f62c8 100644 +--- a/python/src/sentencepiece/sentencepiece.i ++++ b/python/src/sentencepiece/sentencepiece.i +@@ -326,6 +326,8 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + %ignore sentencepiece::SentencePieceProcessor::model_proto; + %ignore sentencepiece::SentencePieceProcessor::Load; + %ignore sentencepiece::SentencePieceProcessor::LoadOrDie; ++%ignore sentencepiece::SentencePieceProcessor::SetModel; ++%ignore 
sentencepiece::SentencePieceProcessor::SetNormalizer; + %ignore sentencepiece::pretokenizer::PretokenizerForTrainingInterface; + %ignore sentencepiece::SentenceIterator; + %ignore sentencepiece::ConvertToUnicodeSpans; +@@ -339,6 +341,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + %ignore sentencepiece::SentencePieceTrainer::SetPretokenizerForTraining; + %ignore sentencepiece::SentencePieceTrainer::GetPretokenizerForTraining; + ++%ignore sentencepiece::io::LoadModelProto; ++%ignore sentencepiece::io::SaveModelProto; ++ + %extend sentencepiece::SentencePieceProcessor { + sentencepiece::util::Status LoadFromFile(absl::string_view arg) { + return $self->Load(arg); +diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h +index d107a2a..be9449e 100644 +--- a/src/sentencepiece_processor.h ++++ b/src/sentencepiece_processor.h +@@ -26,7 +26,7 @@ + namespace absl { + using std::string_view; + } // namespace absl +-#endif ++#endif // SWIG + + namespace sentencepiece { + namespace util { +@@ -420,36 +420,46 @@ class SentencePieceProcessor { + virtual util::Status Decode(const std::vector &ids, + SentencePieceText *spt) const; + +-#ifdef SWIGPYTHON +-#define CONVERT_TO_UNICODE_SPAN output.ConvertToUnicodeSpans(); +-#define SPP_SWIG_CHECK_AND_THROW \ +- if (!status.ok()) throw status; ++#ifndef SWIGPYTHON ++ ++#define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \ ++ OutType output; \ ++ const auto status = FuncName(__VA_ARGS__, &output); \ ++ return output; ++ ++#define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...) \ ++ OutType output; \ ++ const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ ++ return output.SerializeAsString(); ++ ++#define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...) 
\ ++ OutType output; \ ++ const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ ++ return output; ++ + #else +-#define CONVERT_TO_UNICODE_SPAN +-#define SPP_SWIG_CHECK_AND_THROW \ +- if (!status.ok()) { \ +- } +-#endif // SWIGPYTHON + + #define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \ + OutType output; \ + const auto status = FuncName(__VA_ARGS__, &output); \ +- SPP_SWIG_CHECK_AND_THROW; \ ++ if (!status.ok()) throw status; \ + return output; + + #define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...) \ + OutType output; \ + const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ +- SPP_SWIG_CHECK_AND_THROW; \ ++ if (!status.ok()) throw status; \ + return output.SerializeAsString(); + + #define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...) \ + OutType output; \ + const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ +- CONVERT_TO_UNICODE_SPAN; \ +- SPP_SWIG_CHECK_AND_THROW; \ ++ if (!status.ok()) throw status; \ ++ output.ConvertToUnicodeSpans(); \ + return output; + ++#endif // SWIGPYTHON ++ + ////////////////////////////////////////////////////////////// + // Handy methods that return the result directly. + // These functions ignore internal errors. +@@ -664,7 +674,6 @@ class SentencePieceProcessor { + // Returns PAD () id. + virtual int pad_id() const; + +-#ifndef SWIG + ////////////////////////////////////////////////////////////// + // Model management. + // +@@ -673,7 +682,6 @@ class SentencePieceProcessor { + + // Allows injection of a normalizer instance. `normalizer` is moved. + void SetNormalizer(std::unique_ptr &&normalizer); +-#endif // SWIG + + // Returns immutable model proto. Useful to obtain extended + // or experimental parameters encoded in model_proto. +@@ -715,7 +723,6 @@ class SentencePieceProcessor { + // std::random_device. + void SetRandomGeneratorSeed(unsigned int seed); + +-#ifndef SWIG + // IO related functions to absorb model formats. 
+ namespace io { + // Loads `model_proto` from `filename`. +@@ -730,6 +737,5 @@ util::Status LoadModelProto(absl::string_view, ModelProto *model_proto); + // Saves `model_proto` as `filename`. + util::Status SaveModelProto(absl::string_view, const ModelProto &model_proto); + } // namespace io +-#endif // SWIG + } // namespace sentencepiece + #endif // SENTENCEPIECE_PROCESSOR_H_ diff --git a/patches/0016-Fixed-test-failure.patch b/patches/0016-Fixed-test-failure.patch new file mode 100644 index 0000000..da01d15 --- /dev/null +++ b/patches/0016-Fixed-test-failure.patch @@ -0,0 +1,281 @@ +From: Taku Kudo +Date: Wed, 3 Aug 2022 17:20:01 +0900 +Subject: Fixed test failure. + +Signed-off-by: Kentaro Hayashi +--- + python/src/sentencepiece/sentencepiece.i | 35 +++++++++++++++++++++---- + python/src/sentencepiece/sentencepiece_wrap.cxx | 35 +++++++++++++++++++++---- + src/sentencepiece_processor.cc | 4 +-- + src/sentencepiece_processor.h | 34 +++++++----------------- + 4 files changed, 72 insertions(+), 36 deletions(-) + +diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i +index 75f62c8..1a94fef 100644 +--- a/python/src/sentencepiece/sentencepiece.i ++++ b/python/src/sentencepiece/sentencepiece.i +@@ -193,6 +193,19 @@ inline void CheckIds(const std::vector &ids, int num_pieces) { + + inline void CheckIds(const std::vector &ids, int num_pieces) {} + ++template ++inline void ConvertToUnicodeSpans(T *proto) {} ++ ++template <> ++inline void ConvertToUnicodeSpans(sentencepiece::ImmutableSentencePieceText *proto) { ++ proto->ConvertToUnicodeSpans(); ++} ++ ++template <> ++inline void ConvertToUnicodeSpans(sentencepiece::ImmutableNBestSentencePieceText *proto) { ++ proto->ConvertToUnicodeSpans(); ++} ++ + class ThreadPool { + public: + explicit ThreadPool(size_t request_size) : +@@ -239,6 +252,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + self->FuncName(ins[i]); \ + RewriteIds(*self, &out, add_bos, 
add_eos, reverse, \ + emit_unk_piece); \ ++ ConvertToUnicodeSpans(&out); \ + outs[i] = std::move(out); \ + } \ + }); \ +@@ -255,7 +269,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + pool.Schedule([&, n]() { \ + for (size_t i = n; i < ins.size(); i += num_threads) { \ + CheckIds(ins[i], self->GetPieceSize()); \ +- outs[i] = self->FuncName(ins[i]); \ ++ auto out = self->FuncName(ins[i]); \ ++ ConvertToUnicodeSpans(&out); \ ++ outs[i] = std::move(out); \ + } \ + }); \ + } \ +@@ -396,6 +412,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + auto proto = enable_sampling ? + $self->SampleEncodeAsImmutableProto(text, nbest_size, alpha) : + $self->EncodeAsImmutableProto(text); ++ proto.ConvertToUnicodeSpans(); + RewriteIds(*$self, &proto, add_bos, add_eos, reverse, emit_unk_piece); + return proto; + } +@@ -467,13 +484,17 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + sentencepiece::ImmutableSentencePieceText _DecodeIdsAsImmutableProto( + const std::vector &ids) const { + CheckIds(ids, $self->GetPieceSize()); +- return $self->DecodeIdsAsImmutableProto(ids); ++ auto proto = $self->DecodeIdsAsImmutableProto(ids); ++ proto.ConvertToUnicodeSpans(); ++ return proto; + } + + sentencepiece::ImmutableSentencePieceText _DecodePiecesAsImmutableProto( + const std::vector &pieces) const { + CheckIds(pieces, $self->GetPieceSize()); +- return $self->DecodePiecesAsImmutableProto(pieces); ++ auto proto= $self->DecodePiecesAsImmutableProto(pieces); ++ proto.ConvertToUnicodeSpans(); ++ return proto; + } + + ///////////////////////////////////////////////////////////////////////////// +@@ -557,7 +578,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + bool emit_unk_piece) const { + RewriteIds(*$self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); +- return $self->NBestEncodeAsImmutableProto(text, nbest_size); ++ auto proto = 
$self->NBestEncodeAsImmutableProto(text, nbest_size); ++ proto.ConvertToUnicodeSpans(); ++ return proto; + } + + +@@ -611,8 +634,10 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + bool emit_unk_piece) const { + RewriteIds(*$self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); +- return $self->SampleEncodeAndScoreAsImmutableProto(text, num_samples, ++ auto proto = $self->SampleEncodeAndScoreAsImmutableProto(text, num_samples, + alpha, wor, include_best); ++ proto.ConvertToUnicodeSpans(); ++ return proto; + } + + +diff --git a/python/src/sentencepiece/sentencepiece_wrap.cxx b/python/src/sentencepiece/sentencepiece_wrap.cxx +index 22e0708..4b8b5ef 100644 +--- a/python/src/sentencepiece/sentencepiece_wrap.cxx ++++ b/python/src/sentencepiece/sentencepiece_wrap.cxx +@@ -3002,6 +3002,19 @@ inline void CheckIds(const std::vector &ids, int num_pieces) { + + inline void CheckIds(const std::vector &ids, int num_pieces) {} + ++template ++inline void ConvertToUnicodeSpans(T *proto) {} ++ ++template <> ++inline void ConvertToUnicodeSpans(sentencepiece::ImmutableSentencePieceText *proto) { ++ proto->ConvertToUnicodeSpans(); ++} ++ ++template <> ++inline void ConvertToUnicodeSpans(sentencepiece::ImmutableNBestSentencePieceText *proto) { ++ proto->ConvertToUnicodeSpans(); ++} ++ + class ThreadPool { + public: + explicit ThreadPool(size_t request_size) : +@@ -3048,6 +3061,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + self->FuncName(ins[i]); \ + RewriteIds(*self, &out, add_bos, add_eos, reverse, \ + emit_unk_piece); \ ++ ConvertToUnicodeSpans(&out); \ + outs[i] = std::move(out); \ + } \ + }); \ +@@ -3064,7 +3078,9 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + pool.Schedule([&, n]() { \ + for (size_t i = n; i < ins.size(); i += num_threads) { \ + CheckIds(ins[i], self->GetPieceSize()); \ +- outs[i] = self->FuncName(ins[i]); \ ++ auto out = self->FuncName(ins[i]); \ ++ 
ConvertToUnicodeSpans(&out); \ ++ outs[i] = std::move(out); \ + } \ + }); \ + } \ +@@ -3540,6 +3556,7 @@ SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_SentencePiece + auto proto = enable_sampling ? + self->SampleEncodeAsImmutableProto(text, nbest_size, alpha) : + self->EncodeAsImmutableProto(text); ++ proto.ConvertToUnicodeSpans(); + RewriteIds(*self, &proto, add_bos, add_eos, reverse, emit_unk_piece); + return proto; + } +@@ -3578,11 +3595,15 @@ SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__Deco + } + SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_SentencePieceProcessor__DecodeIdsAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ + CheckIds(ids, self->GetPieceSize()); +- return self->DecodeIdsAsImmutableProto(ids); ++ auto proto = self->DecodeIdsAsImmutableProto(ids); ++ proto.ConvertToUnicodeSpans(); ++ return proto; + } + SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_SentencePieceProcessor__DecodePiecesAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &pieces){ + CheckIds(pieces, self->GetPieceSize()); +- return self->DecodePiecesAsImmutableProto(pieces); ++ auto proto= self->DecodePiecesAsImmutableProto(pieces); ++ proto.ConvertToUnicodeSpans(); ++ return proto; + } + SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodeIdsBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< int > > const &ins,int num_threads){ + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string); +@@ -3628,7 +3649,9 @@ SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__NBes + SWIGINTERN sentencepiece::ImmutableNBestSentencePieceText sentencepiece_SentencePieceProcessor__NBestEncodeAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool 
add_eos,bool reverse,bool emit_unk_piece){ + RewriteIds(*self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); +- return self->NBestEncodeAsImmutableProto(text, nbest_size); ++ auto proto = self->NBestEncodeAsImmutableProto(text, nbest_size); ++ proto.ConvertToUnicodeSpans(); ++ return proto; + } + SWIGINTERN std::vector< std::pair< std::vector< int >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto idss = self->SampleEncodeAndScoreAsIds(text, num_samples, +@@ -3655,8 +3678,10 @@ SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__Samp + SWIGINTERN sentencepiece::ImmutableNBestSentencePieceText sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float alpha,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + RewriteIds(*self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); +- return self->SampleEncodeAndScoreAsImmutableProto(text, num_samples, ++ auto proto = self->SampleEncodeAndScoreAsImmutableProto(text, num_samples, + alpha, wor, include_best); ++ proto.ConvertToUnicodeSpans(); ++ return proto; + } + SWIGINTERN float sentencepiece_SentencePieceProcessor__CalculateEntropy(sentencepiece::SentencePieceProcessor *self,absl::string_view text,float alpha){ + return self->CalculateEntropy(text, alpha); +diff --git a/src/sentencepiece_processor.cc b/src/sentencepiece_processor.cc +index 2a5c399..f0df2f6 100644 +--- a/src/sentencepiece_processor.cc ++++ b/src/sentencepiece_processor.cc +@@ -56,14 +56,14 @@ std::vector ToPieceArray(const std::vector &v) { + } + + void ConvertToUnicodeSpansInternal(SentencePieceText *spt) { +- if (spt == 
nullptr) return; ++ if (spt == nullptr || spt->text().empty()) return; + + std::vector utf8_to_unicode(spt->text().size() + 1, 0); + absl::string_view str = spt->text(); + size_t prev = 0; + int ulen = 0; + while (!str.empty()) { +- const size_t mblen = string_util::OneCharLen(str.data()); ++ const size_t mblen = std::max(1, string_util::OneCharLen(str.data())); + for (int i = prev; i < prev + mblen; ++i) { + utf8_to_unicode[i] = ulen; + } +diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h +index be9449e..14b1e8c 100644 +--- a/src/sentencepiece_processor.h ++++ b/src/sentencepiece_processor.h +@@ -419,47 +419,33 @@ class SentencePieceProcessor { + + virtual util::Status Decode(const std::vector &ids, + SentencePieceText *spt) const; +- +-#ifndef SWIGPYTHON +- +-#define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \ +- OutType output; \ +- const auto status = FuncName(__VA_ARGS__, &output); \ +- return output; +- +-#define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...) \ +- OutType output; \ +- const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ +- return output.SerializeAsString(); +- +-#define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...) \ +- OutType output; \ +- const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ +- return output; +- ++#ifdef SWIG ++#define SPP_SWIG_CHECK_AND_THROW \ ++ if (!status.ok()) throw status; + #else ++#define SPP_SWIG_CHECK_AND_THROW \ ++ if (!status.ok()) { \ ++ } ++#endif // SWIG + + #define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \ + OutType output; \ + const auto status = FuncName(__VA_ARGS__, &output); \ +- if (!status.ok()) throw status; \ ++ SPP_SWIG_CHECK_AND_THROW; \ + return output; + + #define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...) 
\ + OutType output; \ + const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ +- if (!status.ok()) throw status; \ ++ SPP_SWIG_CHECK_AND_THROW; \ + return output.SerializeAsString(); + + #define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...) \ + OutType output; \ + const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ +- if (!status.ok()) throw status; \ +- output.ConvertToUnicodeSpans(); \ ++ SPP_SWIG_CHECK_AND_THROW; \ + return output; + +-#endif // SWIGPYTHON +- + ////////////////////////////////////////////////////////////// + // Handy methods that return the result directly. + // These functions ignore internal errors. diff --git a/patches/0017-Uses-property-in-immutable-proto.patch b/patches/0017-Uses-property-in-immutable-proto.patch new file mode 100644 index 0000000..453b597 --- /dev/null +++ b/patches/0017-Uses-property-in-immutable-proto.patch @@ -0,0 +1,878 @@ +From: Taku Kudo +Date: Thu, 4 Aug 2022 16:03:31 +0900 +Subject: Uses property in immutable proto + +Signed-off-by: Kentaro Hayashi +--- + python/setup.py | 3 +- + python/src/sentencepiece/__init__.py | 128 ++++++++++++------ + python/src/sentencepiece/sentencepiece.i | 143 ++++++++++++++------ + python/src/sentencepiece/sentencepiece_wrap.cxx | 168 ++++++------------------ + python/test/sentencepiece_test.py | 68 +++++----- + 5 files changed, 265 insertions(+), 245 deletions(-) + +diff --git a/python/setup.py b/python/setup.py +index 3438ddd..fdf9394 100755 +--- a/python/setup.py ++++ b/python/setup.py +@@ -96,7 +96,6 @@ class build_ext(_build_ext): + else: + cflags.append('-Wl,-strip-all') + libs.append('-Wl,-strip-all') +- cflags.append('-DSWIGPYTHON') + print('## cflags={}'.format(' '.join(cflags))) + print('## libs={}'.format(' '.join(libs))) + ext.extra_compile_args = cflags +@@ -116,7 +115,7 @@ if os.name == 'nt': + '..\\build\\root_{}\\lib\\sentencepiece_train.lib'.format(arch) + ] + else: +- cflags = ['/std:c++17', '/MT', 
'/I..\\build\\root\\include', '/DSWIGPYTHON'] ++ cflags = ['/std:c++17', '/MT', '/I..\\build\\root\\include'] + libs = [ + '..\\build\\root\\lib\\sentencepiece.lib', + '..\\build\\root\\lib\\sentencepiece_train.lib' +diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py +index 2a91022..12dc631 100644 +--- a/python/src/sentencepiece/__init__.py ++++ b/python/src/sentencepiece/__init__.py +@@ -69,20 +69,36 @@ class ImmutableSentencePieceText_ImmutableSentencePiece(object): + _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText_ImmutableSentencePiece()) + __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText_ImmutableSentencePiece + +- def piece(self): +- return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_piece(self) ++ def _piece(self): ++ return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__piece(self) + +- def surface(self): +- return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_surface(self) ++ def _surface(self): ++ return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__surface(self) + +- def id(self): +- return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_id(self) ++ def _id(self): ++ return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__id(self) + +- def begin(self): +- return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_begin(self) ++ def _begin(self): ++ return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__begin(self) ++ ++ def _end(self): ++ return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__end(self) ++ ++ piece = property(_piece) ++ surface = property(_surface) ++ id = property(_id) ++ begin = property(_begin) ++ end = property(_end) ++ ++ def __str__(self): ++ return ('piece: \"{}\"\n' ++ 'id: {}\n' ++ 'surface: \"{}\"\n' ++ 'begin: {}\n' ++ 'end: 
{}\n').format(self.piece, self.id, self.surface, ++ self.begin, self.end) ++ __repr__ = __str__ + +- def end(self): +- return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_end(self) + + # Register ImmutableSentencePieceText_ImmutableSentencePiece in _sentencepiece: + _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swigregister(ImmutableSentencePieceText_ImmutableSentencePiece) +@@ -95,32 +111,45 @@ class ImmutableSentencePieceText(object): + _sentencepiece.ImmutableSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText()) + __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText + +- def pieces_size(self): +- return _sentencepiece.ImmutableSentencePieceText_pieces_size(self) ++ def _pieces_size(self): ++ return _sentencepiece.ImmutableSentencePieceText__pieces_size(self) + +- def pieces(self, index): +- return _sentencepiece.ImmutableSentencePieceText_pieces(self, index) ++ def _pieces(self, index): ++ return _sentencepiece.ImmutableSentencePieceText__pieces(self, index) + +- def text(self): +- return _sentencepiece.ImmutableSentencePieceText_text(self) ++ def _text(self): ++ return _sentencepiece.ImmutableSentencePieceText__text(self) + +- def score(self): +- return _sentencepiece.ImmutableSentencePieceText_score(self) ++ def _score(self): ++ return _sentencepiece.ImmutableSentencePieceText__score(self) + + def SerializeAsString(self): + return _sentencepiece.ImmutableSentencePieceText_SerializeAsString(self) + +- def _pieces(self, index): +- return _sentencepiece.ImmutableSentencePieceText__pieces(self, index) ++ text = property(_text) ++ score = property(_score) + +- def pieces(self, i): +- return self._pieces(i) ++ class ImmutableSentencePieceIterator: ++ def __init__(self, proto): ++ self.proto = proto ++ self.len = self.proto._pieces_size() + +- def __len__(self): +- return self.pieces_size() ++ def __len__(self): ++ return self.len ++ ++ def __getitem__(self, index): ++ if index < 0 or 
index >= self.len: ++ raise IndexError('piece index is out of range') ++ return self.proto._pieces(index) ++ ++ def __str__(self): ++ return '\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self]) ++ ++ __repr__ = __str__ + +- def __getitem__(self, i): +- return self._pieces(i) ++ @property ++ def pieces(self): ++ return ImmutableSentencePieceText.ImmutableSentencePieceIterator(self) + + def __eq__(self, other): + return self.SerializeAsString() == other.SerializeAsString() +@@ -128,6 +157,14 @@ class ImmutableSentencePieceText(object): + def __hash__(self): + return hash(self.SerializeAsString()) + ++ def __str__(self): ++ return ('text: \"{}\"\n' ++ 'score: {}\n' ++ '{}').format(self.text, self.score, ++ '\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self.pieces])) ++ ++ __repr__ = __str__ ++ + + # Register ImmutableSentencePieceText in _sentencepiece: + _sentencepiece.ImmutableSentencePieceText_swigregister(ImmutableSentencePieceText) +@@ -140,26 +177,36 @@ class ImmutableNBestSentencePieceText(object): + _sentencepiece.ImmutableNBestSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableNBestSentencePieceText()) + __swig_destroy__ = _sentencepiece.delete_ImmutableNBestSentencePieceText + +- def nbests_size(self): +- return _sentencepiece.ImmutableNBestSentencePieceText_nbests_size(self) ++ def _nbests_size(self): ++ return _sentencepiece.ImmutableNBestSentencePieceText__nbests_size(self) + +- def nbests(self, index): +- return _sentencepiece.ImmutableNBestSentencePieceText_nbests(self, index) ++ def _nbests(self, index): ++ return _sentencepiece.ImmutableNBestSentencePieceText__nbests(self, index) + + def SerializeAsString(self): + return _sentencepiece.ImmutableNBestSentencePieceText_SerializeAsString(self) + +- def _nbests(self, index): +- return _sentencepiece.ImmutableNBestSentencePieceText__nbests(self, index) ++ class ImmutableSentencePieceTextIterator: ++ def __init__(self, proto): ++ self.proto = proto ++ self.len = 
self.proto._nbests_size() + +- def __nbests__(self, i): +- return self._nbests(i) ++ def __len__(self): ++ return self.len + +- def __len__(self): +- return self.nbests_size() ++ def __getitem__(self, index): ++ if index < 0 or index >= self.len: ++ raise IndexError('nbests index is out of range') ++ return self.proto._nbests(index) ++ ++ def __str__(self): ++ return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self]) ++ ++ __repr__ = __str__ + +- def __getitem__(self, i): +- return self._nbests(i) ++ @property ++ def nbests(self): ++ return ImmutableNBestSentencePieceText.ImmutableSentencePieceTextIterator(self) + + def __eq__(self, other): + return self.SerializeAsString() == other.SerializeAsString() +@@ -167,6 +214,11 @@ class ImmutableNBestSentencePieceText(object): + def __hash__(self): + return hash(self.SerializeAsString()) + ++ def __str__(self): ++ return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self.nbests]) ++ ++ __repr__ = __str__ ++ + + # Register ImmutableNBestSentencePieceText in _sentencepiece: + _sentencepiece.ImmutableNBestSentencePieceText_swigregister(ImmutableNBestSentencePieceText) +diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i +index 1a94fef..8309fc2 100644 +--- a/python/src/sentencepiece/sentencepiece.i ++++ b/python/src/sentencepiece/sentencepiece.i +@@ -1239,60 +1239,117 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + } + } + ++%extend sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece { ++ %rename(_piece) piece; ++ %rename(_id) id; ++ %rename(_surface) surface; ++ %rename(_begin) begin; ++ %rename(_end) end; ++ ++ %pythoncode %{ ++ piece = property(_piece) ++ surface = property(_surface) ++ id = property(_id) ++ begin = property(_begin) ++ end = property(_end) ++ ++ def __str__(self): ++ return ('piece: \"{}\"\n' ++ 'id: {}\n' ++ 'surface: \"{}\"\n' ++ 'begin: {}\n' ++ 'end: {}\n').format(self.piece, self.id, self.surface, ++ 
self.begin, self.end) ++ __repr__ = __str__ ++ %} ++} ++ + %extend sentencepiece::ImmutableSentencePieceText { +- ImmutableSentencePieceText_ImmutableSentencePiece _pieces(int index) const { +- if (index < 0 || index >= static_cast($self->pieces_size())) { +- throw sentencepiece::util::Status( +- sentencepiece::util::StatusCode::kOutOfRange, +- "piece index is out of range."); +- } +- return $self->pieces(index); +- } ++ %rename(_text) text; ++ %rename(_score) score; ++ %rename(_pieces) pieces; ++ %rename(_pieces_size) pieces_size; ++ ++ %pythoncode %{ ++ text = property(_text) ++ score = property(_score) ++ ++ class ImmutableSentencePieceIterator: ++ def __init__(self, proto): ++ self.proto = proto ++ self.len = self.proto._pieces_size() ++ ++ def __len__(self): ++ return self.len ++ ++ def __getitem__(self, index): ++ if index < 0 or index >= self.len: ++ raise IndexError('piece index is out of range') ++ return self.proto._pieces(index) ++ ++ def __str__(self): ++ return '\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self]) ++ ++ __repr__ = __str__ ++ ++ @property ++ def pieces(self): ++ return ImmutableSentencePieceText.ImmutableSentencePieceIterator(self) ++ ++ def __eq__(self, other): ++ return self.SerializeAsString() == other.SerializeAsString() ++ ++ def __hash__(self): ++ return hash(self.SerializeAsString()) ++ ++ def __str__(self): ++ return ('text: \"{}\"\n' ++ 'score: {}\n' ++ '{}').format(self.text, self.score, ++ '\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self.pieces])) ++ ++ __repr__ = __str__ ++ %} ++} + +-%pythoncode { +- def pieces(self, i): +- return self._pieces(i) ++%extend sentencepiece::ImmutableNBestSentencePieceText { ++ %rename(_nbests) nbests; ++ %rename(_nbests_size) nbests_size; + +- def __len__(self): +- return self.pieces_size() ++ %pythoncode %{ ++ class ImmutableSentencePieceTextIterator: ++ def __init__(self, proto): ++ self.proto = proto ++ self.len = self.proto._nbests_size() + +- def __getitem__(self, i): +- 
return self._pieces(i) ++ def __len__(self): ++ return self.len + +- def __eq__(self, other): +- return self.SerializeAsString() == other.SerializeAsString() ++ def __getitem__(self, index): ++ if index < 0 or index >= self.len: ++ raise IndexError('nbests index is out of range') ++ return self.proto._nbests(index) + +- def __hash__(self): +- return hash(self.SerializeAsString()) +-} +-} +- +-%extend sentencepiece::ImmutableNBestSentencePieceText { +- ImmutableSentencePieceText _nbests(int index) const { +- if (index < 0 || index >= static_cast($self->nbests_size())) { +- throw sentencepiece::util::Status( +- sentencepiece::util::StatusCode::kOutOfRange, +- "nbest index is out of range."); +- } +- return $self->nbests(index); +- } ++ def __str__(self): ++ return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self]) + +-%pythoncode { +- def __nbests__(self, i): +- return self._nbests(i) ++ __repr__ = __str__ + +- def __len__(self): +- return self.nbests_size() ++ @property ++ def nbests(self): ++ return ImmutableNBestSentencePieceText.ImmutableSentencePieceTextIterator(self) ++ ++ def __eq__(self, other): ++ return self.SerializeAsString() == other.SerializeAsString() + +- def __getitem__(self, i): +- return self._nbests(i) ++ def __hash__(self): ++ return hash(self.SerializeAsString()) + +- def __eq__(self, other): +- return self.SerializeAsString() == other.SerializeAsString() ++ def __str__(self): ++ return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self.nbests]) + +- def __hash__(self): +- return hash(self.SerializeAsString()) +-} ++ __repr__ = __str__ ++ %} + } + + %typemap(out) std::vector { +diff --git a/python/src/sentencepiece/sentencepiece_wrap.cxx b/python/src/sentencepiece/sentencepiece_wrap.cxx +index 4b8b5ef..0a8df5f 100644 +--- a/python/src/sentencepiece/sentencepiece_wrap.cxx ++++ b/python/src/sentencepiece/sentencepiece_wrap.cxx +@@ -3299,22 +3299,6 @@ SWIG_From_float (float value) + return SWIG_From_double (value); + } + 
+-SWIGINTERN sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece sentencepiece_ImmutableSentencePieceText__pieces(sentencepiece::ImmutableSentencePieceText const *self,int index){ +- if (index < 0 || index >= static_cast(self->pieces_size())) { +- throw sentencepiece::util::Status( +- sentencepiece::util::StatusCode::kOutOfRange, +- "piece index is out of range."); +- } +- return self->pieces(index); +- } +-SWIGINTERN sentencepiece::ImmutableSentencePieceText sentencepiece_ImmutableNBestSentencePieceText__nbests(sentencepiece::ImmutableNBestSentencePieceText const *self,int index){ +- if (index < 0 || index >= static_cast(self->nbests_size())) { +- throw sentencepiece::util::Status( +- sentencepiece::util::StatusCode::kOutOfRange, +- "nbest index is out of range."); +- } +- return self->nbests(index); +- } + + SWIGINTERN swig_type_info* + SWIG_pchar_descriptor(void) +@@ -3846,7 +3830,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_piece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece__piece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; + void *argp1 = 0 ; +@@ -3858,7 +3842,7 @@ SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_pie + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_piece" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" 
"ImmutableSentencePieceText_ImmutableSentencePiece__piece" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); + { +@@ -3880,7 +3864,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_surface(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece__surface(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; + void *argp1 = 0 ; +@@ -3892,7 +3876,7 @@ SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_sur + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_surface" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece__surface" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); + { +@@ -3914,7 +3898,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece__id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + 
sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; + void *argp1 = 0 ; +@@ -3926,7 +3910,7 @@ SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_id( + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_id" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece__id" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); + { +@@ -3945,7 +3929,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece__begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; + void *argp1 = 0 ; +@@ -3957,7 +3941,7 @@ SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_beg + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_begin" "', argument " "1"" of type '" 
"sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece__begin" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); + { +@@ -3976,7 +3960,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece__end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *arg1 = (sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece *) 0 ; + void *argp1 = 0 ; +@@ -3988,7 +3972,7 @@ SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_ImmutableSentencePiece_end + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece_end" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_ImmutableSentencePiece__end" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece * >(argp1); + { +@@ -4069,7 +4053,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_pieces_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText__pieces_size(PyObject 
*SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; + void *argp1 = 0 ; +@@ -4081,7 +4065,7 @@ SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_pieces_size(PyObject *SWIG + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_pieces_size" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText__pieces_size" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); + { +@@ -4100,7 +4084,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_pieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText__pieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; + int arg2 ; +@@ -4111,15 +4095,15 @@ SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_pieces(PyObject *SWIGUNUSE + PyObject *swig_obj[2] ; + sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece result; + +- if (!SWIG_Python_UnpackTuple(args, "ImmutableSentencePieceText_pieces", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "ImmutableSentencePieceText__pieces", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_pieces" "', argument " "1"" of type '" 
"sentencepiece::ImmutableSentencePieceText const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText__pieces" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); + ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { +- SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableSentencePieceText_pieces" "', argument " "2"" of type '" "int""'"); ++ SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableSentencePieceText__pieces" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + { +@@ -4138,7 +4122,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_text(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText__text(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; + void *argp1 = 0 ; +@@ -4150,7 +4134,7 @@ SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_text(PyObject *SWIGUNUSEDP + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_text" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText__text" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); + { +@@ -4172,7 +4156,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_score(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { 
++SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText__score(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; + void *argp1 = 0 ; +@@ -4184,7 +4168,7 @@ SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText_score(PyObject *SWIGUNUSED + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText_score" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText__score" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); + { +@@ -4236,44 +4220,6 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableSentencePieceText__pieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +- PyObject *resultobj = 0; +- sentencepiece::ImmutableSentencePieceText *arg1 = (sentencepiece::ImmutableSentencePieceText *) 0 ; +- int arg2 ; +- void *argp1 = 0 ; +- int res1 = 0 ; +- int val2 ; +- int ecode2 = 0 ; +- PyObject *swig_obj[2] ; +- sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece result; +- +- if (!SWIG_Python_UnpackTuple(args, "ImmutableSentencePieceText__pieces", 2, 2, swig_obj)) SWIG_fail; +- res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 0 | 0 ); +- if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableSentencePieceText__pieces" "', argument " "1"" of type '" "sentencepiece::ImmutableSentencePieceText const *""'"); +- } +- arg1 = reinterpret_cast< sentencepiece::ImmutableSentencePieceText * >(argp1); +- ecode2 = SWIG_AsVal_int(swig_obj[1], 
&val2); +- if (!SWIG_IsOK(ecode2)) { +- SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableSentencePieceText__pieces" "', argument " "2"" of type '" "int""'"); +- } +- arg2 = static_cast< int >(val2); +- { +- try { +- result = sentencepiece_ImmutableSentencePieceText__pieces((sentencepiece::ImmutableSentencePieceText const *)arg1,arg2); +- ReleaseResultObject(resultobj); +- } +- catch (const sentencepiece::util::Status &status) { +- SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); +- } +- } +- resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece(static_cast< const sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText_ImmutableSentencePiece, SWIG_POINTER_OWN | 0 ); +- return resultobj; +-fail: +- return NULL; +-} +- +- + SWIGINTERN PyObject *ImmutableSentencePieceText_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *obj; + if (!SWIG_Python_UnpackTuple(args, "swigregister", 1, 1, &obj)) return NULL; +@@ -4336,7 +4282,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_nbests_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText__nbests_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; + void *argp1 = 0 ; +@@ -4348,7 +4294,7 @@ SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_nbests_size(PyObject + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText_nbests_size" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const 
*""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText__nbests_size" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); + { +@@ -4367,7 +4313,7 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_nbests(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { ++SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText__nbests(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; + int arg2 ; +@@ -4378,15 +4324,15 @@ SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText_nbests(PyObject *SWIG + PyObject *swig_obj[2] ; + sentencepiece::ImmutableSentencePieceText result; + +- if (!SWIG_Python_UnpackTuple(args, "ImmutableNBestSentencePieceText_nbests", 2, 2, swig_obj)) SWIG_fail; ++ if (!SWIG_Python_UnpackTuple(args, "ImmutableNBestSentencePieceText__nbests", 2, 2, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, 0 | 0 ); + if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText_nbests" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); ++ SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText__nbests" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); + ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { +- SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableNBestSentencePieceText_nbests" "', argument " "2"" of type '" "int""'"); ++ 
SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableNBestSentencePieceText__nbests" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + { +@@ -4438,44 +4384,6 @@ fail: + } + + +-SWIGINTERN PyObject *_wrap_ImmutableNBestSentencePieceText__nbests(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +- PyObject *resultobj = 0; +- sentencepiece::ImmutableNBestSentencePieceText *arg1 = (sentencepiece::ImmutableNBestSentencePieceText *) 0 ; +- int arg2 ; +- void *argp1 = 0 ; +- int res1 = 0 ; +- int val2 ; +- int ecode2 = 0 ; +- PyObject *swig_obj[2] ; +- sentencepiece::ImmutableSentencePieceText result; +- +- if (!SWIG_Python_UnpackTuple(args, "ImmutableNBestSentencePieceText__nbests", 2, 2, swig_obj)) SWIG_fail; +- res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__ImmutableNBestSentencePieceText, 0 | 0 ); +- if (!SWIG_IsOK(res1)) { +- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ImmutableNBestSentencePieceText__nbests" "', argument " "1"" of type '" "sentencepiece::ImmutableNBestSentencePieceText const *""'"); +- } +- arg1 = reinterpret_cast< sentencepiece::ImmutableNBestSentencePieceText * >(argp1); +- ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); +- if (!SWIG_IsOK(ecode2)) { +- SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "ImmutableNBestSentencePieceText__nbests" "', argument " "2"" of type '" "int""'"); +- } +- arg2 = static_cast< int >(val2); +- { +- try { +- result = sentencepiece_ImmutableNBestSentencePieceText__nbests((sentencepiece::ImmutableNBestSentencePieceText const *)arg1,arg2); +- ReleaseResultObject(resultobj); +- } +- catch (const sentencepiece::util::Status &status) { +- SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); +- } +- } +- resultobj = SWIG_NewPointerObj((new sentencepiece::ImmutableSentencePieceText(static_cast< const sentencepiece::ImmutableSentencePieceText& >(result))), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, 
SWIG_POINTER_OWN | 0 ); +- return resultobj; +-fail: +- return NULL; +-} +- +- + SWIGINTERN PyObject *ImmutableNBestSentencePieceText_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *obj; + if (!SWIG_Python_UnpackTuple(args, "swigregister", 1, 1, &obj)) return NULL; +@@ -8475,29 +8383,27 @@ static PyMethodDef SwigMethods[] = { + { "SWIG_PyInstanceMethod_New", SWIG_PyInstanceMethod_New, METH_O, NULL}, + { "new_ImmutableSentencePieceText_ImmutableSentencePiece", _wrap_new_ImmutableSentencePieceText_ImmutableSentencePiece, METH_NOARGS, NULL}, + { "delete_ImmutableSentencePieceText_ImmutableSentencePiece", _wrap_delete_ImmutableSentencePieceText_ImmutableSentencePiece, METH_O, NULL}, +- { "ImmutableSentencePieceText_ImmutableSentencePiece_piece", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_piece, METH_O, NULL}, +- { "ImmutableSentencePieceText_ImmutableSentencePiece_surface", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_surface, METH_O, NULL}, +- { "ImmutableSentencePieceText_ImmutableSentencePiece_id", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_id, METH_O, NULL}, +- { "ImmutableSentencePieceText_ImmutableSentencePiece_begin", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_begin, METH_O, NULL}, +- { "ImmutableSentencePieceText_ImmutableSentencePiece_end", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece_end, METH_O, NULL}, ++ { "ImmutableSentencePieceText_ImmutableSentencePiece__piece", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece__piece, METH_O, NULL}, ++ { "ImmutableSentencePieceText_ImmutableSentencePiece__surface", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece__surface, METH_O, NULL}, ++ { "ImmutableSentencePieceText_ImmutableSentencePiece__id", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece__id, METH_O, NULL}, ++ { "ImmutableSentencePieceText_ImmutableSentencePiece__begin", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece__begin, METH_O, NULL}, ++ { 
"ImmutableSentencePieceText_ImmutableSentencePiece__end", _wrap_ImmutableSentencePieceText_ImmutableSentencePiece__end, METH_O, NULL}, + { "ImmutableSentencePieceText_ImmutableSentencePiece_swigregister", ImmutableSentencePieceText_ImmutableSentencePiece_swigregister, METH_O, NULL}, + { "ImmutableSentencePieceText_ImmutableSentencePiece_swiginit", ImmutableSentencePieceText_ImmutableSentencePiece_swiginit, METH_VARARGS, NULL}, + { "new_ImmutableSentencePieceText", _wrap_new_ImmutableSentencePieceText, METH_NOARGS, NULL}, + { "delete_ImmutableSentencePieceText", _wrap_delete_ImmutableSentencePieceText, METH_O, NULL}, +- { "ImmutableSentencePieceText_pieces_size", _wrap_ImmutableSentencePieceText_pieces_size, METH_O, NULL}, +- { "ImmutableSentencePieceText_pieces", _wrap_ImmutableSentencePieceText_pieces, METH_VARARGS, NULL}, +- { "ImmutableSentencePieceText_text", _wrap_ImmutableSentencePieceText_text, METH_O, NULL}, +- { "ImmutableSentencePieceText_score", _wrap_ImmutableSentencePieceText_score, METH_O, NULL}, +- { "ImmutableSentencePieceText_SerializeAsString", _wrap_ImmutableSentencePieceText_SerializeAsString, METH_O, NULL}, ++ { "ImmutableSentencePieceText__pieces_size", _wrap_ImmutableSentencePieceText__pieces_size, METH_O, NULL}, + { "ImmutableSentencePieceText__pieces", _wrap_ImmutableSentencePieceText__pieces, METH_VARARGS, NULL}, ++ { "ImmutableSentencePieceText__text", _wrap_ImmutableSentencePieceText__text, METH_O, NULL}, ++ { "ImmutableSentencePieceText__score", _wrap_ImmutableSentencePieceText__score, METH_O, NULL}, ++ { "ImmutableSentencePieceText_SerializeAsString", _wrap_ImmutableSentencePieceText_SerializeAsString, METH_O, NULL}, + { "ImmutableSentencePieceText_swigregister", ImmutableSentencePieceText_swigregister, METH_O, NULL}, + { "ImmutableSentencePieceText_swiginit", ImmutableSentencePieceText_swiginit, METH_VARARGS, NULL}, + { "new_ImmutableNBestSentencePieceText", _wrap_new_ImmutableNBestSentencePieceText, METH_NOARGS, NULL}, + { 
"delete_ImmutableNBestSentencePieceText", _wrap_delete_ImmutableNBestSentencePieceText, METH_O, NULL}, +- { "ImmutableNBestSentencePieceText_nbests_size", _wrap_ImmutableNBestSentencePieceText_nbests_size, METH_O, NULL}, +- { "ImmutableNBestSentencePieceText_nbests", _wrap_ImmutableNBestSentencePieceText_nbests, METH_VARARGS, NULL}, +- { "ImmutableNBestSentencePieceText_SerializeAsString", _wrap_ImmutableNBestSentencePieceText_SerializeAsString, METH_O, NULL}, ++ { "ImmutableNBestSentencePieceText__nbests_size", _wrap_ImmutableNBestSentencePieceText__nbests_size, METH_O, NULL}, + { "ImmutableNBestSentencePieceText__nbests", _wrap_ImmutableNBestSentencePieceText__nbests, METH_VARARGS, NULL}, ++ { "ImmutableNBestSentencePieceText_SerializeAsString", _wrap_ImmutableNBestSentencePieceText_SerializeAsString, METH_O, NULL}, + { "ImmutableNBestSentencePieceText_swigregister", ImmutableNBestSentencePieceText_swigregister, METH_O, NULL}, + { "ImmutableNBestSentencePieceText_swiginit", ImmutableNBestSentencePieceText_swiginit, METH_VARARGS, NULL}, + { "new_SentencePieceProcessor", _wrap_new_SentencePieceProcessor, METH_NOARGS, NULL}, +diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py +index 5e4af7f..ed792bd 100755 +--- a/python/test/sentencepiece_test.py ++++ b/python/test/sentencepiece_test.py +@@ -305,6 +305,12 @@ class TestSentencepieceProcessor(unittest.TestCase): + s4 = self.sp_.DecodePiecesAsImmutableProto(['foo', 'bar']) + s5 = self.sp_.DecodeIdsAsImmutableProto([20, 30]) + ++ print(s1) ++ print(s2) ++ print(s3) ++ print(s4) ++ print(s5) ++ + t1 = self.sp_.encode_as_immutable_proto(text) + t2 = self.sp_.sample_encode_as_immutable_proto(text, 10, 0.2) + t3 = self.sp_.nbest_encode_as_immutable_proto(text, 10) +@@ -339,35 +345,35 @@ class TestSentencepieceProcessor(unittest.TestCase): + + v1 = self.sp_.EncodeAsIds(text) + v2 = self.sp_.EncodeAsPieces(text) +- self.assertEqual([x.id() for x in s1], v1) +- self.assertEqual([x.piece() for x 
in s1], v2) +- self.assertEqual(text, s1.text()) ++ self.assertEqual([x.id for x in s1.pieces], v1) ++ self.assertEqual([x.piece for x in s1.pieces], v2) ++ self.assertEqual(text, s1.text) + +- surfaces1 = [s1.text()[x.begin():x.end()] for x in s1] +- surfaces2 = [x.surface() for x in s1] ++ surfaces1 = [s1.text[x.begin:x.end] for x in s1.pieces] ++ surfaces2 = [x.surface for x in s1.pieces] + self.assertEqual(surfaces1, surfaces2) + + ids = [] +- for i in range(s1.pieces_size()): +- ids.append(s1.pieces(i).id()) ++ for i in range(len(s1.pieces)): ++ ids.append(s1.pieces[i].id) + self.assertEqual(ids, v1) + + pieces = [] +- for i in range(s1.pieces_size()): +- pieces.append(s1.pieces(i).piece()) ++ for i in range(len(s1.pieces)): ++ pieces.append(s1.pieces[i].piece) + self.assertEqual(pieces, v2) + + # Japanese offset + s1 = self.jasp_.EncodeAsImmutableProto('吾輩は猫である。Hello world. ABC 123') +- surfaces1 = [s1.text()[x.begin():x.end()] for x in s1] +- surfaces2 = [x.surface() for x in s1] ++ surfaces1 = [s1.text[x.begin:x.end] for x in s1.pieces] ++ surfaces2 = [x.surface for x in s1.pieces] + self.assertEqual(surfaces1, surfaces2) + +- ids = [x.id() for x in s1] ++ ids = [x.id for x in s1.pieces] + s2 = self.jasp_.DecodeIdsAsImmutableProto(ids) + self.assertEqual(s2, s1) + +- pieces = [x.piece() for x in s1] ++ pieces = [x.piece for x in s1.pieces] + s2 = self.jasp_.DecodePiecesAsImmutableProto(pieces) + self.assertEqual(s2, s1) + +@@ -395,29 +401,29 @@ class TestSentencepieceProcessor(unittest.TestCase): + self.assertEqual(sp.encode([text], out_type='serialized_proto'), [sprotos]) + self.assertEqual(sp.encode([text], out_type='immutable_proto'), [iprotos]) + +- self.assertEqual(len(iprotos), len(pieces)) +- self.assertEqual(len(iprotos), len(ids)) +- self.assertEqual(iprotos.text(), text) ++ self.assertEqual(len(iprotos.pieces), len(pieces)) ++ self.assertEqual(len(iprotos.pieces), len(ids)) ++ self.assertEqual(iprotos.text, text) + +- 
self.assertEqual(len(iprotos2), len(pieces2)) +- self.assertEqual(len(iprotos2), len(ids2)) +- self.assertEqual(iprotos2.text(), text2) ++ self.assertEqual(len(iprotos2.pieces), len(pieces2)) ++ self.assertEqual(len(iprotos2.pieces), len(ids2)) ++ self.assertEqual(iprotos2.text, text2) + +- for i in range(len(iprotos)): +- self.assertEqual(ids[i], iprotos.pieces(i).id()) +- self.assertEqual(pieces[i], iprotos.pieces(i).piece()) ++ for i in range(len(iprotos.pieces)): ++ self.assertEqual(ids[i], iprotos.pieces[i].id) ++ self.assertEqual(pieces[i], iprotos.pieces[i].piece) + +- for i, piece in enumerate(iprotos): +- self.assertEqual(ids[i], piece.id()) +- self.assertEqual(pieces[i], piece.piece()) ++ for i, piece in enumerate(iprotos.pieces): ++ self.assertEqual(ids[i], piece.id) ++ self.assertEqual(pieces[i], piece.piece) + +- for i in range(len(iprotos2)): +- self.assertEqual(ids2[i], iprotos2.pieces(i).id()) +- self.assertEqual(pieces2[i], iprotos2.pieces(i).piece()) ++ for i in range(len(iprotos2.pieces)): ++ self.assertEqual(ids2[i], iprotos2.pieces[i].id) ++ self.assertEqual(pieces2[i], iprotos2.pieces[i].piece) + +- for i, piece in enumerate(iprotos2): +- self.assertEqual(ids2[i], piece.id()) +- self.assertEqual(pieces2[i], piece.piece()) ++ for i, piece in enumerate(iprotos2.pieces): ++ self.assertEqual(ids2[i], piece.id) ++ self.assertEqual(pieces2[i], piece.piece) + + detok_ids = self.sp_.DecodeIds(ids) + detok_pieces = self.sp_.DecodePieces(pieces) diff --git a/patches/0018-automatically-detect-the-number-of-CPUs-in-batch-pro.patch b/patches/0018-automatically-detect-the-number-of-CPUs-in-batch-pro.patch new file mode 100644 index 0000000..181562f --- /dev/null +++ b/patches/0018-automatically-detect-the-number-of-CPUs-in-batch-pro.patch @@ -0,0 +1,252 @@ +From: Taku Kudo +Date: Fri, 5 Aug 2022 14:47:02 +0900 +Subject: automatically detect the number of CPUs in batch processing. 
+ +Signed-off-by: Kentaro Hayashi +--- + python/src/sentencepiece/__init__.py | 27 +++++++++++++-------- + python/src/sentencepiece/sentencepiece.i | 32 ++++++++++++++++--------- + python/src/sentencepiece/sentencepiece_wrap.cxx | 5 +++- + python/test/sentencepiece_test.py | 32 +++++++++++++++++++++++++ + 4 files changed, 74 insertions(+), 22 deletions(-) + +diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py +index 12dc631..ce9d60d 100644 +--- a/python/src/sentencepiece/__init__.py ++++ b/python/src/sentencepiece/__init__.py +@@ -97,6 +97,13 @@ class ImmutableSentencePieceText_ImmutableSentencePiece(object): + 'begin: {}\n' + 'end: {}\n').format(self.piece, self.id, self.surface, + self.begin, self.end) ++ ++ def __eq__(self, other): ++ return self.piece == other.piece and self.id == other.id and self.surface == other.surface and self.begin == other.begin and self.end == other.end ++ ++ def __hash__(self): ++ return hash(str(self)) ++ + __repr__ = __str__ + + +@@ -395,7 +402,7 @@ class SentencePieceProcessor(object): + enable_sampling=False, + nbest_size=-1, + alpha=0.1, +- num_threads=1): ++ num_threads=-1): + """Initialzie sentencepieceProcessor. + + Args: +@@ -407,15 +414,15 @@ class SentencePieceProcessor(object): + reversing (if enabled). + reverse: Reverses the tokenized sequence (Default = false) + emit_unk_piece: Emits the unk literal string (Default = false) +- nbest_size: sampling parameters for unigram. Invalid for BPE-Dropout. ++ nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout. + nbest_size = {0,1}: No sampling is performed. + nbest_size > 1: samples from the nbest_size results. + nbest_size < 0: assuming that nbest_size is infinite and samples + from the all hypothesis (lattice) using + forward-filtering-and-backward-sampling algorithm. + alpha: Soothing parameter for unigram sampling, and dropout probability of +- merge operations for BPE-dropout. 
+- num_threads: number of threads in batch processing. ++ merge operations for BPE-dropout. ++ num_threads: number of threads in batch processing (Default = -1, auto-detected) + """ + + _sentencepiece_processor_init_native(self) +@@ -450,18 +457,18 @@ class SentencePieceProcessor(object): + out_type: output type. int or str. + add_bos: Add to the result (Default = false) + add_eos: Add to the result (Default = false) / is added after +- reversing (if enabled). ++ reversing (if enabled). + reverse: Reverses the tokenized sequence (Default = false) + emit_unk_piece: Emits the unk literal string (Default = false) +- nbest_size: sampling parameters for unigram. Invalid for BPE-Dropout. ++ nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout. + nbest_size = {0,1}: No sampling is performed. + nbest_size > 1: samples from the nbest_size results. + nbest_size < 0: assuming that nbest_size is infinite and samples +- from the all hypothesis (lattice) using +- forward-filtering-and-backward-sampling algorithm. ++ from the all hypothesis (lattice) using ++ forward-filtering-and-backward-sampling algorithm. + alpha: Soothing parameter for unigram sampling, and merge probability for + BPE-dropout (probablity 'p' in BPE-dropout paper). +- num_threads: the number of threads used in the batch processin (Default = 1). ++ num_threads: the number of threads used in the batch processing (Default = -1). + """ + + if out_type is None: +@@ -722,7 +729,7 @@ class SentencePieceProcessor(object): + + Args: + out_type: output type. str or 'serialized_proto' or 'immutable_proto' (Default = str) +- num_threads: the number of threads used in the batch processin (Default = 1). ++ num_threads: the number of threads used in the batch processing (Default = -1). 
+ """ + + if num_threads is None: +diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i +index 8309fc2..e22f763 100644 +--- a/python/src/sentencepiece/sentencepiece.i ++++ b/python/src/sentencepiece/sentencepiece.i +@@ -233,9 +233,12 @@ class ThreadPool { + + template + inline void InitNumThreads(const std::vector &ins, int *num_threads) { ++ if (*num_threads < 0) { ++ *num_threads = std::thread::hardware_concurrency(); ++ } + *num_threads = std::max(1, + std::min({*num_threads, +- static_cast(ins.size()), 256})); ++ static_cast(ins.size()), 256})); + } + + #define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ +@@ -675,7 +678,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + enable_sampling=False, + nbest_size=-1, + alpha=0.1, +- num_threads=1): ++ num_threads=-1): + """Initialzie sentencepieceProcessor. + + Args: +@@ -687,15 +690,15 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + reversing (if enabled). + reverse: Reverses the tokenized sequence (Default = false) + emit_unk_piece: Emits the unk literal string (Default = false) +- nbest_size: sampling parameters for unigram. Invalid for BPE-Dropout. ++ nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout. + nbest_size = {0,1}: No sampling is performed. + nbest_size > 1: samples from the nbest_size results. + nbest_size < 0: assuming that nbest_size is infinite and samples + from the all hypothesis (lattice) using + forward-filtering-and-backward-sampling algorithm. + alpha: Soothing parameter for unigram sampling, and dropout probability of +- merge operations for BPE-dropout. +- num_threads: number of threads in batch processing. ++ merge operations for BPE-dropout. 
++ num_threads: number of threads in batch processing (Default = -1, auto-detected) + """ + + _sentencepiece_processor_init_native(self) +@@ -730,18 +733,18 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + out_type: output type. int or str. + add_bos: Add to the result (Default = false) + add_eos: Add to the result (Default = false) / is added after +- reversing (if enabled). ++ reversing (if enabled). + reverse: Reverses the tokenized sequence (Default = false) + emit_unk_piece: Emits the unk literal string (Default = false) +- nbest_size: sampling parameters for unigram. Invalid for BPE-Dropout. ++ nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout. + nbest_size = {0,1}: No sampling is performed. + nbest_size > 1: samples from the nbest_size results. + nbest_size < 0: assuming that nbest_size is infinite and samples +- from the all hypothesis (lattice) using +- forward-filtering-and-backward-sampling algorithm. ++ from the all hypothesis (lattice) using ++ forward-filtering-and-backward-sampling algorithm. + alpha: Soothing parameter for unigram sampling, and merge probability for + BPE-dropout (probablity 'p' in BPE-dropout paper). +- num_threads: the number of threads used in the batch processin (Default = 1). ++ num_threads: the number of threads used in the batch processing (Default = -1). + """ + + if out_type is None: +@@ -1002,7 +1005,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + + Args: + out_type: output type. str or 'serialized_proto' or 'immutable_proto' (Default = str) +- num_threads: the number of threads used in the batch processin (Default = 1). ++ num_threads: the number of threads used in the batch processing (Default = -1). 
+ """ + + if num_threads is None: +@@ -1260,6 +1263,13 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + 'begin: {}\n' + 'end: {}\n').format(self.piece, self.id, self.surface, + self.begin, self.end) ++ ++ def __eq__(self, other): ++ return self.piece == other.piece and self.id == other.id and self.surface == other.surface and self.begin == other.begin and self.end == other.end ++ ++ def __hash__(self): ++ return hash(str(self)) ++ + __repr__ = __str__ + %} + } +diff --git a/python/src/sentencepiece/sentencepiece_wrap.cxx b/python/src/sentencepiece/sentencepiece_wrap.cxx +index 0a8df5f..1eac211 100644 +--- a/python/src/sentencepiece/sentencepiece_wrap.cxx ++++ b/python/src/sentencepiece/sentencepiece_wrap.cxx +@@ -3042,9 +3042,12 @@ class ThreadPool { + + template + inline void InitNumThreads(const std::vector &ins, int *num_threads) { ++ if (*num_threads < 0) { ++ *num_threads = std::thread::hardware_concurrency(); ++ } + *num_threads = std::max(1, + std::min({*num_threads, +- static_cast(ins.size()), 256})); ++ static_cast(ins.size()), 256})); + } + + #define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ +diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py +index ed792bd..6cbe077 100755 +--- a/python/test/sentencepiece_test.py ++++ b/python/test/sentencepiece_test.py +@@ -332,6 +332,29 @@ class TestSentencepieceProcessor(unittest.TestCase): + self.assertEqual(s4, y4) + self.assertEqual(s5, y5) + ++ hset_piece = defaultdict(int) ++ ++ # eq test ++ for i in range(len(s1.pieces)): ++ self.assertEqual(s1.pieces[i], t1.pieces[i]) ++ hset_piece[s1.pieces[i]] += 1 ++ hset_piece[t1.pieces[i]] += 1 ++ ++ self.assertEqual(len(hset_piece), len(s1.pieces)) ++ ++ # has test ++ hset = defaultdict(int) ++ hset[s1] += 1 ++ hset[t1] += 1 ++ hset[s3] += 1 ++ hset[t3] += 1 ++ ++ self.assertEqual(len(hset), 2) ++ self.assertEqual(hset[s1], 2) ++ self.assertEqual(hset[s3], 2) ++ self.assertEqual(hset[t1], 2) ++ 
self.assertEqual(hset[t3], 2) ++ + x1 = self.sp_.encode_as_serialized_proto(text) + x2 = self.sp_.sample_encode_as_serialized_proto(text, 10, 0.2) + x3 = self.sp_.nbest_encode_as_serialized_proto(text, 10) +@@ -363,6 +386,15 @@ class TestSentencepieceProcessor(unittest.TestCase): + pieces.append(s1.pieces[i].piece) + self.assertEqual(pieces, v2) + ++ for v in s3.nbests: ++ self.assertEqual(text, v.text) ++ self.assertEqual(self.sp_.Decode([x.id for x in v.pieces]), text) ++ ++ for i in range(len(s3.nbests)): ++ self.assertEqual(text, s3.nbests[i].text) ++ self.assertEqual( ++ self.sp_.Decode([x.id for x in s3.nbests[i].pieces]), text) ++ + # Japanese offset + s1 = self.jasp_.EncodeAsImmutableProto('吾輩は猫である。Hello world. ABC 123') + surfaces1 = [s1.text[x.begin:x.end] for x in s1.pieces] diff --git a/patches/0019-support-slice-in-pieces-nbests-objects.patch b/patches/0019-support-slice-in-pieces-nbests-objects.patch new file mode 100644 index 0000000..60cc305 --- /dev/null +++ b/patches/0019-support-slice-in-pieces-nbests-objects.patch @@ -0,0 +1,78 @@ +From: Taku Kudo +Date: Fri, 5 Aug 2022 16:34:44 +0900 +Subject: support slice in pieces/nbests objects + +Signed-off-by: Kentaro Hayashi +--- + python/src/sentencepiece/__init__.py | 8 ++++++++ + python/src/sentencepiece/sentencepiece.i | 8 ++++++++ + python/test/sentencepiece_test.py | 4 ++++ + 3 files changed, 20 insertions(+) + +diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py +index ce9d60d..cf06830 100644 +--- a/python/src/sentencepiece/__init__.py ++++ b/python/src/sentencepiece/__init__.py +@@ -145,6 +145,10 @@ class ImmutableSentencePieceText(object): + return self.len + + def __getitem__(self, index): ++ if isinstance(index, slice): ++ return [self.proto._pieces(i) for i in range(self.len)][index.start:index.stop:index.step] ++ if index < 0: ++ index = index + self.len + if index < 0 or index >= self.len: + raise IndexError('piece index is out of range') + return 
self.proto._pieces(index) +@@ -202,6 +206,10 @@ class ImmutableNBestSentencePieceText(object): + return self.len + + def __getitem__(self, index): ++ if isinstance(index, slice): ++ return [self.proto._nbests(i) for i in range(self.len)][index.start:index.stop:index.step] ++ if index < 0: ++ index = index + self.len + if index < 0 or index >= self.len: + raise IndexError('nbests index is out of range') + return self.proto._nbests(index) +diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i +index e22f763..2ac68a8 100644 +--- a/python/src/sentencepiece/sentencepiece.i ++++ b/python/src/sentencepiece/sentencepiece.i +@@ -1293,6 +1293,10 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + return self.len + + def __getitem__(self, index): ++ if isinstance(index, slice): ++ return [self.proto._pieces(i) for i in range(self.len)][index.start:index.stop:index.step] ++ if index < 0: ++ index = index + self.len + if index < 0 or index >= self.len: + raise IndexError('piece index is out of range') + return self.proto._pieces(index) +@@ -1336,6 +1340,10 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + return self.len + + def __getitem__(self, index): ++ if isinstance(index, slice): ++ return [self.proto._nbests(i) for i in range(self.len)][index.start:index.stop:index.step] ++ if index < 0: ++ index = index + self.len + if index < 0 or index >= self.len: + raise IndexError('nbests index is out of range') + return self.proto._nbests(index) +diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py +index 6cbe077..92327ac 100755 +--- a/python/test/sentencepiece_test.py ++++ b/python/test/sentencepiece_test.py +@@ -395,6 +395,10 @@ class TestSentencepieceProcessor(unittest.TestCase): + self.assertEqual( + self.sp_.Decode([x.id for x in s3.nbests[i].pieces]), text) + ++ # slice ++ self.assertEqual(s1.pieces[::-1], list(reversed(s1.pieces))) ++ 
self.assertEqual(s3.nbests[::-1], list(reversed(s3.nbests))) ++ + # Japanese offset + s1 = self.jasp_.EncodeAsImmutableProto('吾輩は猫である。Hello world. ABC 123') + surfaces1 = [s1.text[x.begin:x.end] for x in s1.pieces] diff --git a/patches/0020-Updated-the-document.patch b/patches/0020-Updated-the-document.patch new file mode 100644 index 0000000..c7a12f8 --- /dev/null +++ b/patches/0020-Updated-the-document.patch @@ -0,0 +1,546 @@ +From: Taku Kudo +Date: Fri, 5 Aug 2022 19:05:52 +0900 +Subject: Updated the document + +Signed-off-by: Kentaro Hayashi +--- + README.md | 1 - + doc/api.md | 22 ++-- + doc/options.md | 102 ++++++++++--------- + python/README.md | 168 +++++++++++++------------------ + python/src/sentencepiece/__init__.py | 22 +++- + python/src/sentencepiece/sentencepiece.i | 22 +++- + python/test/sentencepiece_test.py | 20 +++- + 7 files changed, 199 insertions(+), 158 deletions(-) + +diff --git a/README.md b/README.md +index dc71b64..1986047 100644 +--- a/README.md ++++ b/README.md +@@ -276,6 +276,5 @@ Then segment train/test corpus with ```--vocabulary``` option + * [Use custom text normalization rules](doc/normalization.md) + * [Use custom symbols](doc/special_symbols.md) + * [Python Module](python/README.md) +-* [TensorFlow Module](tensorflow/README.md) + * [Segmentation and training algorithms in detail] + +diff --git a/doc/api.md b/doc/api.md +index 797074c..ebde880 100644 +--- a/doc/api.md ++++ b/doc/api.md +@@ -14,9 +14,9 @@ if (!status.ok()) { + // error + } + +-// You can also load a model from std::ifstream. +-// std::ifstream in("//path/to/model.model"); +-// auto status = processor.Load(in); ++// You can also load a serialized model from std::string. ++// const std::stirng str = // Load blob contents from a file. ++// auto status = processor.LoadFromSerializedProto(str); + ``` + + ## Tokenize text (preprocessing) +@@ -75,16 +75,20 @@ Calls `SentencePieceTrainer::Train` function to train sentencepiece model. 
You c + sentencepiece::SentencePieceTrainer::Train("--input=test/botchan.txt --model_prefix=m --vocab_size=1000"); + ``` + +-## SentencePieceText proto +-You will want to use `SentencePieceText` class to obtain the pieces and ids at the same time. This proto also encodes a utf8-byte offset of each piece over user input or detokenized text. ++## ImmutableSentencePieceText ++You will want to use `ImmutableSentencePieceText` class to obtain the pieces and ids at the same time. ++This proto also encodes a utf8-byte offset of each piece over user input or detokenized text. + + ```C++ +-#include ++#include + +-sentencepiece::SentencePieceText spt; ++sentencepiece::ImmutableSentencePieceText spt; + + // Encode +-processor.Encode("This is a test.", &spt); ++processor.Encode("This is a test.", spt.mutable_proto()); ++ ++// or ++// spt = processor.EncodeAsImmutableProto("This is a test."); + + std::cout << spt.text() << std::endl; // This is the same as the input. + for (const auto &piece : spt.pieces()) { +@@ -96,7 +100,7 @@ for (const auto &piece : spt.pieces()) { + } + + // Decode +-processor.Decode({10, 20, 30}, &spt); ++processor.Decode({10, 20, 30}, spt.mutable_proto()); + std::cout << spt.text() << std::endl; // This is the same as the decoded string. + for (const auto &piece : spt.pieces()) { + // the same as above. +diff --git a/doc/options.md b/doc/options.md +index 26cf681..6cdc0f9 100644 +--- a/doc/options.md ++++ b/doc/options.md +@@ -3,50 +3,60 @@ + The training options for the `spm_train` can be listed using `spm_train --help`. Since the standard `pip install` of sentencepiece does not necessarily install `spm_train`, the options are also listed here. 
+ + ``` +---help (show help) type: bool default: false +---version (show version) type: bool default: false +---minloglevel (Messages logged at a lower level than this don't actually get logged anywhere) type: int default: 0 +---input (comma separated list of input sentences) type: std::string default: "" +---input_format (Input format. Supported format is `text` or `tsv`.) type: std::string default: "" +---model_prefix (output model prefix) type: std::string default: "" +---model_type (model algorithm: unigram, bpe, word or char) type: std::string default: "unigram" +---vocab_size (vocabulary size) type: int32 default: 8000 +---accept_language (comma-separated list of languages this model can accept) type: std::string default: "" +---self_test_sample_size (the size of self test samples) type: int32 default: 0 +---character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995 +---input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 0 +---shuffle_input_sentence (Randomly sample input sentences in advance. 
Valid when --input_sentence_size > 0) type: bool default: true +---seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000 +---shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75 +---num_threads (number of threads for training) type: int32 default: 16 +---num_sub_iterations (number of EM sub-iterations) type: int32 default: 2 +---max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16 +---max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192 +---split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true +---split_by_number (split tokens by numbers (0-9)) type: bool default: true +---split_by_whitespace (use a white space to split sentence pieces) type: bool default: true +---split_digits (split all digits (0-9) into separate pieces) type: bool default: false +---treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) type: bool default: false +---control_symbols (comma separated list of control symbols) type: std::string default: "" +---user_defined_symbols (comma separated list of user defined symbols) type: std::string default: "" +---required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage) type: std::string default: "" +---byte_fallback (decompose unknown pieces into UTF-8 byte pieces) type: bool default: false +---vocabulary_output_piece_score (Define score in vocab file) type: bool default: true +---normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: std::string default: "nmt_nfkc" +---normalization_rule_tsv (Normalization rule TSV file. ) type: std::string default: "" +---denormalization_rule_tsv (Denormalization rule TSV file.) 
type: std::string default: "" +---add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true +---remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true +---hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true +---use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.) type: bool default: false +---unk_id (Override UNK () id.) type: int32 default: 0 +---bos_id (Override BOS () id. Set -1 to disable BOS.) type: int32 default: 1 +---eos_id (Override EOS () id. Set -1 to disable EOS.) type: int32 default: 2 +---pad_id (Override PAD () id. Set -1 to disable PAD.) type: int32 default: -1 +---unk_piece (Override UNK () piece.) type: std::string default: "" +---bos_piece (Override BOS () piece.) type: std::string default: "" +---eos_piece (Override EOS () piece.) type: std::string default: "" +---pad_piece (Override PAD () piece.) type: std::string default: "" +---unk_surface (Dummy surface string for . In decoding is decoded to `unk_surface`.) type: std::string default: " ⁇ " +---train_extremely_large_corpus (Increase bit depth for unigram tokenization.) type: bool default: false ++Usage: ../build/src/spm_train [options] files ++ ++ --input (comma separated list of input sentences) type: std::string default: "" ++ --input_format (Input format. Supported format is `text` or `tsv`.) 
type: std::string default: "" ++ --model_prefix (output model prefix) type: std::string default: "" ++ --model_type (model algorithm: unigram, bpe, word or char) type: std::string default: "unigram" ++ --vocab_size (vocabulary size) type: int32 default: 8000 ++ --accept_language (comma-separated list of languages this model can accept) type: std::string default: "" ++ --self_test_sample_size (the size of self test samples) type: int32 default: 0 ++ --character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995 ++ --input_sentence_size (maximum size of sentences the trainer loads) type: std::uint64_t default: 0 ++ --shuffle_input_sentence (Randomly sample input sentences in advance. Valid when --input_sentence_size > 0) type: bool default: true ++ --seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000 ++ --shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75 ++ --num_threads (number of threads for training) type: int32 default: 16 ++ --num_sub_iterations (number of EM sub-iterations) type: int32 default: 2 ++ --max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16 ++ --max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192 ++ --split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true ++ --split_by_number (split tokens by numbers (0-9)) type: bool default: true ++ --split_by_whitespace (use a white space to split sentence pieces) type: bool default: true ++ --split_digits (split all digits (0-9) into separate pieces) type: bool default: false ++ --treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) 
type: bool default: false ++ --allow_whitespace_only_pieces (allow pieces that only contain (consecutive) whitespace tokens) type: bool default: false ++ --control_symbols (comma separated list of control symbols) type: std::string default: "" ++ --control_symbols_file (load control_symbols from file.) type: std::string default: "" ++ --user_defined_symbols (comma separated list of user defined symbols) type: std::string default: "" ++ --user_defined_symbols_file (load user_defined_symbols from file.) type: std::string default: "" ++ --required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage) type: std::string default: "" ++ --required_chars_file (load required_chars from file.) type: std::string default: "" ++ --byte_fallback (decompose unknown pieces into UTF-8 byte pieces) type: bool default: false ++ --vocabulary_output_piece_score (Define score in vocab file) type: bool default: true ++ --normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: std::string default: "nmt_nfkc" ++ --normalization_rule_tsv (Normalization rule TSV file. ) type: std::string default: "" ++ --denormalization_rule_tsv (Denormalization rule TSV file.) type: std::string default: "" ++ --add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true ++ --remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true ++ --hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true ++ --use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.) type: bool default: false ++ --unk_id (Override UNK () id.) type: int32 default: 0 ++ --bos_id (Override BOS () id. Set -1 to disable BOS.) type: int32 default: 1 ++ --eos_id (Override EOS () id. Set -1 to disable EOS.) type: int32 default: 2 ++ --pad_id (Override PAD () id. Set -1 to disable PAD.) 
type: int32 default: -1 ++ --unk_piece (Override UNK () piece.) type: std::string default: "" ++ --bos_piece (Override BOS () piece.) type: std::string default: "" ++ --eos_piece (Override EOS () piece.) type: std::string default: "" ++ --pad_piece (Override PAD () piece.) type: std::string default: "" ++ --unk_surface (Dummy surface string for . In decoding is decoded to `unk_surface`.) type: std::string default: " ⁇ " ++ --train_extremely_large_corpus (Increase bit depth for unigram tokenization.) type: bool default: false ++ --random_seed (Seed value for random generator.) type: uint32 default: 4294967295 ++ --enable_differential_privacy (Whether to add DP while training. Currently supported only by UNIGRAM model.) type: bool default: false ++ --differential_privacy_noise_level (Amount of noise to add for DP) type: float default: 0 ++ --differential_privacy_clipping_threshold (Threshold for clipping the counts for DP) type: std::uint64_t default: 0 ++ --help (show help) type: bool default: false ++ --version (show version) type: bool default: false ++ --minloglevel (Messages logged at a lower level than this don't actually get logged anywhere) type: int default: 0 + ``` +diff --git a/python/README.md b/python/README.md +index b683082..bc5a59a 100644 +--- a/python/README.md ++++ b/python/README.md +@@ -9,10 +9,17 @@ For Linux (x64/i686), macOS, and Windows(win32/x64) environment, you can simply + % pip install sentencepiece + ``` + +-To build and install the Python wrapper from source, please install [SentencePiece C++](https://github.com/google/sentencepiece#c-from-source) and try the following commands: ++To build and install the Python wrapper from source, try the following commands to build and install wheel package. + ``` +-% python setup.py build +-% sudo python setup.py install ++% git clone https://github.com/google/sentencepiece.git ++% cd sentencepiece ++% mkdir build ++% cd build ++% cmake .. 
-DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=./root ++% make install ++% cd ../python ++% python setup.py bdist_wheel ++% pip install dist/sentencepiece*.whl + ``` + + If you don’t have write permission to the global site-packages directory or don’t want to install into it, please try: +@@ -22,21 +29,50 @@ If you don’t have write permission to the global site-packages directory or do + + ## Usage + +-See [this google colab page](https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb) to run sentencepiece interactively. (Note: this sample is written in old interface.) ++See [this google colab page](https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb) to run sentencepiece interactively. + + ### Segmentation + ``` + % python + >>> import sentencepiece as spm + >>> sp = spm.SentencePieceProcessor(model_file='test/test_model.model') ++ + >>> sp.encode('This is a test') + [284, 47, 11, 4, 15, 400] ++ + >>> sp.encode(['This is a test', 'Hello world'], out_type=int) + [[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]] ++ ++>>> sp.encode_as_ids(['This is a test', 'Hello world']) ++[[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]] ++ + >>> sp.encode('This is a test', out_type=str) + ['▁This', '▁is', '▁a', '▁', 't', 'est'] ++ + >>> sp.encode(['This is a test', 'Hello world'], out_type=str) + [['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']] ++ ++>>> sp.encode_as_pieces(['This is a test', 'Hello world']) ++[['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']] ++ ++>>> proto = sp.encode('This is a test', out_type='immutable_proto') ++>>> for n in proto.pieces: ++... print('piece="{}" surface="{}" id={} begin={} end={}'.format(n.piece, n.surface, n.id, n.begin, n.end)) ++... 
++piece="▁This" surface="This" id=284 begin=0 end=4 ++piece="▁is" surface=" is" id=47 begin=4 end=7 ++piece="▁a" surface=" a" id=11 begin=7 end=9 ++piece="▁" surface=" " id=4 begin=9 end=10 ++piece="t" surface="t" id=15 begin=10 end=11 ++piece="est" surface="est" id=400 begin=11 end=14 ++ ++>>> [[x.id for x in proto.pieces], [x.piece for x in proto.pieces], [x.begin for x in proto.pieces], [x.end for x in proto.pieces]] ++[[284, 47, 11, 4, 15, 400], ['▁This', '▁is', '▁a', '▁', 't', 'est'], [0, 4, 7, 9, 10, 11], [4, 7, 9, 10, 11, 14]] ++ ++>>> proto2 = sp.encode_as_immutable_proto('This is a test') ++>>> proto2 == proto ++True ++ + >>> for _ in range(10): + ... sp.encode('This is a test', out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1) + ... +@@ -50,26 +86,55 @@ See [this google colab page](https://github.com/google/sentencepiece/blob/master + ['▁', 'T', 'h', 'is', '▁', 'is', '▁', 'a', '▁', 'te', 'st'] + ['▁', 'This', '▁', 'i', 's', '▁a', '▁', 't', 'e', 'st'] + ['▁This', '▁', 'is', '▁a', '▁', 't', 'est'] ++ ++>> sp.nbest_encode('This is a test', nbest_size=5, out_type=str) ++[['▁This', '▁is', '▁a', '▁', 't', 'est'], ++['▁This', '▁is', '▁a', '▁', 'te', 'st'], ++['▁This', '▁is', '▁a', '▁', 'te', 's', 't'], ++['▁This', '▁is', '▁a', '▁', 't', 'e', 'st'], ++['▁This', '▁is', '▁a', '▁', 't', 'es', 't']] ++ ++>>> sp.sample_encode_and_score('This is a test', num_samples=5, alpha=0.1, out_type=str, wor=True) ++[(['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 's', 't'], -3.043105125427246), ++(['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 'st'], -2.8475849628448486), ++(['▁', 'This', '▁is', '▁', 'a', '▁', 'te', 'st'], -3.043248176574707), ++(['▁', 'This', '▁is', '▁a', '▁', 't', 'e', 'st'], -2.87727689743042), ++(['▁', 'This', '▁', 'i', 's', '▁', 'a', '▁', 't', 'est'], -3.6284031867980957)] ++ + >>> sp.decode([284, 47, 11, 4, 15, 400]) + 'This is a test' ++ + >>> sp.decode([[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]]) + ['This is a test', 'Hello world'] ++ ++>>> proto = 
sp.decode([284, 47, 11, 4, 15, 400], out_type='immutable_proto') ++>>> proto.text ++'This is a test' ++ + >>> sp.decode(['▁', 'This', '▁', 'is', '▁a', '▁', 't', 'e', 'st']) + 'This is a test' ++ + >>> sp.decode([['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']]) + ['This is a test', 'Hello world'] ++ + >>> sp.get_piece_size() + 1000 ++ + >>> sp.id_to_piece(2) + '' ++ + >>> sp.id_to_piece([2, 3, 4]) + ['', '\r', '▁'] ++ + >>> sp.piece_to_id('') + 1 ++ + >>> sp.piece_to_id(['', '\r', '▁']) + [2, 3, 4] ++ + >>> len(sp) + 1000 ++ + >>> sp[''] + 2 + ``` +@@ -116,98 +181,3 @@ with urllib.request.urlopen( + sp = spm.SentencePieceProcessor(model_proto=model.getvalue()) + print(sp.encode('this is test')) + ``` +- +- +-### Segmentation (old interface) +-``` +-% python +->>> import sentencepiece as spm +->>> sp = spm.SentencePieceProcessor() +->>> sp.Load("test/test_model.model") +-True +->>> sp.EncodeAsPieces("This is a test") +-['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'] +->>> sp.EncodeAsIds("This is a test") +-[284, 47, 11, 4, 15, 400] +->>> sp.DecodePieces(['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est']) +-'This is a test' +->>> sp.NBestEncodeAsPieces("This is a test", 5) +-[['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'es', 't']] +->>> for x in range(10): +-... sp.SampleEncodeAsPieces("This is a test", -1, 0.1) +-... 
+-['\xe2\x96\x81', 'T', 'h', 'i', 's', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't'] +-['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'est'] +-['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st'] +-['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'] +-['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't'] +-['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'] +-['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'] +-['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'] +-['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st'] +-['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 'te', 's', 't'] +->>> sp.DecodeIds([284, 47, 11, 4, 15, 400]) +-'This is a test' +->>> sp.GetPieceSize() +-1000 +->>> sp.IdToPiece(2) +-'' +->>> sp.PieceToId('') +-2 +->>> len(sp) +-1000 +->>> sp[''] +-2 +-``` +- +-### Model Training (old interface) +-Training is performed by passing parameters of [spm_train](https://github.com/google/sentencepiece#train-sentencepiece-model) to SentencePieceTrainer.Train() function. +- +-``` +->>> import sentencepiece as spm +->>> spm.SentencePieceTrainer.Train('--input=test/botchan.txt --model_prefix=m --vocab_size=1000') +-unigram_model_trainer.cc(494) LOG(INFO) Starts training with : +-input: "test/botchan.txt" +-model_prefix: "m" +-model_type: UNIGRAM +-..snip.. 
+-unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1239 obj=10.4055 num_tokens=36256 num_tokens/piece=29.2623 +-unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1239 obj=10.3187 num_tokens=36256 num_tokens/piece=29.2623 +-unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1100 obj=10.5285 num_tokens=37633 num_tokens/piece=34.2118 +-unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4973 num_tokens=37630 num_tokens/piece=34.2091 +-trainer_interface.cc(284) LOG(INFO) Saving model: m.model +-trainer_interface.cc(293) LOG(INFO) Saving vocabs: m.vocab +->>> +-``` +- +-## Python2/3 String/Unicode compatibility +-Sentencepiece python wrapper accepts both Unicode string and legacy byte string. +-The output string type is determined by the input string type. +-The output type of IdToPiece/DecodeIds methods is *str*, but note that it is a legacy byte string in Python2 and Unicode string in Python3 respectively. +- +-* Python2: +-``` +->>> sp.EncodeAsPieces('吾輩は猫である') +-['\xe2\x96\x81', '\xe5\x90\xbe', '\xe8\xbc\xa9', '\xe3\x81\xaf', '\xe7\x8c\xab', '\xe3\x81\xa7\xe3\x81\x82\xe3\x82\x8b'] +->>> sp.EncodeAsPieces(u'吾輩は猫である') +-[u'\u2581', u'\u543e', u'\u8f29', u'\u306f', u'\u732b', u'\u3067\u3042\u308b'] +->>> sp.EncodeAsPieces(u'吾輩は猫である'.encode('utf-8')) +-['\xe2\x96\x81', '\xe5\x90\xbe', '\xe8\xbc\xa9', '\xe3\x81\xaf', '\xe7\x8c\xab', '\xe3\x81\xa7\xe3\x81\x82\xe3\x82\x8b'] +->>> sp.IdToPiece(10) +-'\xe3\x81\xab' +->>> type(sp.IdToPiece(10)) +- +-``` +- +-* Python3: +-``` +->>> sp.EncodeAsPieces('吾輩は猫である') +-['▁', '吾', '輩', 'は', '猫', 'である'] +->>> sp.EncodeAsPieces('吾輩は猫である'.encode('utf-8')) +-[b'\xe2\x96\x81', b'\xe5\x90\xbe', b'\xe8\xbc\xa9', b'\xe3\x81\xaf', b'\xe7\x8c\xab', b'\xe3\x81\xa7\xe3\x81\x82\xe3\x82\x8b'] +->>> +->>> sp.IdToPiece(10) +-'に' +->>> type(sp.IdToPiece(10)) +- +-``` +diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py +index cf06830..911a2cb 100644 +--- 
a/python/src/sentencepiece/__init__.py ++++ b/python/src/sentencepiece/__init__.py +@@ -635,7 +635,7 @@ class SentencePieceProcessor(object): + return _encode(input) + + +- def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): ++ def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, + out_type=str, **kwargs) + +@@ -732,6 +732,26 @@ class SentencePieceProcessor(object): + return _encode(input) + + ++ def SampleEncodeAndScoreAsPieces(self, input, num_samples=None, alpha=None, **kwargs): ++ return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, ++ out_type=str, **kwargs) ++ ++ ++ def SampleEncodeAndScoreAsIds(self, input, num_samples=None, alpha=None, **kwargs): ++ return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, ++ out_type=int, **kwargs) ++ ++ ++ def SampleEncodeAndScoreAsSerializedProto(self, input, num_samples=None, alpha=None, **kwargs): ++ return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, ++ out_type='serialized_proto', **kwargs) ++ ++ ++ def SampleEncodeAndScoreAsImmutableProto(self, input, num_samples=None, alpha=None, **kwargs): ++ return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, ++ out_type='immutable_proto', **kwargs) ++ ++ + def Decode(self, input, out_type=str, num_threads=None): + """Decode processed id or token sequences. 
+ +diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i +index 2ac68a8..fc773e2 100644 +--- a/python/src/sentencepiece/sentencepiece.i ++++ b/python/src/sentencepiece/sentencepiece.i +@@ -903,7 +903,7 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + return _encode(input) + + +- def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): ++ def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, + out_type=str, **kwargs) + +@@ -1000,6 +1000,26 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) { + return _encode(input) + + ++ def SampleEncodeAndScoreAsPieces(self, input, num_samples=None, alpha=None, **kwargs): ++ return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, ++ out_type=str, **kwargs) ++ ++ ++ def SampleEncodeAndScoreAsIds(self, input, num_samples=None, alpha=None, **kwargs): ++ return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, ++ out_type=int, **kwargs) ++ ++ ++ def SampleEncodeAndScoreAsSerializedProto(self, input, num_samples=None, alpha=None, **kwargs): ++ return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, ++ out_type='serialized_proto', **kwargs) ++ ++ ++ def SampleEncodeAndScoreAsImmutableProto(self, input, num_samples=None, alpha=None, **kwargs): ++ return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha, ++ out_type='immutable_proto', **kwargs) ++ ++ + def Decode(self, input, out_type=str, num_threads=None): + """Decode processed id or token sequences. 
+ +diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py +index 92327ac..2b9ad28 100755 +--- a/python/test/sentencepiece_test.py ++++ b/python/test/sentencepiece_test.py +@@ -566,7 +566,7 @@ class TestSentencepieceProcessor(unittest.TestCase): + for n in sp.decode(results): + self.assertEqual(n, text) + +- # batch test ++ # batch test + results = sp.nbest_encode([text, text2], nbest_size=10, out_type=out_type) + self.assertEqual( + results, +@@ -589,6 +589,19 @@ class TestSentencepieceProcessor(unittest.TestCase): + for n in decoded: + self.assertEqual(n, text2) + ++ self.assertEqual( ++ sp.nbest_encode(text, nbest_size=10, out_type=str), ++ sp.nbest_encode_as_pieces(text, nbest_size=10)) ++ self.assertEqual( ++ sp.nbest_encode(text, nbest_size=10, out_type=int), ++ sp.nbest_encode_as_ids(text, nbest_size=10)) ++ self.assertEqual( ++ sp.nbest_encode(text, nbest_size=10, out_type='serialized_proto'), ++ sp.nbest_encode_as_serialized_proto(text, nbest_size=10)) ++ self.assertEqual( ++ sp.nbest_encode(text, nbest_size=10, out_type='immutable_proto'), ++ sp.nbest_encode_as_immutable_proto(text, nbest_size=10)) ++ + def test_sample_and_score(self): + sp = self.sp_ + text = 'hello world' +@@ -618,6 +631,11 @@ class TestSentencepieceProcessor(unittest.TestCase): + for n in results[1]: + self.assertEqual(sp.decode(n[0]), text2) + ++ sp.sample_encode_and_score_as_pieces(text, 10) ++ sp.sample_encode_and_score_as_ids(text, 10) ++ sp.sample_encode_and_score_as_immutable_proto(text, 10) ++ sp.sample_encode_and_score_as_serialized_proto(text, 10) ++ + def test_valid_range(self): + size = self.sp_.piece_size() + funcs = [ diff --git a/patches/0021-Fixed-errors-in-example-notebook.patch b/patches/0021-Fixed-errors-in-example-notebook.patch new file mode 100644 index 0000000..b6f4b02 --- /dev/null +++ b/patches/0021-Fixed-errors-in-example-notebook.patch @@ -0,0 +1,158 @@ +From: Aleksey Morozov <36787333+amrzv@users.noreply.github.com> +Date: Tue, 9 
Aug 2022 15:15:30 +0300 +Subject: Fixed errors in example notebook + +Signed-off-by: Kentaro Hayashi +--- + python/sentencepiece_python_module_example.ipynb | 44 ++++++++++-------------- + 1 file changed, 19 insertions(+), 25 deletions(-) + +diff --git a/python/sentencepiece_python_module_example.ipynb b/python/sentencepiece_python_module_example.ipynb +index 78464d1..1eb0f9c 100644 +--- a/python/sentencepiece_python_module_example.ipynb ++++ b/python/sentencepiece_python_module_example.ipynb +@@ -216,7 +216,7 @@ + "import tensorflow as tf\n", + "\n", + "# Assumes that m.model is stored in non-Posix file system.\n", +- "serialized_model_proto = tf.gfile.GFile('m.model', 'rb').read()\n", ++ "serialized_model_proto = tf.io.gfile.GFile('m.model', 'rb').read()\n", + "\n", + "sp = spm.SentencePieceProcessor()\n", + "sp.load_from_serialized_proto(serialized_model_proto)\n", +@@ -265,7 +265,7 @@ + }, + "cell_type": "code", + "source": [ +- "## Example of user defined symbols\n", ++ "# Example of user defined symbols\n", + "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_user --user_defined_symbols=, --vocab_size=2000')\n", + "\n", + "sp_user = spm.SentencePieceProcessor()\n", +@@ -307,7 +307,7 @@ + }, + "cell_type": "code", + "source": [ +- "## Example of control symbols\n", ++ "# Example of control symbols\n", + "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_ctrl --control_symbols=, --vocab_size=2000')\n", + "\n", + "sp_ctrl = spm.SentencePieceProcessor()\n", +@@ -564,7 +564,7 @@ + "spm.SentencePieceTrainer.train('--input=botchan.txt --vocab_size=2000 --model_prefix=m --unk_surface=__UNKNOWN__')\n", + "sp = spm.SentencePieceProcessor()\n", + "sp.load('m.model')\n", +- "print(sp.decode_ids([sp.unk_id()])) " ++ "print(sp.decode_ids([sp.unk_id()]))" + ], + "execution_count": 0, + "outputs": [ +@@ -608,7 +608,7 @@ + "# There are two hyperparamenters for sampling (nbest_size and inverse temperature). 
see the paper [kudo18] for detail.\n", + "for n in range(10):\n", + " print(sp.sample_encode_as_pieces('hello world', -1, 0.1))\n", +- " \n", ++ "\n", + "for n in range(10):\n", + " print(sp.sample_encode_as_ids('hello world', -1, 0.1))" + ], +@@ -858,8 +858,6 @@ + }, + "cell_type": "code", + "source": [ +- "import sentencepiece as spm\n", +- "\n", + "# NFKC normalization and lower casing.\n", + "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000 --normalization_rule_name=nfkc_cf')\n", + "\n", +@@ -903,11 +901,12 @@ + }, + "cell_type": "code", + "source": [ +- "def tocode(s): \n", +- " out = [] \n", +- " for c in s: \n", +- " out.append(str(hex(ord(c))).replace('0x', 'U+')) \n", +- " return ' '.join(out) \n", ++ "def tocode(s):\n", ++ " out = []\n", ++ " for c in s:\n", ++ " out.append(str(hex(ord(c))).replace('0x', 'U+'))\n", ++ " return ' '.join(out)\n", ++ "\n", + "\n", + "# TSV format: source Unicode code points target code points\n", + "# normalize \"don't => do not, I'm => I am\"\n", +@@ -923,7 +922,7 @@ + "# m.model embeds the normalization rule compiled into an FST.\n", + "sp.load('m.model')\n", + "print(sp.encode_as_pieces(\"I'm busy\")) # normalzied to `I am busy'\n", +- "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'\n" ++ "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'" + ], + "execution_count": 0, + "outputs": [ +@@ -1029,9 +1028,9 @@ + " for piece in sp.encode_as_pieces(line):\n", + " freq.setdefault(piece, 0)\n", + " freq[piece] += 1\n", +- " \n", ++ "\n", + "# only uses the token appearing more than 1000 times in the training data.\n", +- "vocabs = list(filter(lambda x : x in freq and freq[x] > 1000, vocabs))\n", ++ "vocabs = list(filter(lambda x: x in freq and freq[x] > 1000, vocabs))\n", + "sp.set_vocabulary(vocabs)\n", + "print(sp.encode_as_pieces('this is a test.'))\n", + "\n", +@@ -1133,20 +1132,17 @@ + }, + "cell_type": "code", + 
"source": [ +- "freq={}\n", ++ "freq = {}\n", + "with open('botchan.txt', 'r') as f:\n", + " for line in f:\n", + " line = line.rstrip()\n", + " for piece in line.split():\n", + " freq.setdefault(piece, 0)\n", + " freq[piece] += 1\n", +- " \n", ++ "\n", + "with open('word_freq_list.tsv', 'w') as f:\n", + " for k, v in freq.items():\n", + " f.write('%s\\t%d\\n' % (k, v))\n", +- " \n", +- "\n", +- "import sentencepiece as spm\n", + "\n", + "spm.SentencePieceTrainer.train('--input=word_freq_list.tsv --input_format=tsv --model_prefix=m --vocab_size=2000')\n", + "sp = spm.SentencePieceProcessor()\n", +@@ -1176,7 +1172,7 @@ + "\n", + "Sentencepiece keeps track of byte offset (span) of each token, which is useful for highlighting the token on top of unnormalized text.\n", + "\n", +- "We first need to install protobuf module and sentencepiece_pb2.py as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n", ++ "We first need to install protobuf module as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n", + "**encode_as_serialized_proto** method resturns serialized SentencePieceText proto. 
You can get the deserialized object by calling ParseFromString method.\n", + "\n", + "The definition of SentencePieceText proto is found [here](https://github.com/google/sentencepiece/blob/3be3f2e11e2bb923c579c6be5e7335809341587f/src/sentencepiece.proto#L23).\n" +@@ -1194,8 +1190,7 @@ + }, + "cell_type": "code", + "source": [ +- "!pip install protobuf\n", +- "!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py" ++ "!pip install protobuf" + ], + "execution_count": 0, + "outputs": [ +@@ -1233,8 +1228,7 @@ + }, + "cell_type": "code", + "source": [ +- "import sentencepiece_pb2\n", +- "import sentencepiece as spm\n", ++ "from sentencepiece import sentencepiece_pb2\n", + "\n", + "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000')\n", + "\n", diff --git a/patches/0022-Fix-dead-links.patch b/patches/0022-Fix-dead-links.patch new file mode 100644 index 0000000..914155d --- /dev/null +++ b/patches/0022-Fix-dead-links.patch @@ -0,0 +1,36 @@ +From: Aleksey Morozov <36787333+amrzv@users.noreply.github.com> +Date: Tue, 9 Aug 2022 15:15:51 +0300 +Subject: Fix dead links + +Signed-off-by: Kentaro Hayashi +--- + README.md | 2 +- + doc/experiments.md | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/README.md b/README.md +index 1986047..84e853e 100644 +--- a/README.md ++++ b/README.md +@@ -36,7 +36,7 @@ For those unfamiliar with SentencePiece as a software/algorithm, one can read [a + |:---|:---:|:---:|:---:| + |Supported algorithm|BPE, unigram, char, word|BPE|BPE*| + |OSS?|Yes|Yes|Google internal| +-|Subword regularization|[Yes](#subword-regularization)|No|No| ++|Subword regularization|[Yes](#subword-regularization-and-bpe-dropout)|No|No| + |Python Library (pip)|[Yes](python/README.md)|No|N/A| + |C++ Library|[Yes](doc/api.md)|No|N/A| + |Pre-segmentation required?|[No](#whitespace-is-treated-as-a-basic-symbol)|Yes|Yes| +diff --git a/doc/experiments.md b/doc/experiments.md +index 
5a58cd1..e088152 100644 +--- a/doc/experiments.md ++++ b/doc/experiments.md +@@ -112,7 +112,7 @@ We have evaluated SentencePiece segmentation with the following configurations. + * [KFTT](http://www.phontron.com/kftt/index.html) + * [MultiUN](http://opus.lingfil.uu.se/MultiUN.php) (First 5M and next + 5k/5k sentences are used for training and development/testing respectively.) +- * [WMT16](http://www.statmt.org/WMT16/) ++ * [WMT16](https://www.statmt.org/wmt16/) + * In-house: (Used 5M parallel sentences for training) + + **NoPretok** and **WsPretok** do not use any language-dependent resources. diff --git a/patches/0023-added-ShutdownLibrary-function-to-uninitialize-globa.patch b/patches/0023-added-ShutdownLibrary-function-to-uninitialize-globa.patch new file mode 100644 index 0000000..bd6116d --- /dev/null +++ b/patches/0023-added-ShutdownLibrary-function-to-uninitialize-globa.patch @@ -0,0 +1,176 @@ +From: Taku Kudo +Date: Sat, 20 Aug 2022 23:34:37 +0900 +Subject: added ShutdownLibrary function to uninitialize global variables + +Signed-off-by: Kentaro Hayashi +--- + src/compile_charsmap_main.cc | 1 + + src/error.cc | 3 +++ + src/init.h | 15 +++++++++++++++ + src/spm_decode_main.cc | 1 + + src/spm_encode_main.cc | 1 + + src/spm_export_vocab_main.cc | 6 +++--- + src/spm_normalize_main.cc | 1 + + src/spm_train_main.cc | 1 + + src/test_main.cc | 1 + + 9 files changed, 27 insertions(+), 3 deletions(-) + +diff --git a/src/compile_charsmap_main.cc b/src/compile_charsmap_main.cc +index 13bf822..da15328 100644 +--- a/src/compile_charsmap_main.cc ++++ b/src/compile_charsmap_main.cc +@@ -156,6 +156,7 @@ struct BinaryBlob { + } // namespace sentencepiece + + int main(int argc, char **argv) { ++ sentencepiece::ScopedResourceDestructor cleaner; + sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); + + const std::vector + + #include "common.h" ++#include "init.h" + #include "sentencepiece_processor.h" + + #ifdef _USE_EXTERNAL_ABSL +@@ -35,6 +36,7 @@ void Abort() 
{ + SetTestCounter(2); + } else { + std::cerr << "Program terminated with an unrecoverable error." << std::endl; ++ ShutdownLibrary(); + exit(-1); + } + } +@@ -43,6 +45,7 @@ void Exit(int code) { + if (GetTestCounter() == 1) { + SetTestCounter(2); + } else { ++ ShutdownLibrary(); + exit(code); + } + } +diff --git a/src/init.h b/src/init.h +index 090a2d9..7c75db2 100644 +--- a/src/init.h ++++ b/src/init.h +@@ -18,6 +18,7 @@ + #include "common.h" + #include "third_party/absl/flags/flag.h" + #include "third_party/absl/flags/parse.h" ++#include "third_party/protobuf-lite/google/protobuf/message_lite.h" + + ABSL_DECLARE_FLAG(int32, minloglevel); + +@@ -35,6 +36,20 @@ inline void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, + + logging::SetMinLogLevel(absl::GetFlag(FLAGS_minloglevel)); + } ++ ++inline void ShutdownLibrary() { ++ google::protobuf::ShutdownProtobufLibrary(); ++#ifdef HAS_ABSL_CLEANUP_FLAGS ++ absl::CleanupFlags(); ++#endif ++} ++ ++class ScopedResourceDestructor { ++ public: ++ ScopedResourceDestructor() {} ++ ~ScopedResourceDestructor() { ShutdownLibrary(); } ++}; ++ + } // namespace sentencepiece + + #endif // INIT_H_ +diff --git a/src/spm_decode_main.cc b/src/spm_decode_main.cc +index 3382ddc..bc49bd3 100644 +--- a/src/spm_decode_main.cc ++++ b/src/spm_decode_main.cc +@@ -34,6 +34,7 @@ ABSL_FLAG(std::string, extra_options, "", + "':' separated encoder extra options, e.g., \"reverse:bos:eos\""); + + int main(int argc, char *argv[]) { ++ sentencepiece::ScopedResourceDestructor cleaner; + sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); + std::vector rest_args; + +diff --git a/src/spm_encode_main.cc b/src/spm_encode_main.cc +index b0e508d..2fbb850 100644 +--- a/src/spm_encode_main.cc ++++ b/src/spm_encode_main.cc +@@ -51,6 +51,7 @@ ABSL_FLAG(bool, generate_vocabulary, false, + "Generates vocabulary file instead of segmentation"); + + int main(int argc, char *argv[]) { ++ sentencepiece::ScopedResourceDestructor 
cleaner; + sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); + std::vector rest_args; + +diff --git a/src/spm_export_vocab_main.cc b/src/spm_export_vocab_main.cc +index b5d93cb..e5b97df 100644 +--- a/src/spm_export_vocab_main.cc ++++ b/src/spm_export_vocab_main.cc +@@ -1,11 +1,10 @@ +- +- + // Copyright 2016 Google Inc. + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at +-// n// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, +@@ -29,6 +28,7 @@ ABSL_FLAG(std::string, output_format, "vocab", + "and scores, syms outputs pieces and indices."); + + int main(int argc, char *argv[]) { ++ sentencepiece::ScopedResourceDestructor cleaner; + sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); + + sentencepiece::SentencePieceProcessor sp; +diff --git a/src/spm_normalize_main.cc b/src/spm_normalize_main.cc +index 96da360..39f3ef9 100644 +--- a/src/spm_normalize_main.cc ++++ b/src/spm_normalize_main.cc +@@ -46,6 +46,7 @@ using sentencepiece::normalizer::Builder; + using sentencepiece::normalizer::Normalizer; + + int main(int argc, char *argv[]) { ++ sentencepiece::ScopedResourceDestructor cleaner; + sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); + std::vector rest_args; + +diff --git a/src/spm_train_main.cc b/src/spm_train_main.cc +index c34ee02..6ab634d 100644 +--- a/src/spm_train_main.cc ++++ b/src/spm_train_main.cc +@@ -157,6 +157,7 @@ ABSL_FLAG(std::uint64_t, differential_privacy_clipping_threshold, 0, + " clipping the counts for DP"); + + int main(int argc, char *argv[]) { ++ sentencepiece::ScopedResourceDestructor cleaner; + sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); + 
+ sentencepiece::TrainerSpec trainer_spec; +diff --git a/src/test_main.cc b/src/test_main.cc +index b3170e2..38c978d 100644 +--- a/src/test_main.cc ++++ b/src/test_main.cc +@@ -24,6 +24,7 @@ ABSL_FLAG(std::string, test_srcdir, "../data", "Data directory."); + ABSL_FLAG(std::string, test_tmpdir, "test_tmp", "Temporary directory."); + + int main(int argc, char **argv) { ++ sentencepiece::ScopedResourceDestructor cleaner; + sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); + sentencepiece::test::RunAllTests(); + return 0; diff --git a/patches/0024-Fixed-the-issue-of-concatinating-paths-for-pkg-confi.patch b/patches/0024-Fixed-the-issue-of-concatinating-paths-for-pkg-confi.patch new file mode 100644 index 0000000..9a54c2e --- /dev/null +++ b/patches/0024-Fixed-the-issue-of-concatinating-paths-for-pkg-confi.patch @@ -0,0 +1,153 @@ +From: Taku Kudo +Date: Sun, 21 Aug 2022 12:44:31 +0900 +Subject: Fixed the issue of concatinating paths for pkg-config + +Signed-off-by: Kentaro Hayashi +--- + CMakeLists.txt | 24 ++++++++++++++++++++++++ + sentencepiece.pc.in | 4 ++-- + third_party/absl/flags/flag.cc | 20 +++++++++++++++----- + third_party/absl/flags/flag.h | 10 ++++++++-- + 4 files changed, 49 insertions(+), 9 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 78379a3..382103b 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -94,6 +94,30 @@ if (NOT DEFINED CMAKE_INSTALL_INCDIR) + set(CMAKE_INSTALL_INCDIR include) + endif() + ++# SPDX-License-Identifier: (MIT OR CC0-1.0) ++# Copyright 2020 Jan Tojnar ++# https://github.com/jtojnar/cmake-snips ++# ++# Modelled after Python’s os.path.join ++# https://docs.python.org/3.7/library/os.path.html#os.path.join ++# Windows not supported ++function(join_paths joined_path first_path_segment) ++ set(temp_path "${first_path_segment}") ++ foreach(current_segment IN LISTS ARGN) ++ if(NOT ("${current_segment}" STREQUAL "")) ++ if(IS_ABSOLUTE "${current_segment}") ++ set(temp_path "${current_segment}") 
++ else() ++ set(temp_path "${temp_path}/${current_segment}") ++ endif() ++ endif() ++ endforeach() ++ set(${joined_path} "${temp_path}" PARENT_SCOPE) ++endfunction() ++ ++join_paths(libdir_for_pc_file "\${exec_prefix}" "${CMAKE_INSTALL_LIBDIR}") ++join_paths(includedir_for_pc_file "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}") ++ + configure_file("${PROJECT_SOURCE_DIR}/config.h.in" "config.h") + configure_file("${PROJECT_SOURCE_DIR}/sentencepiece.pc.in" "sentencepiece.pc" @ONLY) + +diff --git a/sentencepiece.pc.in b/sentencepiece.pc.in +index ac7fef6..6a5ba56 100644 +--- a/sentencepiece.pc.in ++++ b/sentencepiece.pc.in +@@ -1,7 +1,7 @@ + prefix=@prefix@ + exec_prefix=@exec_prefix@ +-libdir=@libdir@ +-includedir=@includedir@ ++libdir=@libdir_for_pc_file@ ++includedir=@includedir_for_pc_file@ + + Name: @PROJECT_NAME@ + Description: Unsupervised text tokenizer and detokenizer for Neural Network-based text generation. +diff --git a/third_party/absl/flags/flag.cc b/third_party/absl/flags/flag.cc +index 8e99c0d..5d6642a 100644 +--- a/third_party/absl/flags/flag.cc ++++ b/third_party/absl/flags/flag.cc +@@ -61,8 +61,8 @@ struct FlagFunc { + + namespace { + +-using FlagMap = std::map; +-using FlagList = std::vector; ++using FlagMap = std::map>; ++using FlagList = std::vector>; + + FlagMap *GetFlagMap() { + static auto *flag_map = new FlagMap; +@@ -111,7 +111,7 @@ std::string PrintHelp(const char *programname) { + os << PACKAGE_STRING << "\n\n"; + os << "Usage: " << programname << " [options] files\n\n"; + +- for (const auto *func : *GetFlagList()) { ++ for (auto func : *GetFlagList()) { + os << " --" << func->name << " (" << func->help << ")"; + os << " type: " << func->type << " default: " << func->default_value + << '\n'; +@@ -123,7 +123,7 @@ std::string PrintHelp(const char *programname) { + } + } // namespace + +-void RegisterFlag(const std::string &name, FlagFunc *func) { ++void RegisterFlag(const std::string &name, std::shared_ptr func) { + 
GetFlagList()->emplace_back(func); + GetFlagMap()->emplace(name, func); + } +@@ -140,7 +140,7 @@ Flag::Flag(const char *name, const char *type, const char *help, + func_->set_value = [this](const std::string &value) { + this->set_value_as_str(value); + }; +- RegisterFlag(name, func_.get()); ++ RegisterFlag(name, func_); + } + + template +@@ -219,4 +219,14 @@ std::vector ParseCommandLine(int argc, char *argv[]) { + + return output_args; + } ++ ++void CleanupFlags() { ++ static bool is_shutdown = false; ++ if (!is_shutdown) { ++ delete internal::GetFlagList(); ++ delete internal::GetFlagMap(); ++ is_shutdown = true; ++ } ++} ++ + } // namespace absl +diff --git a/third_party/absl/flags/flag.h b/third_party/absl/flags/flag.h +index e540edf..c522358 100644 +--- a/third_party/absl/flags/flag.h ++++ b/third_party/absl/flags/flag.h +@@ -24,7 +24,8 @@ namespace absl { + namespace internal { + struct FlagFunc; + +-void RegisterFlag(const std::string &name, FlagFunc *func); ++void RegisterFlag(const std::string &name, std::shared_ptr func); ++ + } // namespace internal + + template +@@ -39,7 +40,7 @@ class Flag { + + private: + T value_; +- std::unique_ptr func_; ++ std::shared_ptr func_; + }; + + template +@@ -52,6 +53,11 @@ void SetFlag(Flag *flag, const V &v) { + const T value(v); + flag->set_value(value); + } ++ ++#define HAS_ABSL_CLEANUP_FLAGS ++ ++void CleanupFlags(); ++ + } // namespace absl + + #define ABSL_FLAG(Type, name, defautl_value, help) \ diff --git a/patches/disable-static-library.patch b/patches/disable-static-library.patch new file mode 100644 index 0000000..7dba0b1 --- /dev/null +++ b/patches/disable-static-library.patch @@ -0,0 +1,44 @@ +From: Kentaro Hayashi +Date: Wed, 28 Oct 2020 20:55:20 +0900 +Subject: Disable static library explicitly + +--- + src/CMakeLists.txt | 11 +---------- + 1 file changed, 1 insertion(+), 10 deletions(-) + +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 6cb3922..e4a23ac 100644 +--- a/src/CMakeLists.txt ++++ 
b/src/CMakeLists.txt +@@ -204,12 +204,6 @@ if (SPM_ENABLE_SHARED) + add_library(sentencepiece_train SHARED ${SPM_TRAIN_SRCS}) + endif() + +-add_library(sentencepiece-static STATIC ${SPM_SRCS}) +-add_library(sentencepiece_train-static STATIC ${SPM_TRAIN_SRCS}) +- +-target_link_libraries(sentencepiece-static INTERFACE ${SPM_LIBS}) +-target_link_libraries(sentencepiece_train-static INTERFACE sentencepiece-static ${SPM_LIBS}) +- + if (SPM_ENABLE_SHARED) + target_link_libraries(sentencepiece ${SPM_LIBS}) + target_link_libraries(sentencepiece_train ${SPM_LIBS} sentencepiece) +@@ -220,7 +214,7 @@ if (SPM_ENABLE_SHARED) + (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "sh4")) + list(APPEND SPM_LIBS "atomic") + endif() +- set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train sentencepiece-static sentencepiece_train-static) ++ set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train) + set_target_properties(sentencepiece sentencepiece_train PROPERTIES SOVERSION 0 VERSION 0.0.0) + set_target_properties(sentencepiece PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES) + set_target_properties(sentencepiece_train PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES) +@@ -237,9 +231,6 @@ else() + set(SPM_INSTALLTARGETS sentencepiece-static sentencepiece_train-static) + endif() + +-set_target_properties(sentencepiece-static PROPERTIES OUTPUT_NAME "sentencepiece") +-set_target_properties(sentencepiece_train-static PROPERTIES OUTPUT_NAME "sentencepiece_train") +- + if (NOT MSVC) + if (SPM_COVERAGE) + set(CMAKE_CXX_FLAGS "-O0 -Wall -fPIC -coverage ${CMAKE_CXX_FLAGS}") diff --git a/patches/header-dependencies.patch b/patches/header-dependencies.patch new file mode 100644 index 0000000..2823de7 --- /dev/null +++ b/patches/header-dependencies.patch @@ -0,0 +1,27 @@ +From: Kentaro Hayashi +Date: Mon, 21 Nov 2022 22:17:18 +0900 +Subject: Include necessary headers to ensure IS_BIG_ENDIAN is defined + +normalizer.h uses IS_BIG_ENDIAN, which is defined in util.h. +Include util.h here. 
+ +Author: Steve Langasek +Last-Update: 2022-08-27 +Forwarded: no +Bug-Debian: https://bugs.debian.org/1017360 +--- + src/normalizer.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/src/normalizer.h b/src/normalizer.h +index c79813c..37fdb8a 100644 +--- a/src/normalizer.h ++++ b/src/normalizer.h +@@ -22,6 +22,7 @@ + #include + + #include "common.h" ++#include "util.h" + #include "sentencepiece_model.pb.h" + #include "sentencepiece_processor.h" + #include "third_party/absl/strings/string_view.h" diff --git a/patches/series b/patches/series new file mode 100644 index 0000000..c300531 --- /dev/null +++ b/patches/series @@ -0,0 +1,27 @@ +0001-update-python-wrapper.patch +0002-remove-debug-symbols-from-wheel-package.patch +0003-allow-tab-character-to-be-used-in-user_defined_symbo.patch +0004-add-test-to-use-tab-as-user-defined-symbols.patch +0005-Uses-C-17-by-default.patch +0006-Uses-std-atomic-to-define-global-variable.patch +0007-Fix-a-typo.patch +0008-Uses-absl-string_view-as-much-as-possible.patch +0009-Fixed-build-break.patch +0010-Added-ImmutableSentencePiece-class.patch +0011-add-verbose-option.patch +0012-Supports-ImmutableSentencePieceText-from-python-modu.patch +0013-Adds-more-unittests.patch +0014-Adds-SWIGPYTHON-flag.patch +0015-remove-unused-ifdef-SWIG-macro.patch +0016-Fixed-test-failure.patch +0017-Uses-property-in-immutable-proto.patch +0018-automatically-detect-the-number-of-CPUs-in-batch-pro.patch +0019-support-slice-in-pieces-nbests-objects.patch +0020-Updated-the-document.patch +0021-Fixed-errors-in-example-notebook.patch +0022-Fix-dead-links.patch +0023-added-ShutdownLibrary-function-to-uninitialize-globa.patch +0024-Fixed-the-issue-of-concatinating-paths-for-pkg-confi.patch +disable-static-library.patch +support-python-module-in-place.patch +header-dependencies.patch diff --git a/patches/support-python-module-in-place.patch b/patches/support-python-module-in-place.patch new file mode 100644 index 0000000..a6ab729 --- /dev/null +++ 
b/patches/support-python-module-in-place.patch @@ -0,0 +1,58 @@ +From: Kentaro Hayashi +Date: Mon, 21 Nov 2022 22:13:33 +0900 +Subject: Support to build Python module without pkg-config + +--- + python/setup.py | 36 ++++++++++++++++++++---------------- + 1 file changed, 20 insertions(+), 16 deletions(-) + +diff --git a/python/setup.py b/python/setup.py +index fdf9394..5170d9a 100755 +--- a/python/setup.py ++++ b/python/setup.py +@@ -77,25 +77,29 @@ class build_ext(_build_ext): + """Override build_extension to run cmake.""" + + def build_extension(self, ext): +- cflags, libs = get_cflags_and_libs('../build/root') +- if len(libs) == 0: +- cflags, libs = get_cflags_and_libs('./bundled/root') +- +- if len(libs) == 0: +- if is_sentencepiece_installed(): +- cflags = cflags + run_pkg_config('cflags') +- libs = run_pkg_config('libs') +- else: +- subprocess.check_call(['./build_bundled.sh', __version__]) +- cflags, libs = get_cflags_and_libs('./bundled/root') ++ # cflags, libs = get_cflags_and_libs('../build/root') ++ # if len(libs) == 0: ++ # cflags, libs = get_cflags_and_libs('./bundled/root') ++ ++ # if len(libs) == 0: ++ # if is_sentencepiece_installed(): ++ # cflags = cflags + run_pkg_config('cflags') ++ # libs = run_pkg_config('libs') ++ # else: ++ # subprocess.check_call(['./build_bundled.sh', __version__]) ++ # cflags, libs = get_cflags_and_libs('./bundled/root') + + # Fix compile on some versions of Mac OSX + # See: https://github.com/neulab/xnmt/issues/199 +- if sys.platform == 'darwin': +- cflags.append('-mmacosx-version-min=10.9') +- else: +- cflags.append('-Wl,-strip-all') +- libs.append('-Wl,-strip-all') ++ # if sys.platform == 'darwin': ++ # cflags.append('-mmacosx-version-min=10.9') ++ # else: ++ # cflags.append('-Wl,-strip-all') ++ # libs.append('-Wl,-strip-all') ++ cflags = ['-I../src'] ++ cmd = "dpkg-architecture -q DEB_HOST_GNU_TYPE" # debhelper's cmake build dir is obj-<DEB_HOST_GNU_TYPE> ++ arch = subprocess.check_output(cmd, shell=True).decode("utf-8").strip().split()[0] ++ libs = ["-L../obj-%s/src" % 
arch, "-lsentencepiece", "-lsentencepiece_train"] + print('## cflags={}'.format(' '.join(cflags))) + print('## libs={}'.format(' '.join(libs))) + ext.extra_compile_args = cflags diff --git a/python3-sentencepiece.install b/python3-sentencepiece.install new file mode 100644 index 0000000..0cde274 --- /dev/null +++ b/python3-sentencepiece.install @@ -0,0 +1 @@ +usr/lib/python3.*/ diff --git a/rules b/rules new file mode 100755 index 0000000..e0dcf54 --- /dev/null +++ b/rules @@ -0,0 +1,41 @@ +#!/usr/bin/make -f +# -*- makefile -*- +# Sample debian/rules that uses debhelper. +# This file was originally written by Joey Hess and Craig Small. +# As a special exception, when this file is copied by dh-make into a +# dh-make output file, you may use that output file without restriction. +# This special exception was added by Craig Small in version 0.37 of dh-make. + +# Uncomment this to turn on verbose mode. +#export DH_VERBOSE=1 +export DEB_BUILD_MAINT_OPTIONS = hardening=+all +DPKG_EXPORT_BUILDFLAGS = 1 +include /usr/share/dpkg/buildflags.mk + +ifneq (,$(filter $(DEB_HOST_ARCH), armel mipsel m68k powerpc sh4)) + export DEB_LDFLAGS_MAINT_APPEND += -Wl,--no-as-needed -latomic -Wl,--as-needed +endif + +%: + dh $@ --with python3 --buildsystem=cmake + +override_dh_auto_configure: + dh_auto_configure --buildsystem=cmake + dh_auto_configure --sourcedirectory=python --buildsystem=pybuild + +override_dh_auto_build: + dh_auto_build --buildsystem=cmake + dh_auto_build --sourcedirectory=python --buildsystem=pybuild + +override_dh_auto_install: basedir=$(shell pwd)/debian +override_dh_auto_install: + dh_auto_install --buildsystem=cmake + dh_auto_install --sourcedirectory=python --buildsystem=pybuild + +override_dh_auto_clean: + dh_auto_clean --buildsystem=cmake + -rm -rf .pybuild + -rm -rf python/sentencepiece.egg-info python/src/sentencepiece.egg-info + +# Do not run the test suite. 
+override_dh_auto_test: diff --git a/salsa-ci.yml b/salsa-ci.yml new file mode 100644 index 0000000..1d8d33b --- /dev/null +++ b/salsa-ci.yml @@ -0,0 +1,7 @@ +--- +include: + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml + +reprotest: + allow_failure: true diff --git a/sentencepiece.docs b/sentencepiece.docs new file mode 100644 index 0000000..8d15174 --- /dev/null +++ b/sentencepiece.docs @@ -0,0 +1 @@ +doc/*.md diff --git a/sentencepiece.install b/sentencepiece.install new file mode 100644 index 0000000..1df36c6 --- /dev/null +++ b/sentencepiece.install @@ -0,0 +1 @@ +usr/bin/* diff --git a/sentencepiece.xml b/sentencepiece.xml new file mode 100644 index 0000000..2a81db2 --- /dev/null +++ b/sentencepiece.xml @@ -0,0 +1,291 @@ + +.
will be generated. You may view the +manual page with: nroff -man .
| less'. A typical entry +in a Makefile or Makefile.am is: + +DB2MAN = /usr/share/sgml/docbook/stylesheet/xsl/nwalsh/manpages/docbook.xsl +XP = xsltproc -''-nonet -''-param man.charmap.use.subset "0" + +manpage.1: manpage.xml + $(XP) $(DB2MAN) $< + +The xsltproc binary is found in the xsltproc package. The XSL files are in +docbook-xsl. A description of the parameters you can use can be found in the +docbook-xsl-doc-* packages. Please remember that if you create the nroff +version in one of the debian/rules file targets (such as build), you will need +to include xsltproc and docbook-xsl in your Build-Depends control field. +Alternatively use the xmlto command/package. That will also automatically +pull in xsltproc and docbook-xsl. + +Notes for using docbook2x: docbook2x-man does not automatically create the +AUTHOR(S) and COPYRIGHT sections. In this case, please add them manually as + ... . + +To disable the automatic creation of the AUTHOR(S) and COPYRIGHT sections +read /usr/share/doc/docbook-xsl/doc/manpages/authors.html. This file can be +found in the docbook-xsl-doc-html package. + +Validation can be done using: `xmllint -''-noout -''-valid manpage.xml` + +General documentation about man-pages and man-page-formatting: +man(1), man(7), http://www.tldp.org/HOWTO/Man-Page/ + +--> + + + + + + + + + + + + + +]> + + + + &dhtitle; + &dhpackage; + + + &dhfirstname; + &dhsurname; + Wrote this manpage for the Debian system. +
+ &dhemail; +
+
+
+ + 2007 + &dhusername; + + + This manual page was written for the Debian system + (but may be used by others). + Permission is granted to copy, distribute and/or modify this + document under the terms of the GNU General Public License, + Version 2 or (at your option) any later version published by + the Free Software Foundation. + On Debian systems, the complete text of the GNU General Public + License can be found in + /usr/share/common-licenses/GPL. + +
+ + &dhucpackage; + &dhsection; + + + &dhpackage; + program to do something + + + + &dhpackage; + + + + + + + + + this + + + + + + + + this + that + + + + + &dhpackage; + + + + + + + + + + + + + + + + + + + DESCRIPTION + This manual page documents briefly the + &dhpackage; and bar + commands. + This manual page was written for the Debian distribution + because the original program does not have a manual page. + Instead, it has documentation in the GNU + info + 1 + format; see below. + &dhpackage; is a program that... + + + OPTIONS + The program follows the usual GNU command line syntax, + with long options starting with two dashes (`-'). A summary of + options is included below. For a complete description, see the + + info + 1 + files. + + + + + + + Does this and that. + + + + + + + Show summary of options. + + + + + + + Show version of program. + + + + + + FILES + + + /etc/foo.conf + + The system-wide configuration file to control the + behaviour of &dhpackage;. See + + foo.conf + 5 + for further details. + + + + ${HOME}/.foo.conf + + The per-user configuration file to control the + behaviour of &dhpackage;. See + + foo.conf + 5 + for further details. + + + + + + ENVIRONMENT + + + FOO_CONF + + If used, the defined file is used as configuration + file (see also ). + + + + + + DIAGNOSTICS + The following diagnostics may be issued + on stderr: + + + Bad configuration file. Exiting. + + The configuration file seems to contain a broken configuration + line. Use the option, to get more info. + + + + + &dhpackage; provides some return codes, that can + be used in scripts: + + Code + Diagnostic + + 0 + Program exited successfully. + + + 1 + The configuration file seems to be broken. + + + + + + BUGS + The program is currently limited to only work + with the foobar library. + The upstream's BTS can be found + at . 
+ + + SEE ALSO + + + bar + 1 + , + baz + 1 + , + foo.conf + 5 + + The programs are documented fully by The Rise and + Fall of a Fooish Bar available via the + info + 1 + system. + +
+ diff --git a/source/format b/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/watch b/watch new file mode 100644 index 0000000..336e9c8 --- /dev/null +++ b/watch @@ -0,0 +1,4 @@ +version=4 +opts="filenamemangle=s%(?:.*?)?v?(\d[\d.]*)\.tar\.gz%sentencepiece-$1-Source.tar.xz%" \ + https://github.com/google/sentencepiece/tags \ + (?:.*?/)?v(\d[\d.]*)\.tar\.gz debian uupdate -- 2.30.2