"import tensorflow as tf\n",
"\n",
"# Assumes that m.model is stored in non-Posix file system.\n",
- "serialized_model_proto = tf.gfile.GFile('m.model', 'rb').read()\n",
+ "serialized_model_proto = tf.io.gfile.GFile('m.model', 'rb').read()\n",
"\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load_from_serialized_proto(serialized_model_proto)\n",
},
"cell_type": "code",
"source": [
- "## Example of user defined symbols\n",
+ "# Example of user defined symbols\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_user --user_defined_symbols=<sep>,<cls> --vocab_size=2000')\n",
"\n",
"sp_user = spm.SentencePieceProcessor()\n",
},
"cell_type": "code",
"source": [
- "## Example of control symbols\n",
+ "# Example of control symbols\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_ctrl --control_symbols=<sep>,<cls> --vocab_size=2000')\n",
"\n",
"sp_ctrl = spm.SentencePieceProcessor()\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --vocab_size=2000 --model_prefix=m --unk_surface=__UNKNOWN__')\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load('m.model')\n",
- "print(sp.decode_ids([sp.unk_id()])) "
+ "print(sp.decode_ids([sp.unk_id()]))"
],
"execution_count": 0,
"outputs": [
"# There are two hyperparamenters for sampling (nbest_size and inverse temperature). see the paper [kudo18] for detail.\n",
"for n in range(10):\n",
" print(sp.sample_encode_as_pieces('hello world', -1, 0.1))\n",
- " \n",
+ "\n",
"for n in range(10):\n",
" print(sp.sample_encode_as_ids('hello world', -1, 0.1))"
],
},
"cell_type": "code",
"source": [
- "import sentencepiece as spm\n",
- "\n",
"# NFKC normalization and lower casing.\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000 --normalization_rule_name=nfkc_cf')\n",
"\n",
},
"cell_type": "code",
"source": [
- "def tocode(s): \n",
- " out = [] \n",
- " for c in s: \n",
- " out.append(str(hex(ord(c))).replace('0x', 'U+')) \n",
- " return ' '.join(out) \n",
+ "def tocode(s):\n",
+ " out = []\n",
+ " for c in s:\n",
+ " out.append(str(hex(ord(c))).replace('0x', 'U+'))\n",
+ " return ' '.join(out)\n",
+ "\n",
"\n",
"# TSV format: source Unicode code points <tab> target code points\n",
"# normalize \"don't => do not, I'm => I am\"\n",
"# m.model embeds the normalization rule compiled into an FST.\n",
"sp.load('m.model')\n",
"print(sp.encode_as_pieces(\"I'm busy\")) # normalzied to `I am busy'\n",
- "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'\n"
+ "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'"
],
"execution_count": 0,
"outputs": [
" for piece in sp.encode_as_pieces(line):\n",
" freq.setdefault(piece, 0)\n",
" freq[piece] += 1\n",
- " \n",
+ "\n",
"# only uses the token appearing more than 1000 times in the training data.\n",
- "vocabs = list(filter(lambda x : x in freq and freq[x] > 1000, vocabs))\n",
+ "vocabs = list(filter(lambda x: x in freq and freq[x] > 1000, vocabs))\n",
"sp.set_vocabulary(vocabs)\n",
"print(sp.encode_as_pieces('this is a test.'))\n",
"\n",
},
"cell_type": "code",
"source": [
- "freq={}\n",
+ "freq = {}\n",
"with open('botchan.txt', 'r') as f:\n",
" for line in f:\n",
" line = line.rstrip()\n",
" for piece in line.split():\n",
" freq.setdefault(piece, 0)\n",
" freq[piece] += 1\n",
- " \n",
+ "\n",
"with open('word_freq_list.tsv', 'w') as f:\n",
" for k, v in freq.items():\n",
" f.write('%s\\t%d\\n' % (k, v))\n",
- " \n",
- "\n",
- "import sentencepiece as spm\n",
"\n",
"spm.SentencePieceTrainer.train('--input=word_freq_list.tsv --input_format=tsv --model_prefix=m --vocab_size=2000')\n",
"sp = spm.SentencePieceProcessor()\n",
"\n",
"Sentencepiece keeps track of byte offset (span) of each token, which is useful for highlighting the token on top of unnormalized text.\n",
"\n",
- "We first need to install protobuf module and sentencepiece_pb2.py as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
+ "We first need to install protobuf module as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
"**encode_as_serialized_proto** method resturns serialized SentencePieceText proto. You can get the deserialized object by calling ParseFromString method.\n",
"\n",
"The definition of SentencePieceText proto is found [here](https://github.com/google/sentencepiece/blob/3be3f2e11e2bb923c579c6be5e7335809341587f/src/sentencepiece.proto#L23).\n"
},
"cell_type": "code",
"source": [
- "!pip install protobuf\n",
- "!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py"
+ "!pip install protobuf"
],
"execution_count": 0,
"outputs": [
},
"cell_type": "code",
"source": [
- "import sentencepiece_pb2\n",
- "import sentencepiece as spm\n",
+ "from sentencepiece import sentencepiece_pb2\n",
"\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000')\n",
"\n",