Fixed errors in example notebook
author Aleksey Morozov <36787333+amrzv@users.noreply.github.com>
Tue, 9 Aug 2022 12:15:30 +0000 (15:15 +0300)
committer Kentaro Hayashi <kenhys@xdump.org>
Mon, 21 Nov 2022 13:43:46 +0000 (13:43 +0000)
Signed-off-by: Kentaro Hayashi <kenhys@gmail.com>
Gbp-Pq: Name 0021-Fixed-errors-in-example-notebook.patch

python/sentencepiece_python_module_example.ipynb

index 78464d1f43355d7987b40f0e6ace92ef53785b5d..1eb0f9cef7a72dca6c9592e5ef5d997066f3944d 100644 (file)
         "import tensorflow as tf\n",
         "\n",
         "# Assumes that m.model is stored in non-Posix file system.\n",
-        "serialized_model_proto = tf.gfile.GFile('m.model', 'rb').read()\n",
+        "serialized_model_proto = tf.io.gfile.GFile('m.model', 'rb').read()\n",
         "\n",
         "sp = spm.SentencePieceProcessor()\n",
         "sp.load_from_serialized_proto(serialized_model_proto)\n",
       },
       "cell_type": "code",
       "source": [
-        "## Example of user defined symbols\n",
+        "# Example of user defined symbols\n",
         "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_user --user_defined_symbols=<sep>,<cls> --vocab_size=2000')\n",
         "\n",
         "sp_user = spm.SentencePieceProcessor()\n",
       },
       "cell_type": "code",
       "source": [
-        "## Example of control symbols\n",
+        "# Example of control symbols\n",
         "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_ctrl --control_symbols=<sep>,<cls> --vocab_size=2000')\n",
         "\n",
         "sp_ctrl = spm.SentencePieceProcessor()\n",
         "spm.SentencePieceTrainer.train('--input=botchan.txt --vocab_size=2000 --model_prefix=m --unk_surface=__UNKNOWN__')\n",
         "sp = spm.SentencePieceProcessor()\n",
         "sp.load('m.model')\n",
-        "print(sp.decode_ids([sp.unk_id()])) "
+        "print(sp.decode_ids([sp.unk_id()]))"
       ],
       "execution_count": 0,
       "outputs": [
         "# There are two hyperparamenters for sampling (nbest_size and inverse temperature). see the paper [kudo18] for detail.\n",
         "for n in range(10):\n",
         "  print(sp.sample_encode_as_pieces('hello world', -1, 0.1))\n",
-        "  \n",
+        "\n",
         "for n in range(10):\n",
         "  print(sp.sample_encode_as_ids('hello world', -1, 0.1))"
       ],
       },
       "cell_type": "code",
       "source": [
-        "import sentencepiece as spm\n",
-        "\n",
         "# NFKC normalization and lower casing.\n",
         "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000 --normalization_rule_name=nfkc_cf')\n",
         "\n",
       },
       "cell_type": "code",
       "source": [
-        "def tocode(s):                                                                               \n",
-        "    out = []                                                                                 \n",
-        "    for c in s:                                                                              \n",
-        "        out.append(str(hex(ord(c))).replace('0x', 'U+'))                                     \n",
-        "    return ' '.join(out)          \n",
+        "def tocode(s):\n",
+        "    out = []\n",
+        "    for c in s:\n",
+        "        out.append(str(hex(ord(c))).replace('0x', 'U+'))\n",
+        "    return ' '.join(out)\n",
+        "\n",
         "\n",
         "# TSV format:  source Unicode code points <tab> target code points\n",
         "# normalize \"don't => do not,  I'm => I am\"\n",
         "# m.model embeds the normalization rule compiled into an FST.\n",
         "sp.load('m.model')\n",
         "print(sp.encode_as_pieces(\"I'm busy\"))  # normalzied to `I am busy'\n",
-        "print(sp.encode_as_pieces(\"I don't know it.\"))  # normalized to 'I do not know it.'\n"
+        "print(sp.encode_as_pieces(\"I don't know it.\"))  # normalized to 'I do not know it.'"
       ],
       "execution_count": 0,
       "outputs": [
         "        for piece in sp.encode_as_pieces(line):\n",
         "            freq.setdefault(piece, 0)\n",
         "            freq[piece] += 1\n",
-        "            \n",
+        "\n",
         "# only uses the token appearing more than 1000 times in the training data.\n",
-        "vocabs = list(filter(lambda x : x in freq and freq[x] > 1000, vocabs))\n",
+        "vocabs = list(filter(lambda x: x in freq and freq[x] > 1000, vocabs))\n",
         "sp.set_vocabulary(vocabs)\n",
         "print(sp.encode_as_pieces('this is a test.'))\n",
         "\n",
       },
       "cell_type": "code",
       "source": [
-        "freq={}\n",
+        "freq = {}\n",
         "with open('botchan.txt', 'r') as f:\n",
         "  for line in f:\n",
         "    line = line.rstrip()\n",
         "    for piece in line.split():\n",
         "      freq.setdefault(piece, 0)\n",
         "      freq[piece] += 1\n",
-        "            \n",
+        "\n",
         "with open('word_freq_list.tsv', 'w') as f:\n",
         "  for k, v in freq.items():\n",
         "    f.write('%s\\t%d\\n' % (k, v))\n",
-        "  \n",
-        "\n",
-        "import sentencepiece as spm\n",
         "\n",
         "spm.SentencePieceTrainer.train('--input=word_freq_list.tsv --input_format=tsv --model_prefix=m --vocab_size=2000')\n",
         "sp = spm.SentencePieceProcessor()\n",
         "\n",
         "Sentencepiece keeps track of byte offset (span) of each token, which is useful for highlighting the token on top of unnormalized text.\n",
         "\n",
-        "We first need to install protobuf module and sentencepiece_pb2.py as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
+        "We first need to install protobuf module as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
         "**encode_as_serialized_proto** method resturns serialized SentencePieceText proto. You can get the deserialized object by calling ParseFromString method.\n",
         "\n",
         "The definition of SentencePieceText proto is found [here](https://github.com/google/sentencepiece/blob/3be3f2e11e2bb923c579c6be5e7335809341587f/src/sentencepiece.proto#L23).\n"
       },
       "cell_type": "code",
       "source": [
-        "!pip install protobuf\n",
-        "!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py"
+        "!pip install protobuf"
       ],
       "execution_count": 0,
       "outputs": [
       },
       "cell_type": "code",
       "source": [
-        "import sentencepiece_pb2\n",
-        "import sentencepiece as spm\n",
+        "from sentencepiece import sentencepiece_pb2\n",
         "\n",
         "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000')\n",
         "\n",