From: Aleksey Morozov <36787333+amrzv@users.noreply.github.com>
Date: Tue, 9 Aug 2022 12:15:30 +0000 (+0300)
Subject: Fixed errors in example notebook
X-Git-Tag: archive/raspbian/0.1.97-3+rpi1^2~7
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=56bf7ff5123715c6e76ffaa99d488ae4e9a85cb6;p=sentencepiece.git

Fixed errors in example notebook

Signed-off-by: Kentaro Hayashi

Gbp-Pq: Name 0021-Fixed-errors-in-example-notebook.patch
---

diff --git a/python/sentencepiece_python_module_example.ipynb b/python/sentencepiece_python_module_example.ipynb
index 78464d1..1eb0f9c 100644
--- a/python/sentencepiece_python_module_example.ipynb
+++ b/python/sentencepiece_python_module_example.ipynb
@@ -216,7 +216,7 @@
 "import tensorflow as tf\n",
 "\n",
 "# Assumes that m.model is stored in non-Posix file system.\n",
- "serialized_model_proto = tf.gfile.GFile('m.model', 'rb').read()\n",
+ "serialized_model_proto = tf.io.gfile.GFile('m.model', 'rb').read()\n",
 "\n",
 "sp = spm.SentencePieceProcessor()\n",
 "sp.load_from_serialized_proto(serialized_model_proto)\n",
@@ -265,7 +265,7 @@
 },
 "cell_type": "code",
 "source": [
- "## Example of user defined symbols\n",
+ "# Example of user defined symbols\n",
 "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_user --user_defined_symbols=, --vocab_size=2000')\n",
 "\n",
 "sp_user = spm.SentencePieceProcessor()\n",
@@ -307,7 +307,7 @@
 },
 "cell_type": "code",
 "source": [
- "## Example of control symbols\n",
+ "# Example of control symbols\n",
 "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_ctrl --control_symbols=, --vocab_size=2000')\n",
 "\n",
 "sp_ctrl = spm.SentencePieceProcessor()\n",
@@ -564,7 +564,7 @@
 "spm.SentencePieceTrainer.train('--input=botchan.txt --vocab_size=2000 --model_prefix=m --unk_surface=__UNKNOWN__')\n",
 "sp = spm.SentencePieceProcessor()\n",
 "sp.load('m.model')\n",
- "print(sp.decode_ids([sp.unk_id()])) "
+ "print(sp.decode_ids([sp.unk_id()]))"
 ],
 "execution_count": 0,
 "outputs": [
@@ -608,7 +608,7 @@
 "# There are two hyperparamenters for sampling (nbest_size and inverse temperature). see the paper [kudo18] for detail.\n",
 "for n in range(10):\n",
 " print(sp.sample_encode_as_pieces('hello world', -1, 0.1))\n",
- " \n",
+ "\n",
 "for n in range(10):\n",
 " print(sp.sample_encode_as_ids('hello world', -1, 0.1))"
 ],
@@ -858,8 +858,6 @@
 },
 "cell_type": "code",
 "source": [
- "import sentencepiece as spm\n",
- "\n",
 "# NFKC normalization and lower casing.\n",
 "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000 --normalization_rule_name=nfkc_cf')\n",
 "\n",
@@ -903,11 +901,12 @@
 },
 "cell_type": "code",
 "source": [
- "def tocode(s): \n",
- " out = [] \n",
- " for c in s: \n",
- " out.append(str(hex(ord(c))).replace('0x', 'U+')) \n",
- " return ' '.join(out) \n",
+ "def tocode(s):\n",
+ " out = []\n",
+ " for c in s:\n",
+ " out.append(str(hex(ord(c))).replace('0x', 'U+'))\n",
+ " return ' '.join(out)\n",
+ "\n",
 "\n",
 "# TSV format: source Unicode code points target code points\n",
 "# normalize \"don't => do not, I'm => I am\"\n",
@@ -923,7 +922,7 @@
 "# m.model embeds the normalization rule compiled into an FST.\n",
 "sp.load('m.model')\n",
 "print(sp.encode_as_pieces(\"I'm busy\")) # normalzied to `I am busy'\n",
- "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'\n"
+ "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'"
 ],
 "execution_count": 0,
 "outputs": [
@@ -1029,9 +1028,9 @@
 " for piece in sp.encode_as_pieces(line):\n",
 " freq.setdefault(piece, 0)\n",
 " freq[piece] += 1\n",
- " \n",
+ "\n",
 "# only uses the token appearing more than 1000 times in the training data.\n",
- "vocabs = list(filter(lambda x : x in freq and freq[x] > 1000, vocabs))\n",
+ "vocabs = list(filter(lambda x: x in freq and freq[x] > 1000, vocabs))\n",
 "sp.set_vocabulary(vocabs)\n",
 "print(sp.encode_as_pieces('this is a test.'))\n",
 "\n",
@@ -1133,20 +1132,17 @@
 },
 "cell_type": "code",
 "source": [
- "freq={}\n",
+ "freq = {}\n",
 "with open('botchan.txt', 'r') as f:\n",
 " for line in f:\n",
 " line = line.rstrip()\n",
 " for piece in line.split():\n",
 " freq.setdefault(piece, 0)\n",
 " freq[piece] += 1\n",
- " \n",
+ "\n",
 "with open('word_freq_list.tsv', 'w') as f:\n",
 " for k, v in freq.items():\n",
 " f.write('%s\\t%d\\n' % (k, v))\n",
- " \n",
- "\n",
- "import sentencepiece as spm\n",
 "\n",
 "spm.SentencePieceTrainer.train('--input=word_freq_list.tsv --input_format=tsv --model_prefix=m --vocab_size=2000')\n",
 "sp = spm.SentencePieceProcessor()\n",
@@ -1176,7 +1172,7 @@
 "\n",
 "Sentencepiece keeps track of byte offset (span) of each token, which is useful for highlighting the token on top of unnormalized text.\n",
 "\n",
- "We first need to install protobuf module and sentencepiece_pb2.py as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
+ "We first need to install protobuf module as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
 "**encode_as_serialized_proto** method resturns serialized SentencePieceText proto. You can get the deserialized object by calling ParseFromString method.\n",
 "\n",
 "The definition of SentencePieceText proto is found [here](https://github.com/google/sentencepiece/blob/3be3f2e11e2bb923c579c6be5e7335809341587f/src/sentencepiece.proto#L23).\n"
@@ -1194,8 +1190,7 @@
 },
 "cell_type": "code",
 "source": [
- "!pip install protobuf\n",
- "!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py"
+ "!pip install protobuf"
 ],
 "execution_count": 0,
 "outputs": [
@@ -1233,8 +1228,7 @@
 },
 "cell_type": "code",
 "source": [
- "import sentencepiece_pb2\n",
- "import sentencepiece as spm\n",
+ "from sentencepiece import sentencepiece_pb2\n",
 "\n",
 "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000')\n",