"import tensorflow as tf\n",
"\n",
"# Assumes that m.model is stored in non-Posix file system.\n",
- "serialized_model_proto = tf.gfile.GFile('m.model', 'rb').read()\n",
+ "serialized_model_proto = tf.io.gfile.GFile('m.model', 'rb').read()\n",
"\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load_from_serialized_proto(serialized_model_proto)\n",
},
"cell_type": "code",
"source": [
- "## Example of user defined symbols\n",
+ "# Example of user defined symbols\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_user --user_defined_symbols=<sep>,<cls> --vocab_size=2000')\n",
"\n",
"sp_user = spm.SentencePieceProcessor()\n",
},
"cell_type": "code",
"source": [
- "## Example of control symbols\n",
+ "# Example of control symbols\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_ctrl --control_symbols=<sep>,<cls> --vocab_size=2000')\n",
"\n",
"sp_ctrl = spm.SentencePieceProcessor()\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --vocab_size=2000 --model_prefix=m --unk_surface=__UNKNOWN__')\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load('m.model')\n",
- "print(sp.decode_ids([sp.unk_id()])) "
+ "print(sp.decode_ids([sp.unk_id()]))"
],
"execution_count": 0,
"outputs": [
"# There are two hyperparamenters for sampling (nbest_size and inverse temperature). see the paper [kudo18] for detail.\n",
"for n in range(10):\n",
" print(sp.sample_encode_as_pieces('hello world', -1, 0.1))\n",
- " \n",
+ "\n",
"for n in range(10):\n",
" print(sp.sample_encode_as_ids('hello world', -1, 0.1))"
],
},
"cell_type": "code",
"source": [
- "import sentencepiece as spm\n",
- "\n",
"# NFKC normalization and lower casing.\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000 --normalization_rule_name=nfkc_cf')\n",
"\n",
},
"cell_type": "code",
"source": [
- "def tocode(s): \n",
- " out = [] \n",
- " for c in s: \n",
- " out.append(str(hex(ord(c))).replace('0x', 'U+')) \n",
- " return ' '.join(out) \n",
+ "def tocode(s):\n",
+ " out = []\n",
+ " for c in s:\n",
+ " out.append(str(hex(ord(c))).replace('0x', 'U+'))\n",
+ " return ' '.join(out)\n",
+ "\n",
"\n",
"# TSV format: source Unicode code points <tab> target code points\n",
"# normalize \"don't => do not, I'm => I am\"\n",
"# m.model embeds the normalization rule compiled into an FST.\n",
"sp.load('m.model')\n",
"print(sp.encode_as_pieces(\"I'm busy\")) # normalzied to `I am busy'\n",
- "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'\n"
+ "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'"
],
"execution_count": 0,
"outputs": [
" for piece in sp.encode_as_pieces(line):\n",
" freq.setdefault(piece, 0)\n",
" freq[piece] += 1\n",
- " \n",
+ "\n",
"# only uses the token appearing more than 1000 times in the training data.\n",
- "vocabs = list(filter(lambda x : x in freq and freq[x] > 1000, vocabs))\n",
+ "vocabs = list(filter(lambda x: x in freq and freq[x] > 1000, vocabs))\n",
"sp.set_vocabulary(vocabs)\n",
"print(sp.encode_as_pieces('this is a test.'))\n",
"\n",
},
"cell_type": "code",
"source": [
- "freq={}\n",
+ "freq = {}\n",
"with open('botchan.txt', 'r') as f:\n",
" for line in f:\n",
" line = line.rstrip()\n",
" for piece in line.split():\n",
" freq.setdefault(piece, 0)\n",
" freq[piece] += 1\n",
- " \n",
+ "\n",
"with open('word_freq_list.tsv', 'w') as f:\n",
" for k, v in freq.items():\n",
" f.write('%s\\t%d\\n' % (k, v))\n",
- " \n",
- "\n",
- "import sentencepiece as spm\n",
"\n",
"spm.SentencePieceTrainer.train('--input=word_freq_list.tsv --input_format=tsv --model_prefix=m --vocab_size=2000')\n",
"sp = spm.SentencePieceProcessor()\n",
"\n",
"Sentencepiece keeps track of byte offset (span) of each token, which is useful for highlighting the token on top of unnormalized text.\n",
"\n",
- "We first need to install protobuf module and sentencepiece_pb2.py as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
+ "We first need to install protobuf module as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
"**encode_as_serialized_proto** method resturns serialized SentencePieceText proto. You can get the deserialized object by calling ParseFromString method.\n",
"\n",
"The definition of SentencePieceText proto is found [here](https://github.com/google/sentencepiece/blob/3be3f2e11e2bb923c579c6be5e7335809341587f/src/sentencepiece.proto#L23).\n"
},
"cell_type": "code",
"source": [
- "!pip install protobuf\n",
- "!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py"
+ "!pip install protobuf"
],
"execution_count": 0,
"outputs": [
},
"cell_type": "code",
"source": [
- "import sentencepiece_pb2\n",
- "import sentencepiece as spm\n",
+ "from sentencepiece import sentencepiece_pb2\n",
"\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000')\n",
"\n",