{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "D7tqLMoKF6uq"
   },
   "source": [
    "Deep Learning with TensorFlow\n",
    "=============\n",
    "\n",
    "Credits: Forked from [TensorFlow](https://github.com/tensorflow/tensorflow) by Google\n",
    "\n",
    "Setup\n",
    "------------\n",
    "\n",
    "Refer to the [setup instructions](https://github.com/donnemartin/data-science-ipython-notebooks/tree/feature/deep-learning/deep-learning/tensor-flow-exercises/README.md).\n",
    "\n",
    "Exercise 5\n",
    "------------\n",
    "\n",
    "The goal of this exercise is to train a skip-gram model over [Text8](http://mattmahoney.net/dc/textdata) data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "both",
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "collapsed": true,
    "id": "0K1ZyLn04QZf"
   },
   "outputs": [],
   "source": [
    "# These are all the modules we'll be using later. Make sure you can import them\n",
    "# before proceeding further.\n",
    "import collections\n",
    "import math\n",
    "import numpy as np\n",
    "import os\n",
    "import random\n",
    "import tensorflow as tf\n",
    "import urllib.request\n",
    "import zipfile\n",
    "from matplotlib import pylab\n",
    "from sklearn.manifold import TSNE"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "aCjPJE944bkV"
   },
   "source": [
    "Download the data from the source website if necessary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "both",
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {
       "item_id": 1
      }
     ]
    },
    "colab_type": "code",
    "collapsed": false,
    "executionInfo": {
     "elapsed": 14640,
     "status": "ok",
     "timestamp": 1445964482948,
     "user": {
      "color": "#1FA15D",
      "displayName": "Vincent Vanhoucke",
      "isAnonymous": false,
      "isMe": true,
      "permissionId": "05076109866853157986",
      "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg",
      "sessionId": "2f1ffade4c9f20de",
      "userId": "102167687554210253930"
     },
     "user_tz": 420
    },
    "id": "RJ-o3UBUFtCw",
    "outputId": "c4ec222c-80b5-4298-e635-93ca9f79c3b7"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found and verified text8.zip\n"
     ]
    }
   ],
   "source": [
    "url = 'http://mattmahoney.net/dc/'\n",
    "\n",
    "def maybe_download(filename, expected_bytes):\n",
    "  \"\"\"Download a file if not present, and make sure it's the right size.\"\"\"\n",
    "  if not os.path.exists(filename):\n",
    "    filename, _ = urllib.request.urlretrieve(url + filename, filename)\n",
    "  statinfo = os.stat(filename)\n",
    "  if statinfo.st_size == expected_bytes:\n",
    "    print('Found and verified', filename)\n",
    "  else:\n",
    "    print(statinfo.st_size)\n",
    "    raise Exception(\n",
    "      'Failed to verify ' + filename + '. Can you get to it with a browser?')\n",
    "  return filename\n",
    "\n",
    "filename = maybe_download('text8.zip', 31344016)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "Zqz3XiqI4mZT"
   },
   "source": [
    "Read the data into a list of words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "both",
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {
       "item_id": 1
      }
     ]
    },
    "colab_type": "code",
    "collapsed": false,
    "executionInfo": {
     "elapsed": 28844,
     "status": "ok",
     "timestamp": 1445964497165,
     "user": {
      "color": "#1FA15D",
      "displayName": "Vincent Vanhoucke",
      "isAnonymous": false,
      "isMe": true,
      "permissionId": "05076109866853157986",
      "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg",
      "sessionId": "2f1ffade4c9f20de",
      "userId": "102167687554210253930"
     },
     "user_tz": 420
    },
    "id": "Mvf09fjugFU_",
    "outputId": "e3a928b4-1645-4fe8-be17-fcf47de5716d"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data size 17005207\n"
     ]
    }
   ],
   "source": [
    "def read_data(filename):\n",
    "  \"\"\"Extract the first file enclosed in a zip file as a list of words.\"\"\"\n",
    "  with zipfile.ZipFile(filename) as f:\n",
    "    return f.read(f.namelist()[0]).decode('utf-8').split()\n",
    "\n",
    "words = read_data(filename)\n",
    "print('Data size', len(words))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "Zdw6i4F8glpp"
   },
   "source": [
    "Build the dictionary and replace rare words with UNK token."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "both",
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {
       "item_id": 1
      }
     ]
    },
    "colab_type": "code",
    "collapsed": false,
    "executionInfo": {
     "elapsed": 28849,
     "status": "ok",
     "timestamp": 1445964497178,
     "user": {
      "color": "#1FA15D",
      "displayName": "Vincent Vanhoucke",
      "isAnonymous": false,
      "isMe": true,
      "permissionId": "05076109866853157986",
      "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg",
      "sessionId": "2f1ffade4c9f20de",
      "userId": "102167687554210253930"
     },
     "user_tz": 420
    },
    "id": "gAL1EECXeZsD",
    "outputId": "3fb4ecd1-df67-44b6-a2dc-2291730970b2"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]\n",
      "Sample data [5243, 3083, 12, 6, 195, 2, 3136, 46, 59, 156]\n"
     ]
    }
   ],
   "source": [
    "vocabulary_size = 50000\n",
    "\n",
    "def build_dataset(words):\n",
    "  count = [['UNK', -1]]\n",
    "  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))\n",
    "  dictionary = dict()\n",
    "  for word, _ in count:\n",
    "    dictionary[word] = len(dictionary)\n",
    "  data = list()\n",
    "  unk_count = 0\n",
    "  for word in words:\n",
    "    if word in dictionary:\n",
    "      index = dictionary[word]\n",
    "    else:\n",
    "      index = 0  # dictionary['UNK']\n",
    "      unk_count = unk_count + 1\n",
    "    data.append(index)\n",
    "  count[0][1] = unk_count\n",
    "  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
    "  return data, count, dictionary, reverse_dictionary\n",
    "\n",
    "data, count, dictionary, reverse_dictionary = build_dataset(words)\n",
    "print('Most common words (+UNK)', count[:5])\n",
    "print('Sample data', data[:10])\n",
    "del words  # Hint to reduce memory."
   ]
  },
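  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a sanity check, the indices in `data` can be decoded back to words through `reverse_dictionary`; the first ten entries should reproduce the first ten words of the corpus."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decode the first ten word indices back into words.\n",
    "print('Sample words', [reverse_dictionary[i] for i in data[:10]])"
   ]
  },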
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "lFwoyygOmWsL"
   },
   "source": [
    "Function to generate a training batch for the skip-gram model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "both",
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {
       "item_id": 1
      }
     ]
    },
    "colab_type": "code",
    "collapsed": false,
    "executionInfo": {
     "elapsed": 113,
     "status": "ok",
     "timestamp": 1445964901989,
     "user": {
      "color": "#1FA15D",
      "displayName": "Vincent Vanhoucke",
      "isAnonymous": false,
      "isMe": true,
      "permissionId": "05076109866853157986",
      "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg",
      "sessionId": "2f1ffade4c9f20de",
      "userId": "102167687554210253930"
     },
     "user_tz": 420
    },
    "id": "w9APjA-zmfjV",
    "outputId": "67cccb02-cdaf-4e47-d489-43bcc8d57bb8"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 3083 -> 5243\n",
      "originated -> anarchism\n",
      "3083 -> 12\n",
      "originated -> as\n",
      "12 -> 3083\n",
      "as -> originated\n",
      "12 -> 6\n",
      "as -> a\n",
      "6 -> 12\n",
      "a -> as\n",
      "6 -> 195\n",
      "a -> term\n",
      "195 -> 6\n",
      "term -> a\n",
      "195 -> 2\n",
      "term -> of\n"
     ]
    }
   ],
   "source": [
    "data_index = 0\n",
    "\n",
    "def generate_batch(batch_size, num_skips, skip_window):\n",
    "  global data_index\n",
    "  assert batch_size % num_skips == 0\n",
    "  assert num_skips <= 2 * skip_window\n",
    "  batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n",
    "  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n",
    "  span = 2 * skip_window + 1  # [ skip_window target skip_window ]\n",
    "  buffer = collections.deque(maxlen=span)\n",
    "  for _ in range(span):\n",
    "    buffer.append(data[data_index])\n",
    "    data_index = (data_index + 1) % len(data)\n",
    "  for i in range(batch_size // num_skips):\n",
    "    target = skip_window  # target label at the center of the buffer\n",
    "    targets_to_avoid = [skip_window]\n",
    "    for j in range(num_skips):\n",
    "      while target in targets_to_avoid:\n",
    "        target = random.randint(0, span - 1)\n",
    "      targets_to_avoid.append(target)\n",
    "      batch[i * num_skips + j] = buffer[skip_window]\n",
    "      labels[i * num_skips + j, 0] = buffer[target]\n",
    "    buffer.append(data[data_index])\n",
    "    data_index = (data_index + 1) % len(data)\n",
    "  return batch, labels\n",
    "\n",
    "batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)\n",
    "for i in range(8):\n",
    "  print(batch[i], '->', labels[i, 0])\n",
    "  print(reverse_dictionary[batch[i]], '->', reverse_dictionary[labels[i, 0]])"
   ]
  },
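  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same function handles wider contexts. A sketch with `skip_window=2` and `num_skips=4`: each center word is now paired with four of the words within two positions of it (the asserts above require `num_skips <= 2 * skip_window`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustration: a wider context window. Reset the cursor so this batch\n",
    "# also starts at the beginning of the corpus.\n",
    "data_index = 0\n",
    "batch, labels = generate_batch(batch_size=8, num_skips=4, skip_window=2)\n",
    "for i in range(8):\n",
    "  print(reverse_dictionary[batch[i]], '->', reverse_dictionary[labels[i, 0]])\n",
    "data_index = 0  # leave the cursor reset for the cells below"
   ]
  },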
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "Ofd1MbBuwiva"
   },
   "source": [
    "Train a skip-gram model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "both",
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "collapsed": true,
    "id": "8pQKsV4Vwlzy"
   },
   "outputs": [],
   "source": [
    "batch_size = 128\n",
    "embedding_size = 128  # Dimension of the embedding vector.\n",
    "skip_window = 1  # How many words to consider left and right.\n",
    "num_skips = 2  # How many times to reuse an input to generate a label.\n",
    "# We pick a random validation set to sample nearest neighbors. Here we limit the\n",
    "# validation samples to the words that have a low numeric ID, which by\n",
    "# construction are also the most frequent.\n",
    "valid_size = 16  # Random set of words to evaluate similarity on.\n",
    "valid_window = 100  # Only pick dev samples in the head of the distribution.\n",
    "valid_examples = np.array(random.sample(range(valid_window), valid_size))\n",
    "num_sampled = 64  # Number of negative examples to sample.\n",
    "\n",
    "graph = tf.Graph()\n",
    "\n",
    "with graph.as_default():\n",
    "\n",
    "  # Input data.\n",
    "  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])\n",
    "  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])\n",
    "  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n",
    "\n",
    "  # Variables.\n",
    "  embeddings = tf.Variable(\n",
    "    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n",
    "  softmax_weights = tf.Variable(\n",
    "    tf.truncated_normal([vocabulary_size, embedding_size],\n",
    "                        stddev=1.0 / math.sqrt(embedding_size)))\n",
    "  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))\n",
    "\n",
    "  # Model.\n",
    "  # Look up embeddings for inputs.\n",
    "  embed = tf.nn.embedding_lookup(embeddings, train_dataset)\n",
    "  # Compute the softmax loss, using a sample of the negative labels each time.\n",
    "  loss = tf.reduce_mean(\n",
    "    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,\n",
    "                               train_labels, num_sampled, vocabulary_size))\n",
    "\n",
    "  # Optimizer.\n",
    "  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)\n",
    "\n",
    "  # Compute the similarity between minibatch examples and all embeddings.\n",
    "  # We use the cosine distance:\n",
    "  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n",
    "  normalized_embeddings = embeddings / norm\n",
    "  valid_embeddings = tf.nn.embedding_lookup(\n",
    "    normalized_embeddings, valid_dataset)\n",
    "  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))"
   ]
  },
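  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because every row of `normalized_embeddings` has unit L2 norm, the final `tf.matmul` computes cosine similarities directly:\n",
    "\n",
    "$$\\mathrm{sim}(u, v) = \\frac{u \\cdot v}{\\lVert u \\rVert \\, \\lVert v \\rVert} = u \\cdot v \\quad \\text{when } \\lVert u \\rVert = \\lVert v \\rVert = 1,$$\n",
    "\n",
    "so the nearest neighbors of a validation word are the entries with the largest values in its row of `similarity`."
   ]
  },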
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "both",
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {
       "item_id": 23
      },
      {
       "item_id": 48
      },
      {
       "item_id": 61
      }
     ]
    },
    "colab_type": "code",
    "collapsed": false,
    "executionInfo": {
     "elapsed": 436189,
     "status": "ok",
     "timestamp": 1445965429787,
     "user": {
      "color": "#1FA15D",
      "displayName": "Vincent Vanhoucke",
      "isAnonymous": false,
      "isMe": true,
      "permissionId": "05076109866853157986",
      "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg",
      "sessionId": "2f1ffade4c9f20de",
      "userId": "102167687554210253930"
     },
     "user_tz": 420
    },
    "id": "1bQFGceBxrWW",
    "outputId": "5ebd6d9a-33c6-4bcd-bf6d-252b0b6055e4"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
"Initialized\n",
|
||
|
"Average loss at step 0 : 8.58149623871\n",
|
||
|
"Nearest to been: unfavourably, marmara, ancestral, legal, bogart, glossaries, worst, rooms,\n",
|
||
|
"Nearest to time: conformist, strawberries, sindhi, waterfall, xia, nominates, psp, sensitivity,\n",
|
||
|
"Nearest to over: overlord, panda, golden, semigroup, rawlings, involved, shreveport, handling,\n",
|
||
|
"Nearest to not: hymenoptera, reintroducing, lamiaceae, because, davao, omnipotent, combustion, debilitating,\n",
|
||
|
"Nearest to three: catalog, koza, gn, braque, holstein, postgresql, luddite, justine,\n",
|
||
|
"Nearest to if: chilled, vince, fiddler, represented, sandinistas, happiness, lya, glands,\n",
|
||
|
"Nearest to there: coast, photosynthetic, kimmei, legally, inner, illyricum, formats, fullmetal,\n",
|
||
|
"Nearest to between: chuvash, prinz, suitability, wolfe, guideline, computability, diminutive, paulo,\n",
|
||
|
"Nearest to from: tanganyika, workshop, elphinstone, spearhead, resurrected, kevlar, shangri, loves,\n",
|
||
|
"Nearest to state: sextus, wuppertal, glaring, inches, unrounded, courageous, adler, connie,\n",
|
||
|
"Nearest to on: gino, phocas, rhine, jg, macrocosm, jackass, jays, theorie,\n",
|
||
|
"Nearest to and: standings, towed, reyes, willard, equality, juggling, wladislaus, faked,\n",
|
||
|
"Nearest to eight: gresham, dogg, moko, tennis, superseded, telegraphy, scramble, vinod,\n",
|
||
|
"Nearest to they: prisons, divisor, coder, ribeira, willingness, factional, nne, lotta,\n",
|
||
|
"Nearest to more: blues, fur, sterling, tangier, khwarizmi, discouraged, cal, deicide,\n",
|
||
|
"Nearest to other: enemies, bogged, brassicaceae, lascaux, dispense, alexandrians, crimea, dou,\n",
|
||
|
"Average loss at step 2000 : 4.39983723116\n",
|
||
|
"Average loss at step 4000 : 3.86921076906\n",
|
||
|
"Average loss at step 6000 : 3.72542127335\n",
|
||
|
"Average loss at step 8000 : 3.57835536212\n",
|
||
|
"Average loss at step 10000 : 3.61056993055\n",
|
||
|
"Nearest to been: glossaries, legal, unfavourably, be, hadad, wore, scarcity, were,\n",
|
||
|
"Nearest to time: strawberries, conformist, gleichschaltung, waterfall, molality, nominates, baal, dole,\n",
|
||
|
"Nearest to over: golden, semigroup, catus, motorways, brick, shehri, mussolini, overlord,\n",
|
||
|
"Nearest to not: hinayana, it, often, they, boots, also, noaa, lindsey,\n",
|
||
|
"Nearest to three: four, seven, six, five, nine, eight, two, zero,\n",
|
||
|
"Nearest to if: glands, euros, wallpaper, redefine, toho, confuse, unsound, shepherd,\n",
|
||
|
"Nearest to there: it, they, fullmetal, pace, legally, harpsichord, mma, bug,\n",
|
||
|
"Nearest to between: chuvash, wandering, from, kirsch, pursuant, eurocents, suitability, jackie,\n",
|
||
|
"Nearest to from: into, in, workshop, to, at, misogynist, elphinstone, spearhead,\n",
|
||
|
"Nearest to state: sextus, glaring, connie, adler, esoteric, didactic, handedness, presidents,\n",
|
||
|
"Nearest to on: in, at, for, ruminants, wakefulness, torrey, foley, gino,\n",
|
||
|
"Nearest to and: or, who, but, zelda, of, for, thirst, chisel,\n",
|
||
|
"Nearest to eight: nine, six, seven, five, four, three, zero, two,\n",
|
||
|
"Nearest to they: he, prisons, there, we, hydrate, it, not, cumbersome,\n",
|
||
|
"Nearest to more: skye, blues, trypomastigotes, deicide, most, readable, used, sterling,\n",
|
||
|
"Nearest to other: trochaic, hush, surveyors, joachim, differentiation, attackers, reverence, attestation,\n",
|
||
|
"Average loss at step 12000 : 3.66169466591\n",
|
||
|
"Average loss at step 14000 : 3.60342905837\n",
|
||
|
"Average loss at step 16000 : 3.57761328053\n",
|
||
|
"Average loss at step 18000 : 3.57667332476\n",
|
||
|
"Average loss at step 20000 : 3.53310145146\n",
|
||
|
"Nearest to been: be, become, was, hadad, unfavourably, were, wore, partido,\n",
|
||
|
"Nearest to time: gleichschaltung, strawberries, year, nominates, conformist, etch, admittedly, treasuries,\n",
|
||
|
"Nearest to over: golden, semigroup, motorways, rawlings, triangle, trey, ustawa, mattingly,\n",
|
||
|
"Nearest to not: they, boots, often, dieppe, still, hinayana, nearly, be,\n",
|
||
|
"Nearest to three: two, four, five, seven, eight, six, nine, one,\n",
|
||
|
"Nearest to if: wallpaper, euros, before, toho, unsound, so, bg, pfc,\n",
|
||
|
"Nearest to there: they, it, he, usually, which, we, not, transactions,\n",
|
||
|
"Nearest to between: from, with, about, near, reactance, eurocents, wandering, voltaire,\n",
|
||
|
"Nearest to from: into, workshop, by, between, in, on, elphinstone, under,\n",
|
||
|
"Nearest to state: glaring, esoteric, succeeding, sextus, vorarlberg, presidents, depends, connie,\n",
|
||
|
"Nearest to on: in, at, upon, during, from, janis, foley, nubian,\n",
|
||
|
"Nearest to and: or, thirst, but, where, s, who, pfaff, including,\n",
|
||
|
"Nearest to eight: nine, seven, six, five, four, three, zero, one,\n",
|
||
|
"Nearest to they: there, he, we, not, it, you, prisons, who,\n",
|
||
|
"Nearest to more: less, most, deicide, skye, trypomastigotes, interventionism, toed, drummond,\n",
|
||
|
"Nearest to other: such, joachim, hush, attackers, surveyors, trochaic, differentiation, reverence,\n",
|
||
|
"Average loss at step 22000 : 3.59519316927\n",
|
||
|
"Average loss at step 24000 : 3.55378576797\n",
|
||
|
"Average loss at step 26000 : 3.56455037558\n",
|
||
|
"Average loss at step 28000 : 3.5040882225\n",
|
||
|
"Average loss at step 30000 : 3.39208897972\n",
|
||
|
"Nearest to been: become, be, were, was, spotless, hadad, by, hausdorff,\n",
|
||
|
"Nearest to time: gleichschaltung, year, day, nominates, jesus, strawberries, way, admittedly,\n",
|
||
|
"Nearest to over: golden, semigroup, motorways, rawlings, interventionism, counternarcotics, adaption, brick,\n",
|
||
|
"Nearest to not: often, they, it, never, still, nor, boots, pki,\n",
|
||
|
"Nearest to three: four, six, two, eight, five, seven, nine, zero,\n",
|
||
|
"Nearest to if: when, before, so, should, toho, where, bg, wallpaper,\n",
|
||
|
"Nearest to there: they, it, which, usually, he, that, also, now,\n",
|
||
|
"Nearest to between: with, from, in, panasonic, presupposes, churchmen, hijacking, where,\n",
|
||
|
"Nearest to from: into, elphinstone, workshop, between, through, speculates, sosa, in,\n",
|
||
|
"Nearest to state: esoteric, glaring, presidents, vorarlberg, atmosphere, succeeding, lute, connie,\n",
|
||
|
"Nearest to on: upon, in, janis, during, torrey, against, infield, catalans,\n",
|
||
|
"Nearest to and: or, thirst, in, but, of, sobib, cleaves, including,\n",
|
||
|
"Nearest to eight: nine, six, four, seven, three, zero, five, one,\n",
|
||
|
"Nearest to they: we, there, he, you, it, these, who, i,\n",
|
||
|
"Nearest to more: less, most, deicide, faster, toed, very, skye, tonic,\n",
|
||
|
"Nearest to other: different, attackers, joachim, various, such, many, differentiation, these,\n",
|
||
|
"Average loss at step 32000 : 3.49501452419\n",
|
||
|
"Average loss at step 34000 : 3.48593705952\n",
|
||
|
"Average loss at step 36000 : 3.50112806576\n",
|
||
|
"Average loss at step 38000 : 3.49244426501\n",
|
||
|
"Average loss at step 40000 : 3.3890105716\n",
|
||
|
"Nearest to been: become, be, were, was, jolie, hausdorff, spotless, had,\n",
|
||
|
"Nearest to time: year, way, gleichschaltung, period, day, stanislav, stage, outcome,\n",
|
||
|
"Nearest to over: through, semigroup, rawlings, golden, about, brick, on, motorways,\n",
|
||
|
"Nearest to not: they, radiated, never, pki, still, omnipotent, hinayana, really,\n",
|
||
|
"Nearest to three: four, six, five, two, seven, eight, one, nine,\n",
|
||
|
"Nearest to if: when, before, where, then, bg, because, can, should,\n",
|
||
|
"Nearest to there: they, it, he, usually, this, typically, still, often,\n",
|
||
|
"Nearest to between: with, in, from, about, against, churchmen, johansen, presupposes,\n",
|
||
|
"Nearest to from: into, through, elphinstone, in, workshop, between, suing, under,\n",
|
||
|
"Nearest to state: esoteric, presidents, atmosphere, vorarlberg, lute, succeeding, glaring, didactic,\n",
|
||
|
"Nearest to on: upon, at, in, during, unitarians, under, catalans, batavians,\n",
|
||
|
"Nearest to and: or, but, s, incapacitation, including, while, of, which,\n",
|
||
|
"Nearest to eight: nine, six, seven, four, five, three, one, two,\n",
|
||
|
"Nearest to they: we, he, there, you, she, i, not, it,\n",
|
||
|
"Nearest to more: less, most, deicide, toed, greater, faster, quite, longer,\n",
|
||
|
"Nearest to other: various, different, attackers, joachim, clutter, nz, trochaic, apulia,\n",
|
||
|
"Average loss at step 42000 : 3.45294014364\n",
|
||
|
"Average loss at step 44000 : 3.47660055941\n",
|
||
|
"Average loss at step 46000 : 3.47458503014\n",
|
||
|
"Average loss at step 48000 : 3.47261548793\n",
|
||
|
"Average loss at step 50000 : 3.45390708435\n",
|
||
|
"Nearest to been: become, be, had, was, were, hausdorff, prem, remained,\n",
|
||
|
"Nearest to time: way, year, period, stv, day, gleichschaltung, stage, outcome,\n",
|
||
|
"Nearest to over: through, golden, semigroup, about, brick, counternarcotics, theremin, mattingly,\n",
|
||
|
"Nearest to not: they, still, never, really, sometimes, it, kiwifruit, nearly,\n",
|
||
|
"Nearest to three: five, four, six, seven, two, eight, one, nine,\n",
|
||
|
"Nearest to if: when, before, where, because, connexion, though, so, whether,\n",
|
||
|
"Nearest to there: they, it, he, this, now, often, usually, still,\n",
|
||
|
"Nearest to between: with, from, fashioned, churchmen, panasonic, explores, within, racial,\n",
|
||
|
"Nearest to from: into, through, under, elphinstone, between, workshop, circumpolar, idiom,\n",
|
||
|
"Nearest to state: atmosphere, vorarlberg, esoteric, presidents, madhya, majority, moulin, bowmen,\n",
|
||
|
"Nearest to on: upon, in, catalans, tezuka, minotaurs, wakefulness, batavians, guglielmo,\n",
|
||
|
"Nearest to and: or, but, thirst, signifier, which, however, including, unattractive,\n",
|
||
|
"Nearest to eight: six, nine, seven, five, four, three, zero, two,\n",
|
||
|
"Nearest to they: we, there, he, you, it, she, these, not,\n",
|
||
|
"Nearest to more: less, most, quite, very, further, faster, toed, deicide,\n",
|
||
|
"Nearest to other: various, different, many, attackers, are, joachim, nihilo, reject,\n",
|
||
|
"Average loss at step 52000 : 3.43597227755\n",
|
||
|
"Average loss at step 54000 : 3.25126817495\n",
|
||
|
"Average loss at step 56000 : 3.35102432287\n",
|
||
|
"Average loss at step 58000 : 3.44654818082\n",
|
||
|
"Average loss at step 60000 : 3.4287913968\n",
|
||
|
"Nearest to been: become, be, was, prem, had, remained, hadad, stanislavsky,\n",
|
||
|
"Nearest to time: year, way, period, stv, barely, name, stage, restoring,\n",
|
||
|
"Nearest to over: about, through, golden, adaption, counternarcotics, up, mattingly, brick,\n",
|
||
|
"Nearest to not: still, never, nor, kiwifruit, they, nearly, therefore, rarely,\n",
|
||
|
"Nearest to three: two, five, four, six, seven, eight, one, nine,\n",
|
||
|
"Nearest to if: when, though, before, where, although, because, can, could,\n",
|
||
|
"Nearest to there: they, it, he, still, she, we, this, often,\n",
|
||
|
"Nearest to between: with, from, churchmen, among, ethical, within, vma, panasonic,\n",
|
||
|
"Nearest to from: through, into, under, during, between, in, suing, across,\n",
|
||
|
"Nearest to state: atmosphere, infringe, madhya, vorarlberg, government, bowmen, vargas, republic,\n",
|
||
|
"Nearest to on: upon, through, within, ridiculous, janis, in, under, over,\n",
|
||
|
"Nearest to and: or, while, including, but, of, like, whose, bannister,\n",
|
||
|
"Nearest to eight: nine, six, five, four, seven, zero, three, two,\n",
|
||
|
"Nearest to they: we, there, you, he, it, these, she, prisons,\n",
|
||
|
"Nearest to more: less, most, quite, further, toed, very, faster, rather,\n",
|
||
|
"Nearest to other: different, various, many, nihilo, these, amour, including, screenplays,\n",
|
||
|
"Average loss at step 62000 : 3.38358767056\n",
|
||
|
"Average loss at step 64000 : 3.41693099326\n",
|
||
|
"Average loss at step 66000 : 3.39588000977\n",
|
||
|
"Average loss at step 68000 : 3.35567189544\n",
|
||
|
"Average loss at step 70000 : 3.38878934443\n",
|
||
|
"Nearest to been: become, be, was, prem, remained, were, being, discounts,\n",
|
||
|
"Nearest to time: year, way, day, period, barely, ethos, stage, reason,\n",
|
||
|
"Nearest to over: about, through, fortunately, semigroup, theremin, off, loudest, up,\n",
|
||
|
"Nearest to not: still, nor, never, they, actually, nearly, unelected, therefore,\n",
|
||
|
"Nearest to three: five, two, four, six, seven, eight, nine, zero,\n",
|
||
|
"Nearest to if: when, though, before, where, because, then, after, since,\n",
|
||
|
"Nearest to there: they, it, he, often, she, we, usually, still,\n",
|
||
|
"Nearest to between: among, with, within, from, ethical, churchmen, racial, prentice,\n",
|
||
|
"Nearest to from: through, into, within, during, under, until, between, across,\n",
|
||
|
"Nearest to state: city, atmosphere, desks, surrounding, preservation, bohr, principal, republic,\n",
|
||
|
"Nearest to on: upon, tezuka, through, within, wakefulness, catalans, at, ingeborg,\n",
|
||
|
"Nearest to and: or, but, while, including, thirst, jerzy, massing, abadan,\n",
|
||
|
"Nearest to eight: seven, six, nine, five, four, three, two, zero,\n",
|
||
|
"Nearest to they: we, you, he, there, she, it, prisons, who,\n",
|
||
|
"Nearest to more: less, most, quite, very, faster, smaller, further, larger,\n",
|
||
|
"Nearest to other: various, different, some, screenplays, lab, many, including, debugging,\n",
|
||
|
"Average loss at step 72000 : 3.41103189731\n",
|
||
|
"Average loss at step 74000 : 3.44926435578\n",
|
||
|
"Average loss at step 76000 : 3.4423020488\n",
|
||
|
"Average loss at step 78000 : 3.41976813722\n",
|
||
|
"Average loss at step 80000 : 3.39511853886\n",
|
||
|
"Nearest to been: become, be, remained, was, grown, were, prem, already,\n",
|
||
|
"Nearest to time: year, way, period, reason, barely, distance, stage, day,\n",
|
||
|
"Nearest to over: about, fortunately, through, semigroup, further, mattingly, rawlings, golden,\n",
|
||
|
"Nearest to not: still, they, nor, never, we, kiwifruit, noaa, really,\n",
|
||
|
"Nearest to three: five, two, seven, four, eight, six, nine, zero,\n",
|
||
|
"Nearest to if: when, where, though, before, since, because, although, follows,\n",
|
||
|
"Nearest to there: they, it, he, we, she, still, typically, actually,\n",
|
||
|
"Nearest to between: with, among, within, in, racial, around, from, serapeum,\n",
|
||
|
"Nearest to from: into, through, in, within, under, using, during, towards,\n",
|
||
|
"Nearest to state: city, atmosphere, ferro, vorarlberg, surrounding, republic, madhya, national,\n",
|
||
|
"Nearest to on: upon, poll, in, from, tezuka, janis, through, within,\n",
|
||
|
"Nearest to and: or, but, including, while, s, which, thirst, although,\n",
|
||
|
"Nearest to eight: nine, seven, six, five, four, three, zero, two,\n",
|
||
|
"Nearest to they: we, you, there, he, she, it, these, not,\n",
|
||
|
"Nearest to more: less, most, smaller, very, faster, quite, rather, larger,\n",
|
||
|
"Nearest to other: various, different, joachim, including, theos, smaller, individual, screenplays,\n",
|
||
|
"Average loss at step 82000 : 3.40933967865\n",
|
||
|
"Average loss at step 84000 : 3.41618054378\n",
|
||
|
"Average loss at step 86000 : 3.31485116804\n",
|
||
|
"Average loss at step 88000 : 3.37068593091\n",
|
||
|
"Average loss at step 90000 : 3.2785516749\n",
|
||
|
"Nearest to been: become, be, was, prem, remained, grown, recently, already,\n",
|
||
|
"Nearest to time: year, way, period, day, barely, battle, buds, name,\n",
|
||
|
"Nearest to over: through, about, fortunately, off, theremin, semigroup, extraterrestrial, mattingly,\n",
|
||
|
"Nearest to not: nor, still, never, otherwise, generally, separately, gown, hydrate,\n",
|
||
|
"Nearest to three: four, five, six, two, eight, seven, nine, zero,\n",
|
||
|
"Nearest to if: when, where, before, though, because, since, then, while,\n",
|
||
|
"Nearest to there: they, it, he, we, she, still, typically, fiorello,\n",
|
||
|
"Nearest to between: with, among, within, from, churchmen, prentice, racial, panasonic,\n",
|
||
|
"Nearest to from: through, into, across, during, towards, until, at, within,\n",
|
||
|
"Nearest to state: bohr, city, atmosphere, ferro, bowmen, republic, retaliation, vorarlberg,\n",
|
||
|
"Nearest to on: upon, in, tezuka, at, during, within, via, catalans,\n",
|
||
|
"Nearest to and: or, including, but, while, like, thirst, with, schuman,\n",
|
||
|
"Nearest to eight: seven, nine, six, five, four, three, zero, two,\n",
|
||
|
"Nearest to they: we, there, he, you, she, it, prisons, these,\n",
|
||
|
"Nearest to more: less, most, very, faster, larger, quite, smaller, better,\n",
|
||
|
"Nearest to other: different, various, tamara, prosthetic, including, individual, failing, restaurants,\n",
|
||
|
"Average loss at step 92000 : 3.40355363208\n",
|
||
|
"Average loss at step 94000 : 3.35647508007\n",
|
||
|
"Average loss at step 96000 : 3.34374570692\n",
|
||
|
"Average loss at step 98000 : 3.4230104093\n",
|
||
|
"Average loss at step 100000 : 3.36909827\n",
|
||
|
"Nearest to been: become, be, grown, was, being, already, remained, prem,\n",
|
||
|
"Nearest to time: way, year, day, period, years, days, mothersbaugh, separators,\n",
|
||
|
"Nearest to over: through, about, semigroup, further, fortunately, off, into, theremin,\n",
|
||
|
"Nearest to not: never, nor, still, dieppe, really, unelected, actually, now,\n",
|
||
|
"Nearest to three: four, two, five, seven, six, eight, nine, zero,\n",
|
||
|
"Nearest to if: when, though, where, before, is, abe, then, follows,\n",
|
||
|
"Nearest to there: they, it, he, we, still, she, typically, often,\n",
|
||
|
"Nearest to between: within, with, among, churchmen, around, explores, from, reactance,\n",
|
||
|
"Nearest to from: into, through, within, across, in, between, using, workshop,\n",
|
||
|
"Nearest to state: atmosphere, bohr, national, ferro, germ, desks, city, unpaid,\n",
|
||
|
"Nearest to on: upon, in, within, tezuka, janis, batavians, about, macrocosm,\n",
|
||
|
"Nearest to and: or, but, purview, thirst, sukkot, epr, including, honesty,\n",
|
||
|
"Nearest to eight: seven, nine, six, four, five, three, zero, one,\n",
|
||
|
"Nearest to they: we, there, you, he, she, prisons, it, these,\n",
|
||
|
"Nearest to more: less, most, very, quite, faster, larger, rather, smaller,\n",
|
||
|
"Nearest to other: various, different, tamara, theos, some, cope, many, others,\n"
|
||
|
     ]
    }
   ],
   "source": [
"num_steps = 100001\n",
|
||
|
"\n",
|
||
|
"with tf.Session(graph=graph) as session:\n",
|
||
|
" tf.initialize_all_variables().run()\n",
|
||
|
" print \"Initialized\"\n",
|
||
|
" average_loss = 0\n",
|
||
|
" for step in xrange(num_steps):\n",
|
||
|
" batch_data, batch_labels = generate_batch(\n",
|
||
|
" batch_size, num_skips, skip_window)\n",
|
||
|
" feed_dict = {train_dataset : batch_data, train_labels : batch_labels}\n",
|
||
|
" _, l = session.run([optimizer, loss], feed_dict=feed_dict)\n",
|
||
|
" average_loss += l\n",
|
||
|
" if step % 2000 == 0:\n",
|
||
|
" if step > 0:\n",
|
||
|
" average_loss = average_loss / 2000\n",
|
||
|
" # The average loss is an estimate of the loss over the last 2000 batches.\n",
|
||
|
" print \"Average loss at step\", step, \":\", average_loss\n",
|
||
|
" average_loss = 0\n",
|
||
|
" # note that this is expensive (~20% slowdown if computed every 500 steps)\n",
|
||
|
" if step % 10000 == 0:\n",
|
||
|
" sim = similarity.eval()\n",
|
||
|
" for i in xrange(valid_size):\n",
|
||
|
" valid_word = reverse_dictionary[valid_examples[i]]\n",
|
||
|
" top_k = 8 # number of nearest neighbors\n",
|
||
|
" nearest = (-sim[i, :]).argsort()[1:top_k+1]\n",
|
||
|
" log = \"Nearest to %s:\" % valid_word\n",
|
||
|
" for k in xrange(top_k):\n",
|
||
|
" close_word = reverse_dictionary[nearest[k]]\n",
|
||
|
" log = \"%s %s,\" % (log, close_word)\n",
|
||
|
" print log\n",
|
||
|
" final_embeddings = normalized_embeddings.eval()"
|
||
|
]
|
||
|
},
|
||
|
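  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick look at what training left behind: `final_embeddings` should be a `vocabulary_size` x `embedding_size` matrix (50000 x 128 here) whose rows have unit norm, which is what the t-SNE projection below consumes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each row is one word vector; the rows were L2-normalized in the graph,\n",
    "# so their norms should all be close to 1.\n",
    "print(final_embeddings.shape)\n",
    "print(np.linalg.norm(final_embeddings[0]))"
   ]
  },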
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "both",
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "collapsed": true,
    "id": "jjJXYA_XzV79"
   },
   "outputs": [],
   "source": [
    "num_points = 400\n",
    "\n",
    "tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)\n",
    "two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "both",
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {
       "item_id": 1
      }
     ]
    },
    "colab_type": "code",
    "collapsed": false,
    "executionInfo": {
     "elapsed": 4763,
     "status": "ok",
     "timestamp": 1445965465525,
     "user": {
      "color": "#1FA15D",
      "displayName": "Vincent Vanhoucke",
      "isAnonymous": false,
      "isMe": true,
      "permissionId": "05076109866853157986",
      "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg",
      "sessionId": "2f1ffade4c9f20de",
      "userId": "102167687554210253930"
     },
     "user_tz": 420
    },
    "id": "o_e0D_UezcDe",
    "outputId": "df22e4a5-e8ec-4e5e-d384-c6cf37c68c34"
   },
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA3MAAANpCAYAAAChBGCHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XdAldUfx/H3BdlbQEVzoyDukZaae5aZ5tbcIzUz9x5Z\njhwNNXMVztTExFHqT9Ny50hFc+ZKEVBwAbLh/v4gSXILChc/r3+69/Lc53yfewL8cM5zjsFoNBoR\nERERERERk2KW0QWIiIiIiIjI01OYExERERERMUEKcyIiIiIiIiZIYU5ERERERMQEKcyJiIiIiIiY\nIIU5ERERERERE5TmMBceHk7fvn1p2LAhb775JgEBAdy6dYvOnTtTv359unTpQnh4eHrUKiIiIiIi\nIv8wpHWfuaFDh/Lqq6/SvHlzEhISiI6OZvbs2bi4uNC9e3fmzZtHeHg4gwYNSq+aRUREREREXnpp\nGpmLiIjg4MGDNG/eHIBs2bLh4ODAtm3baNq0KQBNmzbll19+SXulIiIiIiIikiJbWt4cGBhI9uzZ\nGT58OKdOnaJ48eKMGDGC69ev4+bmBoCbmxvXr19Pl2JFREREREQkWZpG5hISEjhx4gRt2rTB398f\nGxsb5s2bl+oYg8GAwWBIU5EiIiIiIiKSWprCXK5cuciZMyelSpUCoH79+pw4cQI3NzdCQ0MBuHbt\nGtmzZ3/kedJ4256IiIiIiMhLJ03TLN3d3fHw8ODChQsULFiQvXv34unpiaenJ/7+/vTo0YM1a9ZQ\np06dR57HYDAQGhqRllIkE3N3d1D/ZmHq36xLfZu1qX+zLvVt1qb+zbrc3R2e+j1pCnMAo0ePZtCg\nQcTHx5MvXz4mTZpEYmIi/fr148cffyRPnjx89dVXaW1GRERERERE7pHmMOft7c2PP/543+sLFy5M\n66lFRERERETkIdK8abiIiIiIiIi8eApzIiIiIiIiJkhhTkRERERExAQpzImIiIiIiJgghTkRERER\nERETpDAnIiIiIiJighTmRERERERETJDCnIiIiIiIiAlSmBMRERERETFBCnMiIiIiIiImSGFORERE\nRETEBCnMiYiIiIiImCCFOREREREREROkMCciIiIiImKCFOZERERERERMkMKciIiIiIiICVKYExER\nERERMUEKcyIiIiIiIiZIYU5ERERERMQEKcyJiIiIiIiYIIU5ERERERERE6QwJyIiIiIiYoIU5kRE\nREREREyQwpyIiIiIiIgJUpgTERERERExQQpzIiIiIiIiJkhhTkRERERExAQpzImIiIiIiJgghTkR\nERERERETpDAnIiIiIiJighTmRERERERETJDCnIiIiIiIiAlSmBMRERERETFBCnMiIiIiIiImSGFO\nRERERETEBCnMiYiIiIiImCCFOREREREREROkMCciIiIiImKCFOZERERERERMkMKciIiIiIiICVKY\nExERERERMUEKcyIiIiIiIiZIYU5ERERERMQEKcyJiIiIiIiYIIU5ERERERERE6QwJyIiIiIiYoIU\n5kREREREREyQwpyIiIiIiIgJUpgTERERERExQQpzIiIiIiIiJkhhTkRERERExAQpzImIiIiIiJgg\nhTkRERERERETpDAnIiIiIiJighTmRERERERETJDCnIiIiIiIiAlSmBMRERERETFBCnMiIiIiIiIm\nSGFORERERETEBCnMiYiIiIiImCCFOREREREREROkMCciIiIiImKCFOZERERERERMkMKciIiIiIiI\nCVKYExERERERMUEKcyIiIiIiIiZIYU5ERERERMQEKcyJiIiIiIiYIIU5ERERERERE6QwJyIiIiIi\nYoIU5kREREREREyQwpyIiIiIiIgJUpgTERERERExQQpzIiIiIiIiJkhhTkRERERExAQpzImIiIiI\niJgghTkRERERERETpDAnIiIiIiJighTmRERERERETJDCnIiIiIiIiAlSmBMRERERETFBCnMiIiIi\nIiImSGFORERERETEBCnMiYiIiIiImCCFOREREREREROULaMLEBERkcxnxYqlbNiwHoBGjZpQrVoN\nBg78kFKlyvLnnwG4u+dg0qTPsbKy4sqVQL74Ygq3bt3E2tqaoUNHki9fgYy9ABGRl4BG5kRERCSV\nU6dOsnHjT8yfv4i5cxeyfr0/ERHhBAZeplmzlixZshJ7ewe2b98GwJQpE+jffzDffbeE3r0/4vPP\nJ2fwFYiIvBw0MiciIiKpHD16hGrVamJlZQ1A9eq1CAg4jIdHHjw9iwDg5eVNcHAQ0dHRHDt2lNGj\nh6a8Pz4+IUPqFhF52SjMiYiISCoGg+GBr1taWqQ8NjMzJykpDqMxCQcHBxYsWPaiyhMRkX9omqWI\niIikUrp0GXbs+I3Y2Biio6PZseNXSpcue99xRqMRW1s7cufOza+//pLy2tmzf73okkVEXkoamRMR\nEZFUihb15s03G9G9e0cA3n67KQ4OjveN2N19PmbMeKZN+4xFi3xJSEigTp16KdMxRUTk+TEYjUZj\nRhcBEBoakdElyHPi7u6g/s3C1L9Zl/o2a1P/Zl3q26xN/Zt1ubs7PPV7NM1SREREntnx4+eYP38j\nf/xxMqNLERF56SjMiYiIyDNZt24/LVpEM3JkC1q1smLhwu0ZXZKIyEtFYU5ERESeyeLFNwgLex0w\nEB5emqVLYzK6JBGRl4rCnIiIiDwTo9HwyOciIvJ8KcyJiIjIM2nd2gEXl0MA2NmdpkULLZItIvIi\n6aeuiIiIPJMWLSpToMAJ9u3zo2TJXFSvXiujSxIReakozImIiMgze/VVH1591SejyxAReSlpmqWI\niIiIiIgJUpgTERERERExQQpzIiIiIiIiJkhhTkRERERExAQpzImIiGSAyMhI/P1XZXQZIiJiwhTm\nREREMkBERDj+/n4ZXYaIiJgwbU0gIiKSAebMmcmVK4F07tyWIkW8qFatJlWrVmP48EE4OjoyfPgY\nfvppLUFBV+jRozcrVixlw4b1ADRq1ISWLdtk8BWIiEhG08iciIhIBujVqy958rzCggXLqFTpdY4e\nPQxAWNg1/v77IgBHjx6hbNlynDp1ko0bf2L+/EXMnbuQ9ev9+euv0xlYvYiIZAYKcyIiIhnAaDSm\nPC5VqgwBAUe4ePECBQsWxsUlO9evh3H8+DFKlCjN0aNHqFatJlZW1tjY2FC9ei0CAg5nYPUiIpIZ\naJqliIhIBnN3z0FkZAT79u2hdOmyhIeHs3XrFmxtbbGxscFgMKQ63mg03veaiIi8fDQyJyIikgFs\nbW2JiopKeV68eElWrlxOmTLlKF26DCtWLKVUqbIAlC5dhh07fiM2Nobo6Gh27vwt5WsiIvLy0sic\niIhIBnBycqZkydJ06NCK116rTKlSZThwYB958rxCzpy5iIgIp3Tp5MBWtKg3b77ZiO7dOwLw9ttN\nKVKkaEaWLyIimYDBeO+k/QwUGhqR0SX
Ic+Lu7qD+zcLUv1mX+jbzuHr1KsHBoXh7e2JtbZ0u51T/\nZl3q26xN/Zt1ubs7PPV7NM1SREQkE/P13U61apepV8+DRo02cflySEaXJCIimYTCnIiISCYVHx/P\nrFnx3LxZEyjA0aPt+eKLgxldloiIZBIKcyIiIplUXFwcUVH2qV6LibHMoGpERCSzUZgTERHJpOzs\n7Kha9W8gGgBHx8M0bOiUsUWJiEimodUsRUREMrHZs5tRvPhPhIVBzZq5qF27UkaXJCIimYTCnIiI\nSCaWLVs2+vVrkNFliIhIJqRpliIiIiIiIiZIYU5ERERERMQEKcyJiIjIA02ePJ6LFy9kdBkiIvIQ\numdOREREHmjo0FEZXYKIiDyCwpyIiMhLIDo6mjFjhhEaGkpSUiIfftgHBwc3vv76S6Kjo3FwcMBo\nNBIaGsrVq8GMGPEx/v5+tG79Ht98M4OkpCQGDRrGwoXfcvr0SfLnL8DkyV/i6upGnz49KF68JIcO\nHSQyMoJhw8ZQunSZjL5
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x2ee65e10>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"def plot(embeddings, labels):\n",
|
||
|
" assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'\n",
|
||
|
" pylab.figure(figsize=(15,15)) # in inches\n",
|
||
|
" for i, label in enumerate(labels):\n",
|
||
|
" x, y = embeddings[i,:]\n",
|
||
|
" pylab.scatter(x, y)\n",
|
||
|
" pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',\n",
|
||
|
" ha='right', va='bottom')\n",
|
||
|
" pylab.show()\n",
|
||
|
"\n",
|
||
|
"words = [reverse_dictionary[i] for i in xrange(1, num_points+1)]\n",
|
||
|
"plot(two_d_embeddings, words)"
|
||
|
]
|
||
|
}
|
||
|
 ],
 "metadata": {
  "colabVersion": "0.3.2",
  "colab_default_view": {},
  "colab_views": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}