Added snippets for basic RDD operations.

2024-03-22 13:30:56 +08:00 · 2015-03-03 10:36:30 -05:00 · 2015-03-03 10:36:30 -05:00 · e8b481f480
commit e8b481f480
parent 6c7e7b5239
1 changed files with 166 additions and 2 deletions
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -1,7 +1,7 @@
 {
 "metadata": {
  "name": "",
-  "signature": "sha256:91816797cb89d9542e7443d05f797ce60e70a74be3a870823401ce7b3e7c3313"
+  "signature": "sha256:6de81fb20a6c3dee884019beb8604ace32cb3be3e7d63d3547dd433075d62e69"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
@@ -14,7 +14,8 @@
     "source": [
      "# Spark\n",
      "\n",
-      "* Python Shell"
+      "* Python Shell\n",
+      "* RDDs"
     ]
    },
    {
@@ -57,6 +58,169 @@
     "language": "python",
     "metadata": {},
     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "## RDDs"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Create an RDD from the contents of a directory:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "my_data = sc.textFile(\"file:/path/*\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Count the number of lines in the data:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "my_data.count()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Display all of the lines in the data (collect() returns the entire RDD to the driver, so use it only on small data sets):"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "my_data.collect()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Return the first 10 lines in the data:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "my_data.take(10)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Create an RDD with lines matching the given filter:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "my_data.filter(lambda line: \".txt\" in line)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Chain a series of commands:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "sc.textFile(\"file:/path/file.txt\") \\\n",
+      "    .filter(lambda line: \".txt\" in line) \\\n",
+      "    .count()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Create a new RDD by splitting each line into an array of words and keeping only the first word of each line:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "first_words = my_data.map(lambda line: line.split()[0])"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Output the first 10 words in first_words:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "for word in first_words.take(10):\n",
+      "    print word"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Save the first words as text (Spark writes a directory of part files at the given path):"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "first_words.saveAsTextFile(\"file:/path/file\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
    }
   ],
   "metadata": {}