Added snippets for basic RDD operations.

2024-03-22 13:30:56 +08:00 · 2015-03-03 10:36:30 -05:00 · 2015-03-03 10:36:30 -05:00 · e8b481f480
commit e8b481f480
parent 6c7e7b5239
1 changed files with 166 additions and 2 deletions
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@ -1,7 +1,7 @@
 {
 "metadata": {
  "name": "",
-  "signature": "sha256:91816797cb89d9542e7443d05f797ce60e70a74be3a870823401ce7b3e7c3313"
+  "signature": "sha256:6de81fb20a6c3dee884019beb8604ace32cb3be3e7d63d3547dd433075d62e69"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
@ -14,7 +14,8 @@
     "source": [
      "# Spark\n",
      "\n",
-      "* Python Shell"
+      "* Python Shell\n",
      "* RDDs"
     ]
    },
    {
@ -57,6 +58,169 @@
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## RDDs"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Create an RDD from the contents of a directory:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "my_data = sc.textFile(\"file:/path/*\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Count the number of lines in the data:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "my_data.count()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Display the data in the data:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "my_data.collect()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Return the first 10 lines in the data:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "my_data.take(10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Create an RDD with lines matching the given filter:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "my_data.filter(lambda line: \".txt\" in line)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Chain a series of commands:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sc.textFile(\"file:/path/file.txt\") \\\n",
      "    .filter(lambda line: \".txt\" in line) \\\n",
      "    .count()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Create a new RDD mapping each line to an array of words, taking only the first word of each array:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "first_words = my_data.map(lambda line: line.split()[0])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Output each word in first_words:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "for word in first_words.take(10):\n",
      "    print word"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Save the first words to a text file:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "first_words.saveAsTextFile(\"file:/path/file\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}