data-science-ipython-notebooks/spark/spark.ipynb

{
 "metadata": {
  "name": "",
  "signature": "sha256:6de81fb20a6c3dee884019beb8604ace32cb3be3e7d63d3547dd433075d62e69"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Spark\n",
      "\n",
      "* Python Shell\n",
      "* RDDs"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Python Shell"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Start the pyspark shell:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pyspark"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "View the spark context:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sc"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## RDDs"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Create an RDD from the contents of a directory:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "my_data = sc.textFile(\"file:/path/*\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Count the number of lines in the data:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "my_data.count()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Display the contents of the data:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "my_data.collect()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Return the first 10 lines in the data:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "my_data.take(10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Create an RDD with lines matching the given filter:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "my_data.filter(lambda line: \".txt\" in line)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Chain a series of commands:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sc.textFile(\"file:/path/file.txt\") \\\n",
      "    .filter(lambda line: \".txt\" in line) \\\n",
      "    .count()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Create a new RDD mapping each line to an array of words, taking only the first word of each array:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "first_words = my_data.map(lambda line: line.split()[0])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Output the first 10 words in first_words:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "for word in first_words.take(10):\n",
      "    print word"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Save the first words to a text file:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "first_words.saveAsTextFile(\"file:/path/file\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}
Added Spark IPython Notebook, currently contains snippets for starting the pyspark shell and viewing the spark context. 2015-03-03 23:32:59 +08:00			`{`
			`"metadata": {`
			`"name": "",`
Added snippets for basic RDD operations. 2015-03-03 23:36:30 +08:00			`"signature": "sha256:6de81fb20a6c3dee884019beb8604ace32cb3be3e7d63d3547dd433075d62e69"`
Added Spark IPython Notebook, currently contains snippets for starting the pyspark shell and viewing the spark context. 2015-03-03 23:32:59 +08:00			`},`
			`"nbformat": 3,`
			`"nbformat_minor": 0,`
			`"worksheets": [`
			`{`
			`"cells": [`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"# Spark\n",`
			`"\n",`
Added snippets for basic RDD operations. 2015-03-03 23:36:30 +08:00			`"* Python Shell\n",`
			`"* RDDs"`
Added Spark IPython Notebook, currently contains snippets for starting the pyspark shell and viewing the spark context. 2015-03-03 23:32:59 +08:00			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"## Python Shell"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Start the pyspark shell:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"pyspark"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"View the spark context:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"sc"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": []`
Added snippets for basic RDD operations. 2015-03-03 23:36:30 +08:00			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"## RDDs"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Create an RDD from the contents of a directory:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"my_data = sc.textFile(\"file:/path/*\")"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Count the number of lines in the data:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"my_data.count()"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Display the contents of the data:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"my_data.collect()"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Return the first 10 lines in the data:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"my_data.take(10)"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Create an RDD with lines matching the given filter:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"my_data.filter(lambda line: \".txt\" in line)"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Chain a series of commands:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"sc.textFile(\"file:/path/file.txt\") \\\n",`
			`" .filter(lambda line: \".txt\" in line) \\\n",`
			`" .count()"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Create a new RDD mapping each line to an array of words, taking only the first word of each array:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"first_words = my_data.map(lambda line: line.split()[0])"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Output the first 10 words in first_words:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"for word in first_words.take(10):\n",`
			`" print word"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Save the first words to a text file:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"collapsed": false,`
			`"input": [`
			`"first_words.saveAsTextFile(\"file:/path/file\")"`
			`],`
			`"language": "python",`
			`"metadata": {},`
			`"outputs": []`
Added Spark IPython Notebook, currently contains snippets for starting the pyspark shell and viewing the spark context. 2015-03-03 23:32:59 +08:00			`}`
			`],`
			`"metadata": {}`
			`}`
			`]`
			`}`