diff --git a/spark/spark.ipynb b/spark/spark.ipynb index 7ef6543..d9f6c5f 100644 --- a/spark/spark.ipynb +++ b/spark/spark.ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:91816797cb89d9542e7443d05f797ce60e70a74be3a870823401ce7b3e7c3313" + "signature": "sha256:6de81fb20a6c3dee884019beb8604ace32cb3be3e7d63d3547dd433075d62e69" }, "nbformat": 3, "nbformat_minor": 0, @@ -14,7 +14,8 @@ "source": [ "# Spark\n", "\n", - "* Python Shell" + "* Python Shell\n", + "* RDDs" ] }, { @@ -57,6 +58,169 @@ "language": "python", "metadata": {}, "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RDDs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create an RDD from the contents of a directory:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_data = sc.textFile(\"file:/path/*\")" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Count the number of lines in the data:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_data.count()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Display all of the lines in the data:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_data.collect()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Return the first 10 lines in the data:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_data.take(10)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create an RDD with lines matching the given filter:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_data.filter(lambda line: \".txt\" in line)" + ], + 
"language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Chain a series of commands:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "sc.textFile(\"file:/path/file.txt\") \\\n", + " .filter(lambda line: \".txt\" in line) \\\n", + " .count()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a new RDD mapping each line to an array of words, taking only the first word of each array:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "first_words = my_data.map(lambda line: line.split()[0])" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Output each word in first_words:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for word in first_words.take(10):\n", + " print word" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save the first words to a text file:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "first_words.saveAsTextFile(\"file:/path/file\")" + ], + "language": "python", + "metadata": {}, + "outputs": [] } ], "metadata": {}