diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 241650e..82caf9a 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -1,7 +1,7 @@
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:cf2651f340403267ecf82871c01a4f9deb80df25b8345be4f56dc02db970a5a7"
+  "signature": "sha256:f13a3255902ddadfe3c0b99e722529cfe44804d81633f80f0f7681afcf3acb8a"
  },
  "nbformat": 3,
  "nbformat_minor": 0,
@@ -26,7 +26,8 @@
       "* Configuring Spark Applications\n",
       "* Streaming\n",
       "* Streaming with States\n",
-      "* Broadcast Variables"
+      "* Broadcast Variables\n",
+      "* Accumulators"
      ]
     },
     {
@@ -1143,6 +1144,84 @@
      "language": "python",
      "metadata": {},
      "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "## Accumulators"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Create an accumulator:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "txt_count = sc.accumulator(0)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Count the number of txt files in the RDD:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "my_data = sc.textFile(filePath)\n",
+      "# A lambda body must be a single expression (no `if` statement), so select the matching lines with filter() and count them in foreach().\n",
+      "my_data.filter(lambda line: '.txt' in line).foreach(lambda line: txt_count.add(1))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Count the number of file types encountered:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "jpg_count = sc.accumulator(0)\n",
+      "html_count = sc.accumulator(0)\n",
+      "css_count = sc.accumulator(0)\n",
+      "\n",
+      "def countFileType(s):\n",
+      "    \"\"\"Add 1 to the accumulator for the first of '.jpg', '.html', '.css' found in s.\"\"\"\n",
+      "    if '.jpg' in s: jpg_count.add(1)\n",
+      "    elif '.html' in s: html_count.add(1)\n",
+      "    elif '.css' in s: css_count.add(1)\n",
+      "\n",
+      "filename=\"hdfs://logs/*\"\n",
+      "\n",
+      "logs = sc.textFile(filename)\n",
+      "logs.foreach(countFileType)\n",
+      "\n",
+      "print 'File Type Totals:'\n",
+      "print '.css files: ', css_count.value\n",
+      "print '.html files: ', html_count.value\n",
+      "print '.jpg files: ', jpg_count.value"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
     }
    ],
    "metadata": {}