diff --git a/spark/spark.ipynb b/spark/spark.ipynb
index 2abc97c..9387c7f 100644
--- a/spark/spark.ipynb
+++ b/spark/spark.ipynb
@@ -1,7 +1,7 @@
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:b82a9fa9d896b3d5d681d6fee5ca75e340c9cd86d6292aa63c6f6bff1fb5ea20"
+  "signature": "sha256:90f5d3daae548071bd0d43ab169b91ef919dded25d9df7effad349b4ec3641bb"
  },
 "nbformat": 3,
 "nbformat_minor": 0,
@@ -21,7 +21,8 @@
      "* Viewing the Spark Application UI\n",
      "* Working with Partitions\n",
      "* Caching RDDs\n",
-     "* Checkpointing RDDs"
+     "* Checkpointing RDDs\n",
+     "* Writing and Running a Spark Application"
     ]
    },
    {
@@ -660,6 +661,63 @@
     "language": "python",
     "metadata": {},
     "outputs": []
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Writing and Running a Spark Application"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "Create a Spark application that counts the number of text files referenced in a log file:"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "collapsed": false,
+    "input": [
+     "import sys\n",
+     "from pyspark import SparkContext\n",
+     "\n",
+     "def count_text_files():\n",
+     "    sc = SparkContext()\n",
+     "    logfile = sys.argv[1]\n",
+     "    # Keep only the log lines that reference .txt files\n",
+     "    text_files = (sc.textFile(logfile)\n",
+     "                  .filter(lambda line: '.txt' in line))\n",
+     "    text_files.cache()\n",
+     "    print(\"Number of text files: %d\" % text_files.count())\n",
+     "\n",
+     "if __name__ == \"__main__\":\n",
+     "    if len(sys.argv) < 2:\n",
+     "        print >> sys.stderr, \"Usage: App Name <logfile>\"\n",
+     "        sys.exit(-1)\n",
+     "    count_text_files()"
+    ],
+    "language": "python",
+    "metadata": {},
+    "outputs": []
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "Submit the script to Spark for processing:"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "collapsed": false,
+    "input": [
+     "!spark-submit --properties-file dir/myspark.conf script.py data/*"
+    ],
+    "language": "python",
+    "metadata": {},
+    "outputs": []
+   }
    ],
    "metadata": {}