Added snippets to demonstrate writing and running a Spark app.

Donne Martin 2015-03-12 06:25:40 -04:00
parent 9fd62a73ae
commit 1c4e2157a6


@@ -1,7 +1,7 @@
{
"metadata": {
"name": "",
"signature": "sha256:b82a9fa9d896b3d5d681d6fee5ca75e340c9cd86d6292aa63c6f6bff1fb5ea20"
"signature": "sha256:90f5d3daae548071bd0d43ab169b91ef919dded25d9df7effad349b4ec3641bb"
},
"nbformat": 3,
"nbformat_minor": 0,
@@ -21,7 +21,8 @@
"* Viewing the Spark Application UI\n",
"* Working with Partitions\n",
"* Caching RDDs\n",
"* Checkpointing RDDs"
"* Checkpointing RDDs\n",
"* Writing and Running a Spark Application"
]
},
{
@@ -660,6 +661,63 @@
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Writing and Running a Spark Application"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a Spark application to count the number of text files:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import sys\n",
"from pyspark import SparkContext\n",
"\n",
"if __name__ == \"__main__\":\n",
" if len(sys.argv) < 2:\n",
" print >> sys.stderr, \"Usage: App Name <file>\"\n",
" exit(-1)\n",
" \n",
" count_text_files()\n",
" \n",
"def count_text_files():\n",
" sc = SparkContext()\n",
" logfile = sys.argv[1]\n",
" text_files_count = sc.textFile(logfile)\n",
" .filter(lambda line: '.txt' in line)\n",
" text_files_count.cache()\n",
" print(\"Number of text files: \", text_files_count.count())"
],
"language": "python",
"metadata": {},
"outputs": []
},
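{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before packaging the script, the same filter can be sketched interactively. This assumes the notebook's existing SparkContext `sc` from the earlier sections and a hypothetical sample file `data/log.txt`:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Hypothetical interactive check, assuming an existing SparkContext sc\n",
"# and a sample file data/log.txt\n",
"text_files = (sc.textFile('data/log.txt')\n",
"              .filter(lambda line: '.txt' in line))\n",
"text_files.count()"
],
"language": "python",
"metadata": {},
"outputs": []
},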
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Submit the script to Spark for processing:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"spark-submit --properties-file dir/myspark.conf script.py data/*"
],
"language": "python",
"metadata": {},
"outputs": []
}
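,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `--properties-file` flag tells spark-submit to load Spark configuration from the given file instead of the default `conf/spark-defaults.conf`. A minimal sketch of what `dir/myspark.conf` might contain (the property values below are hypothetical, not from the original notebook):"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Hypothetical contents of dir/myspark.conf:\n",
"# whitespace-separated key-value pairs of standard Spark properties\n",
"spark.master            local[*]\n",
"spark.app.name          Count Text Files\n",
"spark.executor.memory   512m"
],
"language": "python",
"metadata": {},
"outputs": []
}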
],
"metadata": {}