Added snippets to demonstrate writing and running a Spark app.

Donne Martin 2015-03-12 06:25:40 -04:00
parent 9fd62a73ae
commit 1c4e2157a6


@@ -1,7 +1,7 @@
{
"metadata": {
"name": "",
"signature": "sha256:b82a9fa9d896b3d5d681d6fee5ca75e340c9cd86d6292aa63c6f6bff1fb5ea20"
"signature": "sha256:90f5d3daae548071bd0d43ab169b91ef919dded25d9df7effad349b4ec3641bb"
},
"nbformat": 3,
"nbformat_minor": 0,
@@ -21,7 +21,8 @@
"* Viewing the Spark Application UI\n",
"* Working with Partitions\n",
"* Caching RDDs\n",
"* Checkpointing RDDs"
"* Checkpointing RDDs\n",
"* Writing and Running a Spark Application"
]
},
{
@@ -660,6 +661,63 @@
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Writing and Running a Spark Application"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a Spark application to count the number of text files:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import sys\n",
"from pyspark import SparkContext\n",
"\n",
"if __name__ == \"__main__\":\n",
" if len(sys.argv) < 2:\n",
" print >> sys.stderr, \"Usage: App Name <file>\"\n",
" exit(-1)\n",
" \n",
" count_text_files()\n",
" \n",
"def count_text_files():\n",
" sc = SparkContext()\n",
" logfile = sys.argv[1]\n",
" text_files_count = sc.textFile(logfile)\n",
" .filter(lambda line: '.txt' in line)\n",
" text_files_count.cache()\n",
" print(\"Number of text files: \", text_files_count.count())"
],
"language": "python",
"metadata": {},
"outputs": []
},
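{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before packaging the script, the same filter can be sketched interactively. This assumes the notebook's existing SparkContext `sc` from the earlier sections and a hypothetical sample file `data/log.txt`:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Hypothetical interactive check, assuming an existing SparkContext sc\n",
"# and a sample file data/log.txt\n",
"text_files = (sc.textFile('data/log.txt')\n",
"              .filter(lambda line: '.txt' in line))\n",
"text_files.count()"
],
"language": "python",
"metadata": {},
"outputs": []
},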
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Submit the script to Spark for processing:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"spark-submit --properties-file dir/myspark.conf script.py data/*"
],
"language": "python",
"metadata": {},
"outputs": []
}
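,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `--properties-file` flag tells spark-submit to load Spark configuration from the given file instead of the default `conf/spark-defaults.conf`. A minimal sketch of what `dir/myspark.conf` might contain (the property values below are hypothetical, not from the original notebook):"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Hypothetical contents of dir/myspark.conf:\n",
"# whitespace-separated key-value pairs of standard Spark properties\n",
"spark.master            local[*]\n",
"spark.app.name          Count Text Files\n",
"spark.executor.memory   512m"
],
"language": "python",
"metadata": {},
"outputs": []
}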
],
"metadata": {}