mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
Added snippets to demonstrate writing and running a Spark app.
This commit is contained in:
parent
9fd62a73ae
commit
1c4e2157a6
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"metadata": {
|
||||
"name": "",
|
||||
"signature": "sha256:b82a9fa9d896b3d5d681d6fee5ca75e340c9cd86d6292aa63c6f6bff1fb5ea20"
|
||||
"signature": "sha256:90f5d3daae548071bd0d43ab169b91ef919dded25d9df7effad349b4ec3641bb"
|
||||
},
|
||||
"nbformat": 3,
|
||||
"nbformat_minor": 0,
|
||||
|
@ -21,7 +21,8 @@
|
|||
"* Viewing the Spark Application UI\n",
|
||||
"* Working with Partitions\n",
|
||||
"* Caching RDDs\n",
|
||||
"* Checkpointing RDDs"
|
||||
"* Checkpointing RDDs\n",
|
||||
"* Writing and Running a Spark Application"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -660,6 +661,63 @@
|
|||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Writing and Running a Spark Application"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Create a Spark application to count the number of text files:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"import sys\n",
|
||||
"from pyspark import SparkContext\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" if len(sys.argv) < 2:\n",
|
||||
" print >> sys.stderr, \"Usage: App Name <file>\"\n",
|
||||
" exit(-1)\n",
|
||||
" \n",
|
||||
" count_text_files()\n",
|
||||
" \n",
|
||||
"def count_text_files():\n",
|
||||
" sc = SparkContext()\n",
|
||||
" logfile = sys.argv[1]\n",
|
||||
" text_files_count = sc.textFile(logfile)\n",
|
||||
" .filter(lambda line: '.txt' in line)\n",
|
||||
" text_files_count.cache()\n",
|
||||
" print(\"Number of text files: \", text_files_count.count())"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Submit the script to Spark for processing:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"spark-submit --properties-file dir/myspark.conf script.py data/*"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
}
|
||||
],
|
||||
"metadata": {}
|
||||
|
|
Loading…
Reference in New Issue
Block a user