Added Spark accumulators snippets.

This commit is contained in:
Donne Martin 2015-03-31 21:41:21 -04:00
parent 7195c5bc82
commit 011747c17a

View File

@ -1,7 +1,7 @@
{ {
"metadata": { "metadata": {
"name": "", "name": "",
"signature": "sha256:cf2651f340403267ecf82871c01a4f9deb80df25b8345be4f56dc02db970a5a7" "signature": "sha256:f13a3255902ddadfe3c0b99e722529cfe44804d81633f80f0f7681afcf3acb8a"
}, },
"nbformat": 3, "nbformat": 3,
"nbformat_minor": 0, "nbformat_minor": 0,
@ -26,7 +26,8 @@
"* Configuring Spark Applications\n", "* Configuring Spark Applications\n",
"* Streaming\n", "* Streaming\n",
"* Streaming with States\n", "* Streaming with States\n",
"* Broadcast Variables" "* Broadcast Variables\n",
"* Accumulators"
] ]
}, },
{ {
@ -1143,6 +1144,82 @@
"language": "python", "language": "python",
"metadata": {}, "metadata": {},
"outputs": [] "outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Accumulators"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create an accumulator:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"txt_count = sc.accumulator(0)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Count the number of lines in the RDD that contain '.txt':"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_data = sc.textFile(filePath)\n",
"# A lambda body must be a single expression, so the test is written as a\n",
"# conditional expression; lines without '.txt' leave the accumulator untouched.\n",
"my_data.foreach(lambda line: txt_count.add(1) if '.txt' in line else None)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Count the number of times each file type is encountered:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"jpg_count = sc.accumulator(0)\n",
"html_count = sc.accumulator(0)\n",
"css_count = sc.accumulator(0)\n",
"\n",
"# Adds 1 to the accumulator of the first extension found in s, checking\n",
"# .jpg, then .html, then .css; a line matching several is counted once.\n",
"def countFileType(s):\n",
"    if '.jpg' in s:\n",
"        jpg_count.add(1)\n",
"    elif '.html' in s:\n",
"        html_count.add(1)\n",
"    elif '.css' in s:\n",
"        css_count.add(1)\n",
"\n",
"filename = \"hdfs://logs/*\"\n",
"\n",
"logs = sc.textFile(filename)\n",
"# foreach is an action, so the accumulators are populated once it returns.\n",
"logs.foreach(countFileType)\n",
"\n",
"print 'File Type Totals:'\n",
"print '.css files: ', css_count.value\n",
"print '.html files: ', html_count.value\n",
"print '.jpg files: ', jpg_count.value"
],
"language": "python",
"metadata": {},
"outputs": []
} }
], ],
"metadata": {} "metadata": {}