mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
Added Spark accumulators snippets.
This commit is contained in:
parent
7195c5bc82
commit
011747c17a
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"name": "",
|
"name": "",
|
||||||
"signature": "sha256:cf2651f340403267ecf82871c01a4f9deb80df25b8345be4f56dc02db970a5a7"
|
"signature": "sha256:f13a3255902ddadfe3c0b99e722529cfe44804d81633f80f0f7681afcf3acb8a"
|
||||||
},
|
},
|
||||||
"nbformat": 3,
|
"nbformat": 3,
|
||||||
"nbformat_minor": 0,
|
"nbformat_minor": 0,
|
||||||
|
@ -26,7 +26,8 @@
|
||||||
"* Configuring Spark Applications\n",
|
"* Configuring Spark Applications\n",
|
||||||
"* Streaming\n",
|
"* Streaming\n",
|
||||||
"* Streaming with States\n",
|
"* Streaming with States\n",
|
||||||
"* Broadcast Variables"
|
"* Broadcast Variables\n",
|
||||||
|
"* Accumulators"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -1143,6 +1144,82 @@
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": []
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Accumulators"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Create an accumulator:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"txt_count = sc.accumulator(0)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Count the number of lines in the RDD that contain '.txt':"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"my_data = sc.textFile(filePath)\n",
|
||||||
|
"my_data.foreach(lambda line: txt_count.add(1) if '.txt' in line else None)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Count the number of occurrences of each file type encountered:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"jpg_count = sc.accumulator(0)\n",
|
||||||
|
"html_count = sc.accumulator(0)\n",
|
||||||
|
"css_count = sc.accumulator(0)\n",
|
||||||
|
"\n",
|
||||||
|
"def countFileType(s):\n",
|
||||||
|
" if '.jpg' in s: jpg_count.add(1)\n",
|
||||||
|
" elif '.html' in s: html_count.add(1)\n",
|
||||||
|
" elif '.css' in s: css_count.add(1)\n",
|
||||||
|
"\n",
|
||||||
|
"filename=\"hdfs://logs/*\"\n",
|
||||||
|
"\n",
|
||||||
|
"logs = sc.textFile(filename)\n",
|
||||||
|
"logs.foreach(lambda line: countFileType(line))\n",
|
||||||
|
"\n",
|
||||||
|
"print 'File Type Totals:'\n",
|
||||||
|
"print '.css files: ', css_count.value\n",
|
||||||
|
"print '.html files: ', html_count.value\n",
|
||||||
|
"print '.jpg files: ', jpg_count.value"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {}
|
"metadata": {}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user