mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
Added Spark broadcast variables snippets.
This commit is contained in:
parent
b3fb4ae219
commit
7195c5bc82
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"name": "",
|
"name": "",
|
||||||
"signature": "sha256:0fb24b809c8caadd2f68f7898036387e846da3f839a97e63be9a65aa17f3dbb1"
|
"signature": "sha256:cf2651f340403267ecf82871c01a4f9deb80df25b8345be4f56dc02db970a5a7"
|
||||||
},
|
},
|
||||||
"nbformat": 3,
|
"nbformat": 3,
|
||||||
"nbformat_minor": 0,
|
"nbformat_minor": 0,
|
||||||
|
@ -25,7 +25,8 @@
|
||||||
"* Writing and Running a Spark Application\n",
|
"* Writing and Running a Spark Application\n",
|
||||||
"* Configuring Spark Applications\n",
|
"* Configuring Spark Applications\n",
|
||||||
"* Streaming\n",
|
"* Streaming\n",
|
||||||
"* Streaming with States"
|
"* Streaming with States\n",
|
||||||
|
"* Broadcast Variables"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -1080,18 +1081,65 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": []
|
"outputs": []
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Broadcast Variables"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Read in list of items to broadcast from a local file:"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"input": [],
|
"input": [
|
||||||
|
"broadcast_file = \"broadcast.txt\"\n",
|
||||||
|
"broadcast_list = list(map(lambda l: l.strip(), open(broadcast_file)))"
|
||||||
|
],
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": []
|
"outputs": []
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Broadcast the target list to all workers:"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"input": [],
|
"input": [
|
||||||
|
"broadcast_list_sc = sc.broadcast(broadcast_list)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Filter based on the broadcast list:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"log_file = \"hdfs://localhost/user/logs/*\"\n",
|
||||||
|
"filtered_data = sc.textFile(log_file)\\\n",
|
||||||
|
" .filter(lambda line: any(item in line for item in broadcast_list_sc.value))\n",
|
||||||
|
"\n",
|
||||||
|
"filtered_data.take(10)"
|
||||||
|
],
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": []
|
"outputs": []
|
||||||
|
|
Loading…
Reference in New Issue
Block a user