Added snippets for basic RDD operations.

This commit is contained in:
Donne Martin 2015-03-03 10:36:30 -05:00
parent 6c7e7b5239
commit e8b481f480

View File

@ -1,7 +1,7 @@
{ {
"metadata": { "metadata": {
"name": "", "name": "",
"signature": "sha256:91816797cb89d9542e7443d05f797ce60e70a74be3a870823401ce7b3e7c3313" "signature": "sha256:6de81fb20a6c3dee884019beb8604ace32cb3be3e7d63d3547dd433075d62e69"
}, },
"nbformat": 3, "nbformat": 3,
"nbformat_minor": 0, "nbformat_minor": 0,
@ -14,7 +14,8 @@
"source": [ "source": [
"# Spark\n", "# Spark\n",
"\n", "\n",
"* Python Shell" "* Python Shell\n",
"* RDDs"
] ]
}, },
{ {
@ -57,6 +58,169 @@
"language": "python", "language": "python",
"metadata": {}, "metadata": {},
"outputs": [] "outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RDDs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create an RDD from the contents of a directory:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_data = sc.textFile(\"file:/path/*\")"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Count the number of lines in the data:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_data.count()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Display all the lines in the data:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_data.collect()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Return the first 10 lines in the data:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_data.take(10)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create an RDD with lines matching the given filter:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_data.filter(lambda line: \".txt\" in line)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Chain a series of commands:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sc.textFile(\"file:/path/file.txt\") \\\n",
" .filter(lambda line: \".txt\" in line) \\\n",
" .count()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a new RDD by splitting each line into an array of words and keeping only the first word of each array:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"first_words = my_data.map(lambda line: line.split()[0])"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Output the first 10 words in first_words:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for word in first_words.take(10):\n",
" print word"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save the first words as text files in the given directory:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"first_words.saveAsTextFile(\"file:/path/file\")"
],
"language": "python",
"metadata": {},
"outputs": []
} }
], ],
"metadata": {} "metadata": {}