data-science-ipython-notebooks/spark/spark.ipynb

229 lines
4.2 KiB
Plaintext
Raw Normal View History

{
"metadata": {
"name": "",
"signature": "sha256:6de81fb20a6c3dee884019beb8604ace32cb3be3e7d63d3547dd433075d62e69"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Spark\n",
"\n",
"* Python Shell\n",
"* RDDs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Python Shell"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Start the pyspark shell:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pyspark"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the spark context:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sc"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RDDs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create an RDD from the contents of a directory:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_data = sc.textFile(\"file:/path/*\")"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Count the number of lines in the data:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_data.count()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Display the data in the data:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_data.collect()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Return the first 10 lines in the data:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_data.take(10)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create an RDD with lines matching the given filter:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_data.filter(lambda line: \".txt\" in line)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Chain a series of commands:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sc.textFile(\"file:/path/file.txt\") \\\n",
" .filter(lambda line: \".txt\" in line) \\\n",
" .count()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a new RDD mapping each line to an array of words, taking only the first word of each array:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"first_words = my_data.map(lambda line: line.split()[0])"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Output each word in first_words:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for word in first_words.take(10):\n",
" print word"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save the first words to a text file:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"first_words.saveAsTextFile(\"file:/path/file\")"
],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}