data-science-ipython-notebooks/data/titanic/gendermodel.py

""" This simple code is designed to teach a basic user how to read in the files in Python, find what proportion of males and females survived, and make a predictive model based on this.
Author : AstroDave
Date : 18 September 2012
Revised: 28 March 2014

"""


import csv as csv
import sys

import numpy as np

def _open_csv(path, mode):
    """Open ``path`` in the way the ``csv`` module expects on this interpreter.

    ``mode`` is ``'r'`` or ``'w'``.  Python 2's csv module works on binary
    file objects, whereas Python 3's works on text files opened with
    ``newline=''`` so that the csv module controls line endings itself.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + 'b')
    return open(path, mode, newline='')


# ---------------------------------------------------------------------------
# 1. Load the training data.
# ---------------------------------------------------------------------------
# The ``with`` block guarantees the file is closed, even if reading fails.
with _open_csv('train.csv', 'r') as train_file:
    csv_file_object = csv.reader(train_file)
    header = next(csv_file_object)          # Skip the first line: it is the header
    data = np.array(list(csv_file_object))  # Every remaining row, as a 2-D array

# Every cell was read from text, so ``data`` is an array of strings.
# A whole column is selected with data[:, k]; numeric columns must be
# converted with .astype(float) before doing arithmetic on them.

# Column 1 is used as the survival flag throughout; convert it once.
survived = data[:, 1].astype(float)
number_passengers = np.size(survived)
number_survived = np.sum(survived)
proportion_survivors = number_survived / number_passengers

# ---------------------------------------------------------------------------
# 2. Split the passengers by sex (column 4 of train.csv).
# ---------------------------------------------------------------------------
# Each comparison yields a True/False array with one entry per row.
women_only_stats = data[:, 4] == "female"   # True where the row is a woman
men_only_stats = data[:, 4] != "female"     # True for every other row

# A boolean array can be used as a "mask" in place of the row index:
# data[women_only_stats, 4] returns only 'female' entries, and
# data[women_only_stats, 1] returns the survival flag of each woman.
women_onboard = data[women_only_stats, 1].astype(float)
men_onboard = data[men_only_stats, 1].astype(float)

# Survival rate for each group = survivors / group size.
proportion_women_survived = np.sum(women_onboard) / np.size(women_onboard)
proportion_men_survived = np.sum(men_onboard) / np.size(men_onboard)

# Single-argument print(...) produces the same output on Python 2 and 3.
print('Proportion of women who survived is %s' % proportion_women_survived)
print('Proportion of men who survived is %s' % proportion_men_survived)

# ---------------------------------------------------------------------------
# 3. Write the predictions for the test set.
# ---------------------------------------------------------------------------
# The model is deliberately simplistic:
#   if female, predict that she survived (1)
#   otherwise, predict that the passenger did not survive (0)
#
# In test.csv the 'Sex' value is read from column index 3 (it is at index 4
# in train.csv), and column 0 is written out as the PassengerId.
with _open_csv('test.csv', 'r') as test_file:
    test_file_object = csv.reader(test_file)
    header = next(test_file_object)         # Skip the header row of the test file

    # The output file is only created once the test file has been opened
    # and its header read successfully.
    with _open_csv("gendermodel.csv", 'w') as predictions_file:
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["PassengerId", "Survived"])  # column headers
        for row in test_file_object:        # For each row in the test file,
            prediction = "1" if row[3] == 'female' else "0"
            predictions_file_object.writerow([row[0], prediction])
# Both files are closed automatically when their ``with`` blocks exit.
Added Kaggle Titanic data files. 2015-03-15 07:49:07 +08:00			`""" This simple code is desinged to teach a basic user to read in the files in python, simply find what proportion of males and females survived and make a predictive model based on this`
			`Author : AstroDave`
			`Date : 18 September 2012`
			`Revised: 28 March 2014`

			`"""`


			`import csv as csv`
			`import numpy as np`

			`csv_file_object = csv.reader(open('train.csv', 'rb')) # Load in the csv file`
			`header = csv_file_object.next() # Skip the fist line as it is a header`
			`data=[] # Create a variable to hold the data`

			`for row in csv_file_object: # Skip through each row in the csv file,`
			`data.append(row[0:]) # adding each row to the data variable`
			`data = np.array(data) # Then convert from a list to an array.`

			`# Now I have an array of 12 columns and 891 rows`
			`# I can access any element I want, so the entire first column would`
			`# be data[0::,0].astype(np.float) -- This means all of the rows (from start to end), in column 0`
			`# I have to add the .astype() command, because`
			`# when appending the rows, python thought it was a string - so needed to convert`

			`# Set some variables`
			`number_passengers = np.size(data[0::,1].astype(np.float))`
			`number_survived = np.sum(data[0::,1].astype(np.float))`
			`proportion_survivors = number_survived / number_passengers`

			`# I can now find the stats of all the women on board,`
			`# by making an array that lists True/False whether each row is female`
			`women_only_stats = data[0::,4] == "female" # This finds where all the women are`
			`men_only_stats = data[0::,4] != "female" # This finds where all the men are (note != means 'not equal')`

			`# I can now filter the whole data, to find statistics for just women, by just placing`
			`# women_only_stats as a "mask" on my full data -- Use it in place of the '0::' part of the array index.`
			`# You can test it by placing it there, and requesting column index [4], and the output should all read 'female'`
			`# e.g. try typing this: data[women_only_stats,4]`
			`women_onboard = data[women_only_stats,1].astype(np.float)`
			`men_onboard = data[men_only_stats,1].astype(np.float)`

			`# and derive some statistics about them`
			`proportion_women_survived = np.sum(women_onboard) / np.size(women_onboard)`
			`proportion_men_survived = np.sum(men_onboard) / np.size(men_onboard)`

			`print 'Proportion of women who survived is %s' % proportion_women_survived`
			`print 'Proportion of men who survived is %s' % proportion_men_survived`

			`# Now that I have my indicator that women were much more likely to survive,`
			`# I am done with the training set.`
			`# Now I will read in the test file and write out my simplistic prediction:`
			`# if female, then model that she survived (1)`
			`# if male, then model that he did not survive (0)`

			`# First, read in test.csv`
			`test_file = open('test.csv', 'rb')`
			`test_file_object = csv.reader(test_file)`
			`header = test_file_object.next()`

			`# Also open the a new file so I can write to it. Call it something descriptive`
			`# Finally, loop through each row in the train file, and look in column index [3] (which is 'Sex')`
			`# Write out the PassengerId, and my prediction.`

			`predictions_file = open("gendermodel.csv", "wb")`
			`predictions_file_object = csv.writer(predictions_file)`
			`predictions_file_object.writerow(["PassengerId", "Survived"]) # write the column headers`
			`for row in test_file_object: # For each row in test file,`
			`if row[3] == 'female': # is it a female, if yes then`
			`predictions_file_object.writerow([row[0], "1"]) # write the PassengerId, and predict 1`
			`else: # or else if male,`
			`predictions_file_object.writerow([row[0], "0"]) # write the PassengerId, and predict 0.`
			`test_file.close() # Close out the files.`
			`predictions_file.close()`