Kaggle: Solving Titanic challenge using logistic regression in Spark

In this post, I will show how to tackle Kaggle's entry-level challenge, Titanic. In this challenge, you are given a training and a test dataset. Your goal is to use the training dataset to build and train a model, and then use that model to predict whether each passenger listed in the test dataset survived. Once you have your predictions, you submit the results to Kaggle, which evaluates your model's performance.

As part of this challenge, we will:

  1. Load data
  2. Explore and clean data
  3. Train our model
  4. Predict values using our model

I will be using Apache Spark and its machine learning library, MLlib, to build a logistic regression model. Keep in mind, the purpose of this post is not to build a sophisticated machine learning model but to show how you can solve Kaggle challenges using MLlib in Apache Spark.

In [4]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('kaggle-titanic').getOrCreate()
In [5]:
# Download the train and test datasets from my github repo: https://github.com/himoacs/kaggle/tree/master/titanic
# Save them locally (or upload them to your environment) so you can easily load them into a DataFrame
train_df = spark.read.format('csv').options(header='true', 
                                            inferSchema='true').load('/Users/himanshugupta/kaggle/titanic/train.csv')

train_df.show()

# Here is the description of each column provided by Kaggle
# PassengerId - Unique id for the passenger
# Survived - Whether the passenger survived or not
# Pclass - Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
# Name - Passenger's name
# Sex - Passenger's sex
# Age - Passenger's age
# SibSp - Number of siblings / spouses aboard the Titanic
# Parch - Number of parents / children aboard the Titanic
# Ticket - Ticket number
# Fare - Ticket fare
# Cabin - Cabin number
# Embarked - Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|          330877| 8.4583| null|       Q|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1|          349909| 21.075| null|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|          347742|11.1333| null|       S|
|         10|       1|     2|Nasser, Mrs. Nich...|female|14.0|    1|    0|          237736|30.0708| null|       C|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1|         PP 9549|   16.7|   G6|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|          113783|  26.55| C103|       S|
|         13|       0|     3|Saundercock, Mr. ...|  male|20.0|    0|    0|       A/5. 2151|   8.05| null|       S|
|         14|       0|     3|Andersson, Mr. An...|  male|39.0|    1|    5|          347082| 31.275| null|       S|
|         15|       0|     3|Vestrom, Miss. Hu...|female|14.0|    0|    0|          350406| 7.8542| null|       S|
|         16|       1|     2|Hewlett, Mrs. (Ma...|female|55.0|    0|    0|          248706|   16.0| null|       S|
|         17|       0|     3|Rice, Master. Eugene|  male| 2.0|    4|    1|          382652| 29.125| null|       Q|
|         18|       1|     2|Williams, Mr. Cha...|  male|null|    0|    0|          244373|   13.0| null|       S|
|         19|       0|     3|Vander Planke, Mr...|female|31.0|    1|    0|          345763|   18.0| null|       S|
|         20|       1|     3|Masselmani, Mrs. ...|female|null|    0|    0|            2649|  7.225| null|       C|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 20 rows

In [6]:
# We can also take a look at the schema
train_df.printSchema()
root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

In [7]:
# We now need to decide which of the provided features can actually be used to predict whether
# a passenger survives or not. I am going to exclude PassengerId, Name, SibSp, Parch, Ticket and
# Cabin because I don't believe these features influence the outcome. For example, whether a person
# survives doesn't depend on his or her name. We can sometimes extract additional information from
# these features, such as a passenger's title, to see if a person is a doctor and use that as a
# feature. This technique is called feature engineering and we will not be focusing on it in this
# post (see the short sketch below for a taste). Which features you select is also a personal
# decision. You may think that the number of children a passenger has on board matters whereas I
# might think it does not. While some features are obviously worth including, others can be tough
# to select.

train_df = train_df.select(['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked'])
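If you wanted to play with the title idea mentioned above, a minimal sketch might look like this. The 'Title' column name and the regular expression are my own choices, and it would need to run before the select above drops the Name column:

from pyspark.sql.functions import regexp_extract

# Titles sit between the comma and the period in the Name column,
# e.g. "Braund, Mr. Owen Harris" -> "Mr"
# (run this while train_df still contains the Name column)
with_title = train_df.withColumn('Title', regexp_extract('Name', r',\s*([^.]+)\.', 1))
with_title.groupBy('Title').count().orderBy('count', ascending=False).show()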
In [8]:
# Let's explore the dataset
train_df.describe().show()

# As we can see below, we have 891 rows, but two of the columns (Age and Embarked)
# have only 714 and 889 values respectively. This is because they have missing/NA values.
+-------+-------------------+------------------+------+------------------+-----------------+--------+
|summary|           Survived|            Pclass|   Sex|               Age|             Fare|Embarked|
+-------+-------------------+------------------+------+------------------+-----------------+--------+
|  count|                891|               891|   891|               714|              891|     889|
|   mean| 0.3838383838383838| 2.308641975308642|  null| 29.69911764705882| 32.2042079685746|    null|
| stddev|0.48659245426485753|0.8360712409770491|  null|14.526497332334035|49.69342859718089|    null|
|    min|                  0|                 1|female|              0.42|              0.0|       C|
|    max|                  1|                 3|  male|              80.0|         512.3292|       S|
+-------+-------------------+------------------+------+------------------+-----------------+--------+
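Before dropping anything, it can be handy to see exactly how many nulls each column has. A quick sketch using only standard pyspark.sql.functions:

from pyspark.sql.functions import col, count, when

# count() ignores nulls, so produce a value only when the column is null and count that
train_df.select([count(when(col(c).isNull(), c)).alias(c) for c in train_df.columns]).show()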

In [9]:
# Let's clean this data by dropping any rows with null values
train_df_clean = train_df.na.drop()
train_df_clean.describe().show()

# As you can see now, after we drop the rows with null values, we have a total of 712 rows and 
# each column has the same number of values.
+-------+------------------+------------------+------+-----------------+------------------+--------+
|summary|          Survived|            Pclass|   Sex|              Age|              Fare|Embarked|
+-------+------------------+------------------+------+-----------------+------------------+--------+
|  count|               712|               712|   712|              712|               712|     712|
|   mean|0.4044943820224719| 2.240168539325843|  null|29.64209269662921| 34.56725140449432|    null|
| stddev|0.4911389472541192|0.8368543166903446|  null|14.49293290032352|52.938648174710906|    null|
|    min|                 0|                 1|female|             0.42|               0.0|       C|
|    max|                 1|                 3|  male|             80.0|          512.3292|       S|
+-------+------------------+------------------+------+-----------------+------------------+--------+

In [10]:
# Now that we have a feel of what the data looks like and have cleaned it up, we can start analyzing the data.

from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer, OneHotEncoder
In [11]:
# Since we have some categorical features (Sex and Embarked) in our dataset, we need to one-hot
# encode them so that our machine learning model can understand them. I have a separate post that
# explains one-hot encoding: http://www.enlistq.com/feature-encoding-python-using-scikit-learn/

sex_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
sex_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

# We will need to do the same for our other categorical feature - Embarked.

embarked_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embarked_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkedVec')
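To get a feel for what these stages will do inside the pipeline, you can fit and apply an indexer on its own. A quick illustration, not part of the final pipeline:

# StringIndexer assigns each category a numeric index, most frequent category first;
# OneHotEncoder then turns that index into a sparse vector such as (1,[0],[1.0])
sex_indexer.fit(train_df_clean).transform(train_df_clean).select('Sex', 'SexIndex').distinct().show()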
In [12]:
# Now that we have our categorical features encoded, we are ready to assemble our training dataset
# into the single vector column that Spark's MLlib expects. Remember not to include the 'Survived'
# column as an input because that's what we are trying to predict.

assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'Age', 'Fare', 'EmbarkedVec'], outputCol='features')
In [13]:
# We are now ready to use a logistic regression model. I have covered logistic regression in my
# earlier post: http://www.enlistq.com/implementing-a-binomial-logistic-regression-model-in-python/

from pyspark.ml.classification import LogisticRegression
logistic_reg_model = LogisticRegression(featuresCol='features', labelCol='Survived')
In [14]:
# We will now create a pipeline to bring everything together. The stages run in the order listed,
# so each indexer comes before its corresponding encoder, and the assembler and model come last.

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[sex_indexer, embarked_indexer, sex_encoder, embarked_encoder, 
                            assembler, logistic_reg_model])
In [15]:
# We will now fit our logistic regression model on our training dataset
model_fit = pipeline.fit(train_df_clean)
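The Kaggle test set has no labels, so before submitting you may want an estimate of how well the model does. A minimal sketch using a hold-out split of the cleaned training data (the 70/30 ratio and the seed are arbitrary choices of mine):

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Hold out 30% of the cleaned training data for validation
train_split, valid_split = train_df_clean.randomSplit([0.7, 0.3], seed=42)
valid_predictions = pipeline.fit(train_split).transform(valid_split)

# Area under the ROC curve: 0.5 is random guessing, 1.0 is perfect
evaluator = BinaryClassificationEvaluator(labelCol='Survived')
print(evaluator.evaluate(valid_predictions))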
In [17]:
# According to the Kaggle submission rules, we need to predict values for the passengers 
# listed in 'test.csv' and then submit those results to Kaggle.
# The final results should include two columns: PassengerId and Survived.

test_df = spark.read.format('csv').options(header='true', 
                                           inferSchema='true').load('/Users/himanshugupta/kaggle/titanic/test.csv')

test_df = test_df.select(['PassengerId', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked'])
test_df.describe().show()
+-------+------------------+------------------+------+------------------+------------------+--------+
|summary|       PassengerId|            Pclass|   Sex|               Age|              Fare|Embarked|
+-------+------------------+------------------+------+------------------+------------------+--------+
|  count|               418|               418|   418|               332|               417|     418|
|   mean|            1100.5|2.2655502392344498|  null|30.272590361445783|  35.6271884892086|    null|
| stddev|120.81045760473994|0.8418375519640503|  null|14.181209235624424|55.907576179973844|    null|
|    min|               892|                 1|female|              0.17|               0.0|       C|
|    max|              1309|                 3|  male|              76.0|          512.3292|       S|
+-------+------------------+------------------+------+------------------+------------------+--------+

In [18]:
# As we can see, some of the rows don't have Age and/or Fare. We need to fill these with some 
# sensible values. One popular way to fill missing values is to use the mean.

age_mean = test_df.agg({'Age': 'mean'}).first()[0]
fare_mean = test_df.agg({'Fare': 'mean'}).first()[0]
test_df = test_df.fillna(age_mean, subset=['Age'])
test_df = test_df.fillna(fare_mean, subset=['Fare'])

# As we can see now, all columns have the same number of values (418)
test_df.describe().show()
+-------+------------------+------------------+------+------------------+------------------+--------+
|summary|       PassengerId|            Pclass|   Sex|               Age|              Fare|Embarked|
+-------+------------------+------------------+------+------------------+------------------+--------+
|  count|               418|               418|   418|               418|               418|     418|
|   mean|            1100.5|2.2655502392344498|  null|30.272590361445815|  35.6271884892086|    null|
| stddev|120.81045760473994|0.8418375519640503|  null|12.634534168325061|55.840500479541056|    null|
|    min|               892|                 1|female|              0.17|               0.0|       C|
|    max|              1309|                 3|  male|              76.0|          512.3292|       S|
+-------+------------------+------------------+------+------------------+------------------+--------+
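As an aside, instead of computing the means by hand as above, Spark (2.2+) ships an Imputer transformer in pyspark.ml.feature that does the same thing and can even be added as a pipeline stage. A minimal sketch (the output column names here are my own choice):

from pyspark.ml.feature import Imputer

# Fills nulls with the column mean by default (strategy='mean')
imputer = Imputer(inputCols=['Age', 'Fare'], outputCols=['AgeFilled', 'FareFilled'])
test_df_imputed = imputer.fit(test_df).transform(test_df)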

In [19]:
# We need to feed the test data to our fitted model and predict.
results = model_fit.transform(test_df)
In [20]:
# Here is what our predictions look like:
kaggle_results = results.select('PassengerId', 'prediction')
kaggle_results.show()
+-----------+----------+
|PassengerId|prediction|
+-----------+----------+
|        892|       0.0|
|        893|       0.0|
|        894|       0.0|
|        895|       0.0|
|        896|       1.0|
|        897|       0.0|
|        898|       0.0|
|        899|       0.0|
|        900|       1.0|
|        901|       0.0|
|        902|       0.0|
|        903|       0.0|
|        904|       1.0|
|        905|       0.0|
|        906|       1.0|
|        907|       1.0|
|        908|       0.0|
|        909|       0.0|
|        910|       1.0|
|        911|       1.0|
+-----------+----------+
only showing top 20 rows

In [24]:
# Write to CSV so we can submit to Kaggle. Kaggle expects exactly two columns, PassengerId and
# Survived, so we rename the prediction column, cast it to an integer and skip pandas' index column.
submission = kaggle_results.toPandas()
submission['prediction'] = submission['prediction'].astype(int)
submission.rename(columns={'prediction': 'Survived'}).to_csv(
    r'/Users/himanshugupta/kaggle/titanic/titanic_kaggle_results.csv', index=False)

You can find the code on my github.
