Apache Spark 2 installation on Linux


$ cd ${HOME}
$ wget https://archive.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz
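
Optionally verify the download against the checksum Apache publishes alongside it (compare the digests by eye; Apache's .sha512 files are not in sha512sum -c format):

$ wget https://archive.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz.sha512
$ sha512sum spark-2.4.3-bin-hadoop2.7.tgz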

$ ls
spark-2.4.3-bin-hadoop2.7.tgz

$ tar xf spark-2.4.3-bin-hadoop2.7.tgz

$ sudo mkdir -p /opt/spark-2.4.3
$ sudo mv spark-2.4.3-bin-hadoop2.7/* /opt/spark-2.4.3
$ sudo ln -s /opt/spark-2.4.3/ /opt/spark
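
A version-specific directory plus a stable /opt/spark symlink makes later upgrades a one-liner: install the new release beside the old one and repoint the link (2.4.4 below is only a placeholder version):

$ sudo ln -sfn /opt/spark-2.4.4 /opt/spark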


$ rm -rf spark-2.4.3-bin-hadoop2.7/
$ rm -f spark-2.4.3-bin-hadoop2.7.tgz


$ sudo vi /etc/profile.d/spark.sh   # scripts here are sourced automatically for login shells


#### SPARK 2.4.3 #######################

export SPARK_HOME=/opt/spark
export PATH=${SPARK_HOME}/bin:$PATH

#### SPARK 2.4.3 #######################


$ source /etc/profile.d/spark.sh
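
A quick check that the variables took effect:

$ echo $SPARK_HOME
/opt/spark
$ which spark-shell
/opt/spark/bin/spark-shell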


Test the installation

$ spark-shell
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/
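
Exit the shell with:

scala> :quit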


$ cd ${SPARK_HOME}/bin
$ ./run-example SparkPi 10
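
SparkPi estimates π by Monte Carlo: it throws random points at the square [-1, 1] x [-1, 1] and counts how many land inside the unit circle; that fraction approaches π/4. A minimal PySpark sketch of the same idea (file name, app name, and sample count are arbitrary):

# pi.py -- a hypothetical name; mirrors the idea of the bundled SparkPi example
from operator import add
from random import random

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('pi-sketch').getOrCreate()

n = 1000000  # number of random samples (arbitrary)

def inside(_):
    # draw a point in [-1, 1] x [-1, 1]; count it if it falls inside the unit circle
    x, y = random() * 2 - 1, random() * 2 - 1
    return 1 if x * x + y * y <= 1 else 0

# the fraction of hits approaches pi/4, so scale by 4
count = spark.sparkContext.parallelize(range(n), 10).map(inside).reduce(add)
print('Pi is roughly', 4.0 * count / n)
spark.stop()

spark-submit sets up the Python path itself, so this runs even before the PySpark setup below:

$ spark-submit --master local[2] pi.py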


$ pyspark --master local[2]
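
local[2] runs Spark on this machine with two worker threads. Inside the shell, sc and spark are already defined, so a one-line smoke test looks like:

>>> sc.parallelize(range(100)).sum()   # sc is predefined in the shell
4950
>>> spark.range(5).count()             # so is the SparkSession, as spark
5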


PySpark and Jupyter Notebook installation

# if the JDK is not installed (Spark 2.4 requires Java 8)
$ sudo apt install -y openjdk-8-jdk
$ sudo update-alternatives --config java
$ java -version


# if Scala is not installed (optional: the Spark tarball bundles its own
# Scala runtime, so this is only needed for a standalone scala REPL)
$ sudo apt install -y scala


$ sudo apt install -y python3-pip


$ pip3 install --user py4j
$ pip3 install --user jupyter
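
These land under ~/.local/bin, which is why the PATH line below is needed. Note that Spark 2.4.3 ships its own py4j (0.10.7) under ${SPARK_HOME}/python/lib, so if the pip-installed version ever causes import errors, pinning it is a safe fallback:

$ pip3 install --user py4j==0.10.7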


$ sudo vi /etc/profile.d/pyspark.sh


#### SPARK #######################

export PATH=$PATH:~/.local/bin/
export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH

export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS='notebook'
export PYSPARK_PYTHON='python3'

#### SPARK #######################


$ source /etc/profile.d/pyspark.sh
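
Confirm the new variables are live:

$ echo $PYSPARK_DRIVER_PYTHON
jupyter
$ which jupyter     # should resolve under ~/.local/bin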


# give your user write access so notebooks can be saved under ${SPARK_HOME}/python
# (a tighter alternative: sudo chown -R $USER /opt/spark-2.4.3)
$ sudo chmod -R 777 /opt/spark/

$ pip3 install --user findspark


$ cd ${SPARK_HOME}/python/
$ jupyter-notebook --ip 192.168.0.11 --port 8080   # replace 192.168.0.11 with your host's address (or 0.0.0.0 to listen on all interfaces)


Create a new Python 3 notebook and run the cell below. findspark locates the Spark installation and appends its python/ directory to sys.path at runtime, so the pyspark import resolves without relying on PYTHONPATH:


import findspark
findspark.init('/opt/spark')  # point findspark at the installation
import pyspark
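

If the import succeeds, a short smoke test (app name is arbitrary) confirms the notebook can drive Spark:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[2]').appName('notebook-test').getOrCreate()
spark.range(10).show()   # a one-column DataFrame with ids 0 through 9
spark.stop()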