Erro ao executar: spark = SparkSession.builder.master('local[*]').getOrCreate()

Boa tarde!

Ao tentar executar o comando spark = SparkSession.builder.master('local[*]').getOrCreate(), ocorre o seguinte erro:

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
Input In [70], in <cell line: 1>()
----> 1 spark = SparkSession.builder.master('local[*]').getOrCreate()

File C:\Spark\spark-3.4.1-bin-hadoop3\python\pyspark\sql\session.py:477, in SparkSession.Builder.getOrCreate(self)
    475     sparkConf.set(key, value)
    476 # This SparkContext may be an existing one.
--> 477 sc = SparkContext.getOrCreate(sparkConf)
    478 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
    479 # by all sessions.
    480 session = SparkSession(sc, options=self._options)

File C:\Spark\spark-3.4.1-bin-hadoop3\python\pyspark\context.py:512, in SparkContext.getOrCreate(cls, conf)
    510 with SparkContext._lock:
    511     if SparkContext._active_spark_context is None:
--> 512         SparkContext(conf=conf or SparkConf())
    513     assert SparkContext._active_spark_context is not None
    514     return SparkContext._active_spark_context

File C:\Spark\spark-3.4.1-bin-hadoop3\python\pyspark\context.py:200, in SparkContext.__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls, memory_profiler_cls)
    198 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    199 try:
--> 200     self._do_init(
    201         master,
    202         appName,
    203         sparkHome,
    204         pyFiles,
    205         environment,
    206         batchSize,
    207         serializer,
    208         conf,
    209         jsc,
    210         profiler_cls,
    211         udf_profiler_cls,
    212         memory_profiler_cls,
    213     )
    214 except BaseException:
    215     # If an error occurs, clean up in order to allow future SparkContext creation:
    216     self.stop()

File C:\Spark\spark-3.4.1-bin-hadoop3\python\pyspark\context.py:287, in SparkContext._do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls, udf_profiler_cls, memory_profiler_cls)
    284 self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")
    286 # Create the Java SparkContext through Py4J
--> 287 self._jsc = jsc or self._initialize_context(self._conf._jconf)
    288 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
    289 self._conf = SparkConf(_jconf=self._jsc.sc().conf())

File C:\Spark\spark-3.4.1-bin-hadoop3\python\pyspark\context.py:417, in SparkContext._initialize_context(self, jconf)
    413 """
    414 Initialize SparkContext in function to allow subclass specific initialization
    415 """
    416 assert self._jvm is not None
--> 417 return self._jvm.JavaSparkContext(jconf)

File C:\Spark\spark-3.4.1-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py:1587, in JavaClass.__call__(self, *args)
   1581 command = proto.CONSTRUCTOR_COMMAND_NAME +\
   1582     self._command_header +\
   1583     args_command +\
   1584     proto.END_COMMAND_PART
   1586 answer = self._gateway_client.send_command(command)
-> 1587 return_value = get_return_value(
   1588     answer, self._gateway_client, None, self._fqn)
   1590 for temp_arg in temp_args:
   1591     if hasattr(temp_arg, "_detach"):

File C:\Spark\spark-3.4.1-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
    324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325 if answer[1] == REFERENCE_TYPE:
--> 326     raise Py4JJavaError(
    327         "An error occurred while calling {0}{1}{2}.\n".
    328         format(target_id, ".", name), value)
    329 else:
    330     raise Py4JError(
    331         "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
    332         format(target_id, ".", name, value))

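Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext. [... restante do rastreamento Java truncado na postagem original ...].java:106)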

Alguém passou por este erro?

Desde já, obrigada.

Oi Francine,

Eu notei que você está tentando rodar o projeto localmente e, dessa forma, fica difícil identificar o problema para poder te ajudar. É provável que esteja relacionado com a configuração de suas variáveis de ambiente e, nesse sentido, a postagem de nosso colega Eduardo talvez possa te ajudar.

Dê preferência para rodar o projeto no Colab. A partir das novas versões, para rodar o projeto basta instalar o pyspark e seguir o curso. Não precisa configurar variáveis de ambiente, baixar o Spark etc.

Basta rodar o seguinte código e seguir com o restante do curso:

!pip install pyspark

from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').getOrCreate()

Lembrando que este procedimento funciona no Colab e eu aconselho que você use o Colab nos seus estudos.

Espero ter ajudado. Bons estudos!

Configurar as variáveis de ambiente.

Variáveis do sistema