Source code for scystream.sdk.spark_manager
import pkg_resources
from pyspark.sql import SparkSession
from scystream.sdk.config import SDKConfig
from scystream.sdk.database_handling.postgres_manager import \
PostgresConfig, PostgresOperations
class SparkManager:
    def __init__(self):
        self.config: SDKConfig = SDKConfig()
        psql_jar_path = pkg_resources.resource_filename(
            "scystream.sdk", "spark_jars/postgresql-42.7.4.jar"
        )
        # When starting the ComputeBlock using the DockerOperator, we need
        # to make sure to start the container in the same network as the
        # spark-worker and the spark-master. Otherwise, the Spark jobs will
        # not execute correctly.
        self.session = SparkSession.builder \
            .master(self.config.cb_spark_master) \
            .appName(self.config.app_name) \
            .config("spark.jars", psql_jar_path) \
            .getOrCreate()
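
    # A minimal sketch (an assumption, not part of this module) of how the
    # networking requirement above can be satisfied when the ComputeBlock is
    # launched via Airflow's DockerOperator: attach the container to the same
    # Docker network as the Spark master and worker. All values below are
    # hypothetical:
    #
    #   DockerOperator(
    #       task_id="run_compute_block",     # hypothetical task id
    #       image="compute-block:latest",    # hypothetical image name
    #       network_mode="spark-network",    # hypothetical network shared
    #   )                                    # with spark-master/spark-worker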
    def setup_pg(self, config: PostgresConfig):
        return PostgresOperations(self.session, config)
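    # Note: the returned PostgresOperations reuses this manager's shared
    # SparkSession, so the PostgreSQL JDBC jar registered in __init__ is
    # already available to it.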
    def stop_session(self):
        if self.session:
            self.session.stop()
            self.session = None
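
# A minimal usage sketch for the class above, assuming a reachable Spark
# master configured via SDKConfig; the PostgresConfig arguments are elided
# because its fields are defined elsewhere in the SDK:
#
#   manager = SparkManager()
#   pg_ops = manager.setup_pg(PostgresConfig(...))  # PostgresOperations bound
#                                                   # to the shared session
#   manager.stop_session()                          # release the SparkSession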