Source code for scystream.sdk.spark_manager

import pkg_resources
from pyspark.sql import SparkSession
from scystream.sdk.config import SDKConfig
from scystream.sdk.database_handling.postgres_manager import \
    PostgresConfig, PostgresOperations


class SparkManager:
    def __init__(self):
        self.config: SDKConfig = SDKConfig()

        # Bundled PostgreSQL JDBC driver, shipped inside the SDK package.
        psql_jar_path = pkg_resources.resource_filename(
            "scystream.sdk", "spark_jars/postgresql-42.7.4.jar"
        )

        # When starting the ComputeBlock using the DockerOperator, we need
        # to make sure the container starts in the same network as the
        # spark-worker and the spark-master. Otherwise, the Spark jobs will
        # not be executed correctly.
        self.session = SparkSession.builder \
            .master(self.config.cb_spark_master) \
            .appName(self.config.app_name) \
            .config("spark.jars", psql_jar_path) \
            .getOrCreate()
    def setup_pg(self, config: PostgresConfig):
        # Bind Postgres operations to the shared SparkSession.
        return PostgresOperations(self.session, config)
    def stop_session(self):
        # Stop and discard the shared SparkSession, if one is active.
        # __init__ stores the session as self.session, so that is the
        # attribute checked and cleared here.
        if self.session:
            self.session.stop()
            self.session = None
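A minimal usage sketch follows. It is not part of the module above: it assumes SDKConfig picks up cb_spark_master and app_name from the SDK configuration, and it elides the PostgresConfig connection fields, which are defined in scystream.sdk.database_handling.postgres_manager.

    from scystream.sdk.spark_manager import SparkManager
    from scystream.sdk.database_handling.postgres_manager import PostgresConfig

    # Builds (or reuses) a SparkSession against the configured Spark master,
    # with the bundled PostgreSQL JDBC driver on the classpath.
    manager = SparkManager()

    # Bind Postgres operations to the shared session; see PostgresConfig for
    # the actual connection fields (elided here).
    pg_ops = manager.setup_pg(PostgresConfig(...))

    # ... run Spark / Postgres work through pg_ops ...

    # Release cluster resources once the work is done.
    manager.stop_session()

The networking constraint noted in __init__ can also be sketched. Assuming the ComputeBlock container is launched through Airflow's DockerOperator (an assumption; this module only documents the constraint, not the orchestration), the container can be attached to the Spark containers' network via the operator's network_mode parameter; the task id, image, and network name below are illustrative.

    from airflow.providers.docker.operators.docker import DockerOperator

    # Hypothetical task definition: attach the ComputeBlock container to the
    # same Docker network as spark-master and spark-worker, so the Spark jobs
    # submitted from inside the container can reach the cluster.
    compute_block = DockerOperator(
        task_id="run_compute_block",        # illustrative task id
        image="my-compute-block:latest",    # illustrative image
        network_mode="spark-network",       # must match the Spark containers' network
    )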