"""Airflow DAG ``spark_create_dm_pg``: submit the ``create_dm_pg.py`` Spark job.

The DAG has no schedule (``schedule=None``) and ``catchup`` is disabled, so it
only runs when triggered explicitly. It contains a single task that hands the
application to Spark through ``SparkSubmitOperator`` using the
``spark_default`` connection.
"""

from datetime import datetime, timedelta

from airflow import DAG
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator

# Defaults applied to every task in the DAG.
default_args = {
    "owner": "anazarenko",
    "depends_on_past": False,
    # NOTE(review): this differs from the DAG-level start_date (2026-01-01)
    # below -- confirm which of the two dates is actually intended.
    "start_date": datetime(2025, 1, 1),
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

with DAG(
    dag_id="spark_create_dm_pg",
    start_date=datetime(2026, 1, 1),
    schedule=None,
    catchup=False,
    default_args=default_args,
    tags=["spark", "postgres"],
) as dag:
    spark_task = SparkSubmitOperator(
        task_id="run_spark_job",
        # NOTE(review): the conventional Airflow folder is "dags", not "dag";
        # verify that this path exists where spark-submit is executed.
        application="/opt/airflow/dag/anazarenko/spark/create_dm_pg.py",
        conn_id="spark_default",
        name="airflow-spark",
        application_args=[
            "--input",
            "s3a://anazarenko-bucket/input/",
            "--output",
            "s3a://anazarenko-bucket/output/",
        ],
        # Spark property names are case-sensitive: the event-log settings are
        # spelled "eventLog". The all-lowercase "eventlog" variants do not
        # match any Spark property, so event logging was never enabled.
        conf={
            "spark.eventLog.enabled": "true",
            "spark.eventLog.dir": "file:/opt/spark-events",
            "spark.eventLog.compress": "true",
        },
        # TODO(anazarenko): the PostgreSQL JDBC driver is currently not passed
        # to spark-submit; re-enable if the job cannot find the driver:
        # packages="org.postgresql:postgresql:42.7.3",
        verbose=True,
    )