Files
spark/pet-project/dag_spark_create_dm_pg.py
2026-06-12 16:30:32 +07:00

41 lines
1.1 KiB
Python

from airflow import DAG
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator
from datetime import datetime, timedelta
# Task-level defaults handed to the DAG via `default_args`: one retry after a
# five-minute pause, and no dependency on the previous run's state.
# NOTE(review): the DAG definition in this file passes its own start_date
# (2026-01-01), which differs from the value here -- confirm which is intended.
default_args = dict(
    owner="anazarenko",
    depends_on_past=False,
    start_date=datetime(2025, 1, 1),
    retries=1,
    retry_delay=timedelta(minutes=5),
)
# DAG with no schedule (schedule=None, catchup disabled) containing a single
# task that submits the create_dm_pg.py application through the
# "spark_default" Airflow connection.
with DAG(
    dag_id="spark_create_dm_pg",
    start_date=datetime(2026, 1, 1),
    schedule=None,
    catchup=False,
    default_args=default_args,
    tags=["spark", "postgres"],
) as dag:
    spark_task = SparkSubmitOperator(
        task_id="run_spark_job",
        # NOTE(review): the path uses ".../dag/..." rather than the more
        # common ".../dags/..." -- confirm it matches the deployed layout.
        application="/opt/airflow/dag/anazarenko/spark/create_dm_pg.py",
        conn_id="spark_default",
        name="airflow-spark",
        # Command-line arguments forwarded to the Spark application.
        application_args=[
            "--input", "s3a://anazarenko-bucket/input/",
            "--output", "s3a://anazarenko-bucket/output/",
        ],
        conf={
            # Spark property names are case-sensitive; the event-log settings
            # are spelled "eventLog". The previous all-lowercase "eventlog"
            # keys were not recognised, so event logging stayed disabled.
            "spark.eventLog.enabled": "true",
            "spark.eventLog.dir": "file:/opt/spark-events",
            "spark.eventLog.compress": "true",
        },
        # TODO(anazarenko): currently disabled; re-enable if the job needs the
        # PostgreSQL JDBC driver resolved at submit time.
        # packages="org.postgresql:postgresql:42.7.3",
        verbose=True,
    )