# File listing metadata: 41 lines, 1.1 KiB, Python.
from airflow import DAG
|
|
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator
|
|
from datetime import datetime, timedelta
|
|
|
|
|
|
# Arguments passed to the DAG as ``default_args``.
# NOTE(review): the DAG in this file is created with
# start_date=datetime(2026, 1, 1), which differs from the start_date here;
# confirm which date is intended.
default_args = {
    "owner": "anazarenko",
    "depends_on_past": False,
    "start_date": datetime(2025, 1, 1),
    # One retry, attempted five minutes after the failure.
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}
|
|
|
|
# DAG "spark_create_dm_pg": no schedule (schedule=None) and no catchup.
# It contains a single task that submits the create_dm_pg.py application
# through the "spark_default" connection.
with DAG(
    dag_id="spark_create_dm_pg",
    start_date=datetime(2026, 1, 1),
    schedule=None,
    catchup=False,
    default_args=default_args,
    tags=["spark", "postgres"],
) as dag:
    spark_task = SparkSubmitOperator(
        task_id="run_spark_job",
        # NOTE(review): the path uses "dag/" rather than the conventional
        # Airflow "dags/" folder -- confirm it matches the deployment layout.
        application="/opt/airflow/dag/anazarenko/spark/create_dm_pg.py",
        conn_id="spark_default",
        name="airflow-spark",
        # Command-line arguments forwarded to the application.
        application_args=[
            "--input", "s3a://anazarenko-bucket/input/",
            "--output", "s3a://anazarenko-bucket/output/",
        ],
        conf={
            # Spark property names are case-sensitive. The event-log options
            # are spelled "eventLog"; the previous all-lowercase "eventlog"
            # keys were not recognised, so event logging stayed disabled.
            "spark.eventLog.enabled": "true",
            "spark.eventLog.dir": "file:/opt/spark-events",
            "spark.eventLog.compress": "true",
        },
        # Disabled for now; presumably needed if the job requires the
        # PostgreSQL JDBC driver -- TODO confirm:
        # packages="org.postgresql:postgresql:42.7.3",
        verbose=True,
    )
|
|
|