general / bcom-tp-etl-transformation-pipelines · Commits

Commit 8dff47d1, authored Jul 25, 2023 by Cristian Aguirre
Update 25-07-23. New parameter in config: cloud_provider

Parent: 56c7f2a2
Showing 8 changed files with 146 additions and 50 deletions (+146 / -50):
- dags/components/Extractor.py (+1 / -1)
- dags/components/Sensor.py (+26 / -13)
- dags/dag_conf.yml (+18 / -17)
- dags/dag_transformacion_bcom.py (+4 / -4)
- dags/dag_transformacion_tacomventas_promoresidencial.py (+15 / -15)
- dags/enums/ProviderTypeEnum.py (+7 / -0)
- dags/procedure_definition2.json (+28 / -0)
- deploy-k8/sync-dags-deployment-gcs.yaml (+47 / -0)
dags/components/Extractor.py

@@ -140,7 +140,7 @@ def extract_from_source(command: str, source_conn, intern_conn, chunksize: int,
         logger.info("Guardado correctamente todos los datos")
         source_conn.close_basic_connection()
     else:
-        if command.replace(" ", "").lower().find("|select"):
+        if command.replace(" ", "").lower().find("|select") != -1:
             command = command[command.find("select"):]
         steps = get_steps(command, chunksize, source_engine)
         # Traemos el iterator
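The one-line change above fixes a truthiness bug: str.find returns the match index (0 when the match starts the string) and -1 when the substring is absent, so using the result directly as a condition misreads both cases. A minimal sketch (not code from the repository) of the difference:

```python
# Illustration of why the explicit "!= -1" comparison is needed:
# str.find returns an index, not a boolean.
command = "|select * from tabla1"   # hypothetical command string
normalized = command.replace(" ", "").lower()

# Old check: find() returns 0 here (match at position 0), which is falsy,
# so the branch is skipped even though "|select" is present.
print(bool(normalized.find("|select")))            # False

# It also returns -1 (truthy) when the marker is absent, entering the branch by mistake.
print(bool("select*fromtabla1".find("|select")))   # True

# New check: comparing against -1 detects presence correctly.
print(normalized.find("|select") != -1)            # True
```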
dags/components/Sensor.py

 from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor
+from airflow.providers.google.cloud.sensors.gcs import GCSObjectExistenceSensor
+from enums.ProviderTypeEnum import ProviderTypeEnum
 import logging

@@ -10,19 +13,29 @@ TIMEOUT = 60*1
 VERIFY_SSL = False


-def create_s3_sensor(task_id: str, connection: str, bucket: str, key: str) -> S3KeySensor:
-    s3_sensor = None
+def create_sensor(task_id: str, connection: str, bucket: str, key: str, provider: str = "google"):
+    sensor = None
     try:
-        s3_sensor = S3KeySensor(
-            task_id=task_id,
-            bucket_key=key,
-            bucket_name=bucket,
-            wildcard_match=True,
-            aws_conn_id=connection,
-            verify=VERIFY_SSL,
-            poke_interval=POKE_INTERVAL,
-            timeout=TIMEOUT
-        )
+        if provider == ProviderTypeEnum.GOOGLE.value:
+            sensor = GCSObjectExistenceSensor(
+                task_id=task_id,
+                bucket=bucket,
+                object=key,
+                google_cloud_conn_id=connection,
+                poke_interval=POKE_INTERVAL,
+                timeout=TIMEOUT
+            )
+        else:
+            sensor = S3KeySensor(
+                task_id=task_id,
+                bucket_key=key,
+                bucket_name=bucket,
+                wildcard_match=True,
+                aws_conn_id=connection,
+                verify=VERIFY_SSL,
+                poke_interval=POKE_INTERVAL,
+                timeout=TIMEOUT
+            )
     except Exception as e:
         logger.error(f"Error creando Sensor S3. {e}")
-    return s3_sensor
+    return sensor
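A minimal usage sketch of the new provider switch (assuming the module paths shown in this diff; the task IDs, connection ID, bucket, and keys below are placeholders for illustration, the bucket and connection names being borrowed from the config file):

```python
from components.Sensor import create_sensor
from enums.ProviderTypeEnum import ProviderTypeEnum

# provider="google" (the default) returns a GCSObjectExistenceSensor
# that pokes for the given object in the GCS bucket.
gcs_sensor = create_sensor(
    task_id="SCRIPTS-SENSOR-GCS",          # placeholder task id
    connection="conn_script",
    bucket="prueba-airflow",
    key="bcom_scripts/example.sql",        # placeholder key
    provider=ProviderTypeEnum.GOOGLE.value,
)

# Any other provider value ("aws", "local", ...) falls through to the
# original S3KeySensor branch, which also covers S3-compatible stores.
s3_sensor = create_sensor(
    task_id="SCRIPTS-SENSOR-S3",           # placeholder task id
    connection="conn_script",
    bucket="prueba-airflow",
    key="bcom_scripts/?*",
    provider=ProviderTypeEnum.AMAZON.value,
)
```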
dags/dag_conf.yml

@@ -3,38 +3,39 @@ app:
   database:
     sources:
       source1:
-        type: oracle
-        host: 192.168.27.22
-        port: 21521
-        username: PRUEBABCOM2
-        password: admin
-        database: bd_tp_qa
+        type: postgres
+        host: airflow
+        port: 5432
+        username: airflow
+        password: airflow
+        database: postgres
         service: ORCLPDB1
-        schema: public
+        schema: sources
     transformation:
-      type: mysql
-      host: 192.168.1.9
-      port: 13306
-      username: root
-      password: root
-      database: prueba_bcom
+      type: postgres
+      host: airflow
+      port: 5432
+      username: airflow
+      password: airflow
+      database: postgres
       service:
-      schema:
+      schema: intern_db
   chunksize: 8000
   label_multiple_select: TABLENAME
   source_mask: select  # Sufijo (S)
   procedure_mask: procedure  # S
   transformation_mask: transform  # S
   prefix_order_delimiter: .
+  cloud_provider: google
   scripts:
     s3_params:
-      bucket: prueba1234568
+      bucket: prueba-airflow
       prefix: bcom_scripts
       connection_id: conn_script
   control:
     s3_params:
       connection_id: conn_script
-      bucket: prueba1234568
+      bucket: prueba-airflow
       prefix: bcom_control
       filename: control_example.json
   timezone: 'GMT-5'
@@ -45,7 +46,7 @@ app:
   delimiter: '|'
   tmp_path: /tmp
   s3_params:
-    bucket: prueba1234568
+    bucket: prueba-airflow
     prefix: bcom_results
    connection_id: conn_script
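The new cloud_provider key is what the DAG below passes to create_sensor. A hypothetical sketch of resolving it, using the same default as the function (the PyYAML loading and the exact nesting are assumptions reconstructed from the flattened diff, not code from the repository):

```python
import yaml  # assumes PyYAML is available, as is typical for Airflow images

def get_cloud_provider(conf_path: str = "/opt/airflow/dags/dag_conf.yml") -> str:
    """Return the configured provider, using create_sensor's default when absent."""
    with open(conf_path) as f:
        conf = yaml.safe_load(f)["app"]   # "app" root taken from the hunk header above
    return conf.get("cloud_provider", "google")

print(get_cloud_provider())  # -> "google" with the config in this commit
```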
dags/dag_transformacion_bcom.py

@@ -9,7 +9,7 @@ from airflow.utils.task_group import TaskGroup
 from components.Utils import update_sql_commands
 from components.Xcom import save_commands_to_xcom
 from components.S3Route import get_files_from_prefix, get_file_from_key
-from components.Sensor import create_s3_sensor
+from components.Sensor import create_sensor
 from components.Extractor import get_extract_task_group
 from components.Transformation import get_transform_task_group
 from components.Generation import get_generate_task_group
@@ -26,7 +26,7 @@ DAG_NAME = "BCOM_DAG_TRANSFORMACIONES3"
 # Change this path if is deployed in prod or dev
 MAIN_PATH = "/opt/airflow/dags/"
-JSON_PROCEDURE_PATH = MAIN_PATH + "procedure_definition.json"
+JSON_PROCEDURE_PATH = MAIN_PATH + "procedure_definition2.json"
 DEFAULT_ARGS = {
     'owner': 'BCOM',
@@ -149,8 +149,8 @@ def set_dag():
         wildcard_scripts = scripts_s3["prefix"] + "?*"
     else:
         wildcard_scripts = scripts_s3["prefix"] + "/?*"
-    sensor_scripts = create_s3_sensor("SCRIPTS-SENSOR", scripts_s3["connection_id"], scripts_s3["bucket"],
-                                      wildcard_scripts)
+    sensor_scripts = create_sensor("SCRIPTS-SENSOR", scripts_s3["connection_id"], scripts_s3["bucket"],
+                                   wildcard_scripts, conf["cloud_provider"])
     control_s3 = conf["control"]["s3_params"]
     # Scripts extraction
     extract_mask = conf["source_mask"]
dags/dag_transformacion_tacomventas_promoresidencial.py

@@ -6,7 +6,7 @@ import pandas as pd
 import numpy as np
 from components.S3Route import get_df_from_s3, get_base_date, save_df_to_s3, move_object_s3
-from components.Sensor import create_s3_sensor
+from components.Sensor import create_sensor
 from components.Utils import get_modified_prefix, remove_invalid_rows, remove_fields, update_dict_with_catalogs
 from airflow import DAG
@@ -256,26 +256,26 @@ def set_dag_1():
     catalogs_dict = update_dict_with_catalogs(catalogs_dict, conf, "no_promocion", s3_prefix)
     # Define the sensor to verify if data exists or have been updated
-    s3_sensor_tacom = create_s3_sensor("S3_sensor_tacom_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
-                                       s3_tacom)
-    s3_sensor_promo = create_s3_sensor("S3_sensor_promo_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
-                                       s3_promo)
-    s3_sensor_promo_catalog = create_s3_sensor("S3_sensor_promo_catalog_task", s3_conf["s3_conn_id"],
-                                               s3_conf["bucket"], catalogs_dict["s3_catalogo_promociones"])
-    s3_sensor_3a2p = create_s3_sensor("S3_sensor_3a2p_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
-                                      catalogs_dict["s3_relacion3pa2p"])
-    s3_sensor_poid = create_s3_sensor("S3_sensor_poid_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
-                                      catalogs_dict["s3_relacionpoidpaquete"])
-    s3_sensor_paq = create_s3_sensor("S3_sensor_paq_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
-                                     catalogs_dict["s3_relacion_paquetes"])
-    s3_sensor_notpromo = create_s3_sensor("S3_sensor_notpromo_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
-                                          catalogs_dict["s3_no_promocion"])
+    s3_sensor_tacom = create_sensor("S3_sensor_tacom_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
+                                    s3_tacom)
+    s3_sensor_promo = create_sensor("S3_sensor_promo_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
+                                    s3_promo)
+    s3_sensor_promo_catalog = create_sensor("S3_sensor_promo_catalog_task", s3_conf["s3_conn_id"],
+                                            s3_conf["bucket"], catalogs_dict["s3_catalogo_promociones"])
+    s3_sensor_3a2p = create_sensor("S3_sensor_3a2p_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
+                                   catalogs_dict["s3_relacion3pa2p"])
+    s3_sensor_poid = create_sensor("S3_sensor_poid_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
+                                   catalogs_dict["s3_relacionpoidpaquete"])
+    s3_sensor_paq = create_sensor("S3_sensor_paq_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
+                                  catalogs_dict["s3_relacion_paquetes"])
+    s3_sensor_notpromo = create_sensor("S3_sensor_notpromo_task", s3_conf["s3_conn_id"], s3_conf["bucket"],
+                                       catalogs_dict["s3_no_promocion"])
     outputs = conf["s3_parameters"]["outputs"]
     output_prefix = outputs["prefix"]
dags/enums/ProviderTypeEnum.py (new file, mode 100755)

+from enum import Enum
+
+
+class ProviderTypeEnum(Enum):
+    GOOGLE = "google"
+    AMAZON = "aws"
+    MINIO = "local"
dags/procedure_definition2.json (new file, mode 100644)

+[
+  {
+    "identifier": "TABLA1",
+    "fields": [
+      {
+        "name": "columna1",
+        "datatype": "TEXT",
+        "maxLength": 50
+      },
+      {
+        "name": "columna2",
+        "datatype": "NUMBER"
+      },
+      {
+        "name": "columna3",
+        "datatype": "BOOLEAN"
+      },
+      {
+        "name": "columna4",
+        "datatype": "NUMBER"
+      },
+      {
+        "name": "columna5",
+        "datatype": "DECIMAL"
+      }
+    ]
+  }
+]
\ No newline at end of file
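procedure_definition2.json describes the target table TABLA1 by column name and datatype label. A hypothetical sketch of turning such a definition into table metadata (this is not the pipeline's real loader; the SQLAlchemy type mapping is an assumption for illustration):

```python
import json
from sqlalchemy import Boolean, Column, Integer, MetaData, Numeric, String, Table

# Illustrative mapping from the datatype labels used in the JSON file.
DATATYPE_MAP = {
    "TEXT": lambda field: String(field.get("maxLength", 255)),
    "NUMBER": lambda field: Integer(),
    "BOOLEAN": lambda field: Boolean(),
    "DECIMAL": lambda field: Numeric(),
}

def build_metadata(path: str = "/opt/airflow/dags/procedure_definition2.json") -> MetaData:
    """Build SQLAlchemy table metadata from a procedure-definition file."""
    metadata = MetaData()
    with open(path) as f:
        definitions = json.load(f)
    for definition in definitions:
        columns = [
            Column(field["name"], DATATYPE_MAP[field["datatype"]](field))
            for field in definition["fields"]
        ]
        Table(definition["identifier"], metadata, *columns)
    return metadata
```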
deploy-k8/sync-dags-deployment-gcs.yaml (new file, mode 100644)

+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: airflow-sync-dags
+  namespace: bcom-airflow
+spec:
+  selector:
+    matchLabels:
+      app: airflow-sync-dags
+  template:
+    metadata:
+      labels:
+        app: airflow-sync-dags
+    spec:
+      containers:
+        - args:
+            - while true; gcloud rsync -d -r ${GCS_DAGS_DIR:-gs://prueba-rsync/carpeta} /dags;
+              do sleep ${SYNCHRONYZE_DAG_DIR:-30}; done;
+          command:
+            - /bin/bash
+            - -c
+            - --
+          name: sync-dags-gcloud
+          image: gcr.io/google.com/cloudsdktool/google-cloud-cli:alpine
+          envFrom:
+            - configMapRef:
+                name: airflow-envvars-configmap
+          env:
+            - name: AWS_ACCESS_KEY_ID
+              valueFrom:
+                secretKeyRef:
+                  key: AWS_ACCESS_KEY
+                  name: credentials
+            - name: AWS_SECRET_ACCESS_KEY
+              valueFrom:
+                secretKeyRef:
+                  key: AWS_SECRET_KEY
+                  name: credentials
+          volumeMounts:
+            - name: dags-host-volume
+              mountPath: /dags
+      volumes:
+        - name: dags-host-volume
+          persistentVolumeClaim:
+            claimName: airflow-dags-pvc