bcom-tp-etl-transformation-pipelines / Commits

Commit c22cefd9, authored Jul 27, 2023 by Cristian Aguirre
Parent: 8dff47d1

    Update 26-07-23. Add new provider: GCP to get inputs and save results.

Showing 12 changed files with 126 additions and 69 deletions (+126 / -69)
dags/components/Cleaning.py                                   +4   -4
dags/components/DatabaseOperation/DatabaseExtraction.py       +1   -1
dags/components/DatabaseOperation/DatabaseTransformation.py   +1   -1
dags/components/Extractor.py                                  +7   -7
dags/components/Generation.py                                 +9   -7
dags/components/Model/InsumoModel.py                          +2   -2
dags/components/S3Route.py                                    +52  -17
dags/components/Sensor.py                                     +24  -4
dags/components/Transformation.py                             +6   -5
dags/components/Utils.py                                      +0   -2
dags/dag_transformacion_bcom.py                               +19  -18
dags/dag_transformacion_tacomventas_promoresidencial.py       +1   -1
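The commit threads a provider value (read from conf["cloud_provider"]) through every task group so the storage helpers can switch between the S3/MinIO hook and the GCS hook. As a minimal sketch, enums/ProviderTypeEnum.py is assumed to look roughly like the following; the member names AMAZON, MINIO and GOOGLE come from the diffs below, but the string values are assumptions since that file is not part of this commit:

# Hypothetical sketch of enums/ProviderTypeEnum.py.
# Member names appear in the diff; the string values are assumed.
from enum import Enum

class ProviderTypeEnum(Enum):
    AMAZON = "aws"     # assumed value
    MINIO = "minio"    # assumed value
    GOOGLE = "gcp"     # assumed value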
dags/components/Cleaning.py

@@ -16,7 +16,7 @@ import logging
 logger = logging.getLogger()


-def validate_clean(control_params: Dict[str, Any], **kwargs) -> None:
+def validate_clean(control_params: Dict[str, Any], provider: str, **kwargs) -> None:
     delete_task_instances()
     ti = kwargs["ti"]
     conf = ti.xcom_pull(task_ids="VALIDATE_GENERATOR", key="CONTROL-CONFIG")
@@ -27,7 +27,7 @@ def validate_clean(control_params: Dict[str, Any], **kwargs) -> None:
         prefix += "/"
     key = prefix + control_params["filename"]
     conf = json.dumps(conf, indent=2, default=str)
-    loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key)
+    loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key, provider)
     if loaded:
         logger.info(f"Cargado correctamente el archivo de control en {key}")
     delete_all_xcom_tasks()
@@ -61,7 +61,7 @@ def get_cleaners_from_xcom(**kwargs):
     return [[item] for item in final_selects]


-def get_cleaning_task_group(db_intern_conn, control_s3: Dict[str, Any]) -> TaskGroup or None:
+def get_cleaning_task_group(db_intern_conn, control_s3: Dict[str, Any], provider: str) -> TaskGroup or None:
     group = None
     try:
         with TaskGroup(group_id="LimpiezaDelProceso", prefix_group_id=False) as group:
@@ -76,7 +76,7 @@ def get_cleaning_task_group(db_intern_conn, control_s3: Dict[str, Any]) -> TaskG
            validate_task = PythonOperator(
                task_id="VALIDATE_CLEANER",
                python_callable=validate_clean,
-               op_kwargs={'control_params': control_s3},
+               op_kwargs={'control_params': control_s3, 'provider': provider},
                trigger_rule='none_skipped')

            cleaners >> tasks >> validate_task
dags/components/DatabaseOperation/DatabaseExtraction.py

@@ -8,7 +8,7 @@ def get_steps(sql_command: str, chunksize: int, connection, is_tablename: bool =
     final_steps = 0
     try:
         if is_tablename:
-            count_command = f"SELECT COUNT(*) FROM {sql_command}"
+            count_command = f'SELECT COUNT(*) FROM "{sql_command}"'
         else:
             count_command = f"SELECT COUNT(*) FROM ({sql_command}) BCOM"
         with connection.connect() as conn:
dags/components/DatabaseOperation/DatabaseTransformation.py

@@ -18,7 +18,7 @@ def execute_transformations(commands: List[str], engine):
 def delete_table(tablename: str, engine) -> bool:
     delete = False
     try:
-        command = f"DROP TABLE {tablename}"
+        command = f'DROP TABLE "{tablename}"'
        start_time = time.time()
        with engine.connect() as conn:
            try:
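Both database helpers now wrap the table name in double quotes, which matters for engines such as PostgreSQL where unquoted identifiers are folded to lower case and mixed-case names would otherwise not resolve. A small illustration with a hypothetical table name:

# Illustration only; "MyTable" is a made-up, mixed-case table name.
tablename = "MyTable"

unquoted = f"SELECT COUNT(*) FROM {tablename}"    # identifier folded to mytable on PostgreSQL-like engines
quoted = f'SELECT COUNT(*) FROM "{tablename}"'    # targets the exact identifier

print(unquoted)  # SELECT COUNT(*) FROM MyTable
print(quoted)    # SELECT COUNT(*) FROM "MyTable"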
dags/components/Extractor.py

 from typing import Any, Dict
 import json
 import numpy as np
 import pandas as pd

 from enums.DatabaseTypeEnum import DatabaseTypeEnum
 from enums.ProcessStatusEnum import ProcessStatusEnum
 from components.Utils import select_multiple, generateModel
 from components.DatabaseOperation.DatabaseExtraction import get_iterator, get_steps
@@ -23,7 +23,7 @@ import logging
 logger = logging.getLogger()


-def validate_extractor(control_params: Dict[str, Any], timezone: str, **kwargs) -> None:
+def validate_extractor(control_params: Dict[str, Any], timezone: str, provider: str, **kwargs) -> None:
     delete_task_instances()
     ti = kwargs["ti"]
     success_tasks = ti.xcom_pull(task_ids="EXTRACTORS", key="SUCCESS_TASKS")
@@ -49,7 +49,7 @@ def validate_extractor(control_params: Dict[str, Any], timezone: str, **kwargs)
         prefix += "/"
     key = prefix + control_params["filename"]
     conf = json.dumps(conf, indent=2, default=str)
-    loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key)
+    loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key, provider)
     if loaded:
         logger.info(f"Cargado correctamente el archivo de control en {key}")
     delete_all_xcom_tasks()
@@ -141,14 +141,14 @@ def extract_from_source(command: str, source_conn, intern_conn, chunksize: int,
         source_conn.close_basic_connection()
     else:
         if command.replace(" ", "").lower().find("|select") != -1:
-            command = command[command.find("select"):]
+            command = command[command.lower().find("select"):]
         steps = get_steps(command, chunksize, source_engine)
         # Traemos el iterator
         iterator = get_iterator(command, chunksize, source_engine)
         logger.info(f"Número de pasos para migrar datos: {steps}")
         for step in range(steps):
             dataframe = next(iterator)
             dataframe["INTERN_ID_BCOM"] = None
             # dataframe["INTERN_ID_BCOM"] = np.NaN
             logger.debug(dataframe)
             save = save_from_dataframe(dataframe, tablename, intern_conn.engine)
             if save:
@@ -186,7 +186,7 @@ def get_select_from_xcom(**kwargs):
 def get_extract_task_group(db_source_conn, db_intern_conn, chunksize: int, timezone: str,
-                           control_s3: Dict[str, Any]) -> TaskGroup or None:
+                           control_s3: Dict[str, Any], provider: str) -> TaskGroup or None:
     group = None
     try:
         with TaskGroup(group_id="ExtraccionDeDatos", prefix_group_id=False) as group:
@@ -203,7 +203,7 @@ def get_extract_task_group(db_source_conn, db_intern_conn, chunksize: int, timez
            validate_task = PythonOperator(
                task_id="VALIDATE_EXTRACTION",
                python_callable=validate_extractor,
-               op_kwargs={'control_params': control_s3, 'timezone': timezone},
+               op_kwargs={'control_params': control_s3, 'timezone': timezone, 'provider': provider},
                trigger_rule='all_done')

            selects >> tasks >> validate_task
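In extract_from_source the slice now uses command.lower().find("select") instead of command.find("select"), so scripts that write the keyword in upper case are still trimmed correctly. A small illustration with a made-up command string:

# Illustration only; the command text is made up.
command = "TABLENAME|SELECT * FROM clientes"

command.find("select")          # -1: str.find is case-sensitive, and slicing from -1 keeps only the last character
command.lower().find("select")  # 10: index of the keyword regardless of its original case

command = command[command.lower().find("select"):]
print(command)                  # "SELECT * FROM clientes"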
dags/components/Generation.py

@@ -20,7 +20,7 @@ import logging
 logger = logging.getLogger()


-def validate_generate(control_params: Dict[str, Any], timezone: str, **kwargs) -> None:
+def validate_generate(control_params: Dict[str, Any], timezone: str, provider: str, **kwargs) -> None:
     delete_task_instances()
     ti = kwargs["ti"]
     success_tasks = ti.xcom_pull(task_ids="GENERATORS", key="SUCCESS_TASKS")
@@ -46,7 +46,7 @@ def validate_generate(control_params: Dict[str, Any], timezone: str, **kwargs) -
         prefix += "/"
     key = prefix + control_params["filename"]
     conf = json.dumps(conf, indent=2, default=str)
-    loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key)
+    loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key, provider)
     if loaded:
         logger.info(f"Cargado correctamente el archivo de control en {key}")
     delete_all_xcom_tasks()
@@ -78,7 +78,8 @@ def on_success_generator(context) -> None:
     ti.xcom_push(key="SUCCESS_TASKS", value=task_name)


-def generate_and_deploy(command: str, intern_conn, params: Dict[str, Any], timezone: str, chunksize=10000):
+def generate_and_deploy(command: str, intern_conn, params: Dict[str, Any], timezone: str, provider: str,
+                        chunksize=10000):
     engine = intern_conn.engine
     logger.debug(f"COMANDO: {command}")
     tablename = select_multiple(command)["tablename"]
@@ -113,7 +114,7 @@ def generate_and_deploy(command: str, intern_conn, params: Dict[str, Any], timez
     file_key = prefix + tmp_file[tmp_file.rfind("/")+1:]
     # Se sube el archivo al S3
     logger.info(f"Tamaño del archivo a subir: {os.path.getsize(tmp_file)} bytes")
-    save_df_to_s3(tmp_file, conn_id, bucket, file_key, in_memory=False)
+    save_df_to_s3(tmp_file, conn_id, bucket, file_key, provider, in_memory=False)
     # Se borra el archivo al finalizar el upload
     delete_temp_dir(tmp_file)
@@ -144,7 +145,7 @@ def get_generate_from_xcom(**kwargs):
 def get_generate_task_group(db_intern_conn, parameters: Dict[str, Any], control_s3: Dict[str, Any],
-                            timezone: str) -> TaskGroup or None:
+                            timezone: str, provider: str) -> TaskGroup or None:
     group = None
     try:
         with TaskGroup(group_id="GeneracionyDespliegueDeResultados", prefix_group_id=False) as group:
@@ -155,13 +156,14 @@ def get_generate_task_group(db_intern_conn, parameters: Dict[str, Any], control_
                python_callable=generate_and_deploy,
                on_failure_callback=on_failure_generator,
                on_success_callback=on_success_generator,
-               op_kwargs={'intern_conn': db_intern_conn, 'params': parameters, 'timezone': timezone}
+               op_kwargs={'intern_conn': db_intern_conn, 'params': parameters, 'timezone': timezone,
+                          'provider': provider}
            ).expand(op_args=outputs)

            validate_task = PythonOperator(
                task_id="VALIDATE_GENERATOR",
                python_callable=validate_generate,
-               op_kwargs={'control_params': control_s3, 'timezone': timezone},
+               op_kwargs={'control_params': control_s3, 'timezone': timezone, 'provider': provider},
                trigger_rule='none_skipped')

            outputs >> tasks >> validate_task
dags/components/Model/InsumoModel.py

 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, BigInteger
+from sqlalchemy import Column, BIGINT

 Base = declarative_base()
@@ -8,4 +8,4 @@ class InsumoModel(Base):
     __abstract__ = True

-    INTERN_ID_BCOM = Column(BigInteger, primary_key=True, autoincrement=True)
+    INTERN_ID_BCOM = Column(BIGINT, primary_key=True, autoincrement=True)
dags/components/S3Route.py

-import fnmatch
 import datetime
 from typing import Any, Dict, List, Tuple
 import json
 import pytz
 from io import BytesIO, StringIO
 import pandas as pd
@@ -9,8 +8,10 @@ import pandas as pd
 from components.Utils import get_type_file
 from enums.FileTypeEnum import FileTypeEnum
 from enums.ScriptFileTypeEnum import ScriptFileTypeEnum
+from enums.ProviderTypeEnum import ProviderTypeEnum

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
+from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

 import logging
 logger = logging.getLogger()
@@ -90,18 +91,26 @@ def get_base_date(conn: str, bucket: str, key: str) -> datetime.date:
     return last_date


-def save_df_to_s3(data: pd.DataFrame or str, conn: str, bucket: str, key: str, delimiter: str = ",",
+def save_df_to_s3(data: pd.DataFrame or str, conn: str, bucket: str, key: str, provider: str, delimiter: str = ",",
                   in_memory: bool = True):
     try:
         logger.info(f"SUBIENDO A NUBE KEY {key}")
         file_type = get_type_file(key)
-        s3_hook = S3Hook(conn)
+        gcp_cloud = False
+        if provider == ProviderTypeEnum.AMAZON.value or provider == ProviderTypeEnum.MINIO.value:
+            hook = S3Hook(conn)
+        else:
+            hook = GoogleCloudStorageHook(conn)
+            gcp_cloud = True
         if file_type == FileTypeEnum.EXCEL or file_type == FileTypeEnum.OLD_EXCEL:
             if in_memory:
                 with BytesIO() as buffer:
                     with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                         data.to_excel(writer, index=None)
-                    s3_hook.load_bytes(buffer.getvalue(), key, bucket, True)
+                    if gcp_cloud:
+                        hook.upload(bucket, key, data=buffer.getvalue())
+                    else:
+                        hook.load_bytes(buffer.getvalue(), key, bucket, True)
             else:
                 pass
         elif file_type == FileTypeEnum.CSV or file_type == FileTypeEnum.TEXT:
@@ -109,9 +118,15 @@ def save_df_to_s3(data: pd.DataFrame or str, conn: str, bucket: str, key: str, d
                 csv_buffer = BytesIO()
                 data.to_csv(csv_buffer, header=True, index=False, sep=delimiter, na_rep='None')
                 csv_buffer.seek(0)
-                s3_hook.load_bytes(csv_buffer.getvalue(), key, bucket, True)
+                if gcp_cloud:
+                    hook.upload(bucket, key, data=csv_buffer.getvalue())
+                else:
+                    hook.load_bytes(csv_buffer.getvalue(), key, bucket, True)
             else:
-                s3_hook.load_file(data, key, bucket)
+                if gcp_cloud:
+                    hook.upload(bucket, key, data)
+                else:
+                    hook.load_file(data, key, bucket)
     except Exception as e:
         logger.error(f"Error guardando archivos a S3. key: {key}. {e}")
@@ -127,17 +142,28 @@ def move_object_s3(conn: str, bucket: str, source_key: str, output_key: str) ->
         logger.error(f"Error moviendo archivo desde {source_key} hacia {output_key} en bucket {bucket}. {e}")


-def get_files_from_prefix(conn: str, bucket: str, prefix: str) -> List[Tuple[str, str]]:
+def get_files_from_prefix(conn: str, bucket: str, prefix: str, provider: str) -> List[Tuple[str, str]]:
     result = []
     allowed_filetypes = [ScriptFileTypeEnum[item].value for item in ScriptFileTypeEnum._member_names_]
     try:
-        s3_hook = S3Hook(conn)
-        files = s3_hook.list_keys(bucket, prefix)
+        files = []
+        s3_hook, gcp_hook, data = None, None, None
+        if provider == ProviderTypeEnum.AMAZON.value or provider == ProviderTypeEnum.MINIO.value:
+            s3_hook = S3Hook(conn)
+            files = s3_hook.list_keys(bucket, prefix)
+        elif provider == ProviderTypeEnum.GOOGLE.value:
+            gcp_hook = GoogleCloudStorageHook(conn)
+            if not prefix.endswith("/"):
+                prefix += "/"
+            files = gcp_hook.list(bucket, prefix=prefix)
         logger.debug(f"Archivos encontrados en el prefijo {prefix}: {files}")
         for file in files:
             if file.endswith("/") or file[file.rfind(".")+1:].lower() not in allowed_filetypes:
                 continue
-            data = s3_hook.get_key(file, bucket).get()['Body'].read().decode("utf-8")
+            if provider == ProviderTypeEnum.AMAZON.value or provider == ProviderTypeEnum.MINIO.value:
+                data = s3_hook.get_key(file, bucket).get()['Body'].read().decode("utf-8")
+            elif provider == ProviderTypeEnum.GOOGLE.value:
+                data = gcp_hook.download(bucket, file).decode("utf-8")
             if file.find("/") == -1:
                 filename = file
             else:
@@ -150,12 +176,17 @@ def get_files_from_prefix(conn: str, bucket: str, prefix: str) -> List[Tuple[str
     return result


-def get_file_from_key(conn: str, bucket: str, key: str) -> Any:
+def get_file_from_key(conn: str, bucket: str, key: str, provider: str) -> Any:
     result = BytesIO()
     try:
-        s3_hook = S3Hook(conn)
-        data = s3_hook.get_key(key, bucket)
-        data.download_fileobj(result)
+        if provider == ProviderTypeEnum.AMAZON.value or provider == ProviderTypeEnum.MINIO.value:
+            s3_hook = S3Hook(conn)
+            data = s3_hook.get_key(key, bucket)
+            data.download_fileobj(result)
+        elif provider == ProviderTypeEnum.GOOGLE.value:
+            gcp_hook = GoogleCloudStorageHook(conn)
+            result = gcp_hook.download(bucket, key)
+            print("RESULT:", result)
     except Exception as e:
         result = None
         logger.error(f"Error extrayendo archivo {key}. {e}")
@@ -163,11 +194,15 @@ def get_file_from_key(conn: str, bucket: str, key: str) -> Any:
     return result


-def load_obj_to_s3(obj, conn: str, bucket: str, key: str, replace=True) -> bool:
+def load_obj_to_s3(obj, conn: str, bucket: str, key: str, provider: str, replace=True) -> bool:
     load = False
     try:
-        s3_hook = S3Hook(conn)
-        s3_hook.load_bytes(obj, key, bucket, replace)
+        if provider == ProviderTypeEnum.AMAZON.value or provider == ProviderTypeEnum.MINIO.value:
+            s3_hook = S3Hook(conn)
+            s3_hook.load_bytes(obj, key, bucket, replace)
+        elif provider == ProviderTypeEnum.GOOGLE.value:
+            gcp_hook = GoogleCloudStorageHook(conn)
+            gcp_hook.upload(bucket, key, data=obj)
         load = True
     except Exception as e:
         logger.error(f"Error subiendo archivo de control a bucket {bucket} y key {key}. {e}")
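Every helper in S3Route.py now takes the provider string and chooses the hook accordingly: S3Hook for ProviderTypeEnum.AMAZON / MINIO, GoogleCloudStorageHook for GOOGLE. A hedged usage sketch of the new signatures follows; the connection ID, bucket and keys are invented, and "gcp" is only an assumed value for conf["cloud_provider"]:

# Hypothetical call sites; connection ID, bucket and keys are made up.
from components.S3Route import load_obj_to_s3, get_files_from_prefix

provider = "gcp"  # assumed value of conf["cloud_provider"] matching ProviderTypeEnum.GOOGLE

# Upload a control file: routed to gcp_hook.upload(...) when the provider is GOOGLE,
# otherwise to s3_hook.load_bytes(...).
load_obj_to_s3(b'{"status": "ok"}', "bcom_tp_connection", "example-bucket",
               "bcom_results/control_example.json", provider)

# List and read the SQL scripts under a prefix with the same provider switch.
scripts = get_files_from_prefix("bcom_tp_connection", "example-bucket", "bcom_scripts/", provider)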
dags/components/Sensor.py

 from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor
-from airflow.providers.google.cloud.sensors.gcs import GCSObjectExistenceSensor
+from airflow.providers.google.cloud.hooks.gcs import GCSHook
+from airflow.sensors.base import BaseSensorOperator
 from enums.ProviderTypeEnum import ProviderTypeEnum
@@ -17,11 +18,11 @@ def create_sensor(task_id: str, connection: str, bucket: str, key: str, provider
     sensor = None
     try:
         if provider == ProviderTypeEnum.GOOGLE.value:
-            sensor = GCSObjectExistenceSensor(
+            sensor = GCPSensor(
                 task_id=task_id,
+                conn=connection,
                 bucket=bucket,
-                object=key,
-                google_cloud_conn_id=connection,
+                key=key,
                 poke_interval=POKE_INTERVAL,
                 timeout=TIMEOUT
             )
@@ -39,3 +40,22 @@ def create_sensor(task_id: str, connection: str, bucket: str, key: str, provider
     except Exception as e:
         logger.error(f"Error creando Sensor S3. {e}")
     return sensor
+
+
+class GCPSensor(BaseSensorOperator):
+
+    def __init__(self, conn: str, bucket: str, key: str, **kwargs) -> None:
+        self.conn = conn
+        self.bucket = bucket
+        self.key = key
+        super().__init__(**kwargs)
+
+    def poke(self, context):
+        hook = GCSHook(self.conn)
+        end_prefix_index = self.key.rfind("/")
+        if end_prefix_index != -1 and len(self.key[end_prefix_index:]) > 1:
+            self.key = self.key[:end_prefix_index + 1]
+        files = hook.list(self.bucket, prefix=self.key)
+        files = list(map(lambda x: not x.endswith("/"), files))
+        return any([criteria for criteria in files])
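The new GCPSensor lists the GCS prefix derived from the configured key with GCSHook and succeeds as soon as any non-directory object exists under it, which is looser than the exact-key match of the GCSObjectExistenceSensor it replaces. A hedged sketch of how create_sensor might be wired from a DAG; the connection ID, bucket, key and the "gcp" provider value are invented for illustration:

# Hypothetical usage; connection ID, bucket, key and provider value are made up.
from components.Sensor import create_sensor

sensor_scripts = create_sensor(
    task_id="SCRIPTS-SENSOR",
    connection="bcom_gcp_connection",
    bucket="example-airflow-bucket",
    key="bcom_scripts/",   # GCPSensor pokes the whole prefix, not a single object
    provider="gcp",        # assumed value of ProviderTypeEnum.GOOGLE
)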
dags/components/Transformation.py

@@ -17,7 +17,7 @@ import logging
 logger = logging.getLogger()


-def validate_transform(control_params: Dict[str, Any], timezone: str, **kwargs) -> None:
+def validate_transform(control_params: Dict[str, Any], timezone: str, provider: str, **kwargs) -> None:
     delete_task_instances()
     ti = kwargs["ti"]
     success_tasks = ti.xcom_pull(task_ids="TRANSFORMATIONS", key="SUCCESS_TASKS")
@@ -43,7 +43,7 @@ def validate_transform(control_params: Dict[str, Any], timezone: str, **kwargs)
         prefix += "/"
     key = prefix + control_params["filename"]
     conf = json.dumps(conf, indent=2, default=str)
-    loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key)
+    loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key, provider)
     if loaded:
         logger.info(f"Cargado correctamente el archivo de control en {key}")
     delete_all_xcom_tasks()
@@ -82,7 +82,7 @@ def transformations(xcom_commands: str, intern_conn):
     logger.info(f"Ejecutando transformaciones del script {script_name}")
     with engine.connect() as connection:
         for command in commands:
-            logger.info(f"Ejecutando comando de transformación: {command}")
+            logger.debug(f"Ejecutando comando de transformación: {command}")
             _ = connection.execute(command)
@@ -112,7 +112,8 @@ def get_trans_from_xcom(**kwargs):
     return [[item] for item in transforms_per_file]


-def get_transform_task_group(db_intern_conn, timezone: str, control_s3: Dict[str, Any]) -> TaskGroup or None:
+def get_transform_task_group(db_intern_conn, timezone: str, control_s3: Dict[str, Any],
+                             provider: str) -> TaskGroup or None:
     group = None
     try:
         with TaskGroup(group_id="TransformacionDeDatos", prefix_group_id=False) as group:
@@ -129,7 +130,7 @@ def get_transform_task_group(db_intern_conn, timezone: str, control_s3: Dict[str
            validate_task = PythonOperator(
                task_id="VALIDATE_TRANSFORMATION",
                python_callable=validate_transform,
-               op_kwargs={'control_params': control_s3, 'timezone': timezone},
+               op_kwargs={'control_params': control_s3, 'timezone': timezone, 'provider': provider},
                trigger_rule='none_skipped')

            transforms >> tasks >> validate_task
dags/components/Utils.py

@@ -103,8 +103,6 @@ def update_sql_commands(dataset: List[Tuple[str, str]], label_tablename: str) ->
         final_data = []
         table_name = ""
         for item in data:
-            # if item.strip().startswith("--") and label_tablename.strip()+":" not in item:
-            #     continue
             if item.lower().strip() == "end":
                 final_data[-1] = final_data[-1] + "; end;"
             final_item = item
dags/dag_transformacion_bcom.py

@@ -39,10 +39,10 @@ DEFAULT_ARGS = {
 }


-def cleaning(intern_conn, control_s3: Dict[str, Any]) -> TaskGroup:
+def cleaning(intern_conn, control_s3: Dict[str, Any], provider: str) -> TaskGroup:
     groups = None
     try:
-        groups = get_cleaning_task_group(intern_conn, control_s3)
+        groups = get_cleaning_task_group(intern_conn, control_s3, provider)
     except Exception as e:
         logger.error(f"Error general de transformación de datos. {e}")
     finally:
@@ -50,20 +50,20 @@ def cleaning(intern_conn, control_s3: Dict[str, Any]) -> TaskGroup:


 def generate_and_deploy_results(intern_conn, parameters: Dict[str, Any], timezone: str,
-                                control_s3: Dict[str, Any]) -> TaskGroup:
+                                control_s3: Dict[str, Any], provider: str) -> TaskGroup:
     groups = None
     try:
-        groups = get_generate_task_group(intern_conn, parameters, control_s3, timezone)
+        groups = get_generate_task_group(intern_conn, parameters, control_s3, timezone, provider)
     except Exception as e:
         logger.error(f"Error general de creación y despliegue de resultados. {e}")
     finally:
         return groups


-def transformation(intern_conn, timezone: str, control_s3: Dict[str, Any]) -> TaskGroup:
+def transformation(intern_conn, timezone: str, control_s3: Dict[str, Any], provider: str) -> TaskGroup:
     groups = None
     try:
-        groups = get_transform_task_group(intern_conn, timezone, control_s3)
+        groups = get_transform_task_group(intern_conn, timezone, control_s3, provider)
     except Exception as e:
         logger.error(f"Error general de transformación de datos. {e}")
     finally:
@@ -71,10 +71,10 @@ def transformation(intern_conn, timezone: str, control_s3: Dict[str, Any]) -> Ta
 def extraction(source_conn, intern_conn, timezone: str, control_s3: Dict[str, Any],
-               chunksize: int = 100000) -> TaskGroup:
+               provider: str, chunksize: int = 100000) -> TaskGroup:
     groups = None
     try:
-        groups = get_extract_task_group(source_conn, intern_conn, chunksize, timezone, control_s3)
+        groups = get_extract_task_group(source_conn, intern_conn, chunksize, timezone, control_s3, provider)
     except Exception as e:
         logger.error(f"Error general de extracción de datos. {e}")
     finally:
@@ -92,13 +92,13 @@ def save_procedure_json(json_path: str, task) -> None:
         logger.error(f"Error leyendo y guardando archivo descriptor de procedure. {e}")


-def extract_control(conn_id: str, bucket: str, prefix: str, filename: str, task):
+def extract_control(conn_id: str, bucket: str, prefix: str, filename: str, task, provider: str):
     try:
         if not prefix.endswith("/"):
             prefix += "/"
         key = prefix + filename
         logger.info(f"EXTRAYENDO ARCHIVO DE CONTROL DESDE {key}")
-        control = get_file_from_key(conn_id, bucket, key)
+        control = get_file_from_key(conn_id, bucket, key, provider)
         if control:
             str_data = str(control.getvalue(), encoding='UTF-8', errors='ignore')
             data = StringIO(str_data)
@@ -113,14 +113,14 @@ def extract_control(conn_id: str, bucket: str, prefix: str, filename: str, task)
 def extract_scripts(conn_id: str, bucket: str, prefix: str, source_mask: str, transform_mask: str,
-                    order_delimiter: str, procedure_mask: str, label_tablename: str, control_params: Dict[str, Any], **kwargs):
+                    order_delimiter: str, procedure_mask: str, label_tablename: str, control_params: Dict[str, Any],
+                    provider: str, **kwargs):
     try:
         extract_control(control_params["connection_id"], control_params["bucket"], control_params["prefix"],
-                        control_params["filename"], kwargs['ti'])
+                        control_params["filename"], kwargs['ti'], provider)
         save_procedure_json(JSON_PROCEDURE_PATH, kwargs['ti'])
         start_time = time.time()
         logger.info(f"EXTRAYENDO SCRIPTS DESDE {bucket}/{prefix}")
-        scripts = get_files_from_prefix(conn_id, bucket, prefix)
+        scripts = get_files_from_prefix(conn_id, bucket, prefix, provider)
         scripts = update_sql_commands(scripts, label_tablename)
         save_commands_to_xcom(scripts, kwargs['ti'], source_mask, transform_mask, procedure_mask, order_delimiter)
         logger.debug(f"Script cargados en Xcom: {scripts}")
@@ -163,7 +163,8 @@ def set_dag():
        op_kwargs={'conn_id': scripts_s3["connection_id"], 'bucket': scripts_s3["bucket"],
                   'prefix': scripts_s3["prefix"], 'source_mask': extract_mask, 'transform_mask': transform_mask,
                   'procedure_mask': procedure_mask, 'order_delimiter': order_delimiter,
-                  'label_tablename': conf["label_multiple_select"], 'control_params': control_s3},
+                  'label_tablename': conf["label_multiple_select"], 'control_params': control_s3,
+                  'provider': conf["cloud_provider"]},
        trigger_rule="all_success"
    )
@@ -184,17 +185,17 @@ def set_dag():
     # Creación de grupo de tasks para las extracciones
     chunksize = conf["chunksize"]
     timezone = conf["timezone"]
-    extractions = extraction(source_db, intern_db, timezone, control_s3, chunksize)
+    extractions = extraction(source_db, intern_db, timezone, control_s3, conf["cloud_provider"], chunksize)

     # Creación de grupo de tasks para las transformaciones
-    transformations = transformation(intern_db, timezone, control_s3)
+    transformations = transformation(intern_db, timezone, control_s3, conf["cloud_provider"])

     # Creación de grupo de tasks para la generación y despliegue de archivos resultados
     outputs_conf = conf["outputs"]
-    result = generate_and_deploy_results(intern_db, outputs_conf, timezone, control_s3)
+    result = generate_and_deploy_results(intern_db, outputs_conf, timezone, control_s3, conf["cloud_provider"])

     # Creación de tasks de limpiadores
-    cleaners = cleaning(intern_db, control_s3)
+    cleaners = cleaning(intern_db, control_s3, conf["cloud_provider"])

     sensor_scripts >> script_extractor >> extractions >> transformations >> result >> cleaners
     return dag
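set_dag now reads the provider from conf["cloud_provider"] and passes it to every task group, so the DAG configuration needs that key next to the ones it already reads (chunksize, timezone, outputs, label_multiple_select). A hedged sketch of the relevant section as a Python dict; only the key names visible in this diff are certain, all values are examples:

# Hypothetical shape of the "general" section loaded from app_conf.yml.
# Keys are the ones referenced in this commit; values are assumed examples.
conf = {
    "cloud_provider": "gcp",         # new key: expected to match a ProviderTypeEnum value
    "chunksize": 100000,             # example value
    "timezone": "America/Lima",      # example value
    "label_multiple_select": "...",  # elided, project-specific
    "outputs": {},                   # project-specific output parameters
}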
dags/dag_transformacion_tacomventas_promoresidencial.py

@@ -234,7 +234,7 @@ def set_dag_1():
     from yaml.loader import SafeLoader

     # Cambiar conf_path dependiendo del ambiente, en prod usando k8 y contenedores usar /opt/airflow/dags/app_conf.yml
     # En desarrollo, cualquiera que apunte a su carpeta dags
-    conf_path = "/opt/airflow/dags/app_conf.yml"
+    conf_path = "/root/airflow/dags/app_conf.yml"
     with open(conf_path) as f:
         data = yaml.load(f, Loader=SafeLoader)
     general_cnf = data["general"]