Project: general / bcom-tp-etl-transformation-pipelines

Commit 3176aa51, authored Jul 30, 2023 by Cristian Aguirre
Parent: c22cefd9

Update 30-07-23. Add new 2 DAGS: INFORM_PROCESS and RESET_PROCESS

Showing 22 changed files with 616 additions and 84 deletions (+616 / -84)
Changed files:

dags/components/Cleaning.py                                    +10   -6
dags/components/Control.py                                     +38   -6
dags/components/DatabaseOperation/DatabaseExtraction.py        +1    -1
dags/components/DatabaseOperation/DatabaseTransformation.py    +3    -3
dags/components/Databases/Database.py                          +9    -0
dags/components/Databases/Mysql.py                             +13   -1
dags/components/Databases/Oracle.py                            +11   -0
dags/components/Databases/Postgres.py                          +11   -0
dags/components/Extractor.py                                   +17   -12
dags/components/Generation.py                                  +12   -5
dags/components/S3Route.py                                     +71   -11
dags/components/Timezone.py                                    +1    -1
dags/components/Transformation.py                              +10   -5
dags/components/Utils.py                                       +19   -1
dags/components/Xcom.py                                        +3    -2
dags/dag_inform_process.py                                     +220  -0
dags/dag_reset_process.py                                      +133  -0
dags/dag_transformacion_bcom.py                                +19   -17
dags/dag_transformacion_tacomventas_promoresidencial.py        +1    -1
dags/enums/DataTypeOrmEnum.py                                  +2    -2
dags/enums/ProcessStatusEnum.py                                +1    -0
dags/procedure_definition2.json                                +11   -10
dags/components/Cleaning.py

@@ -8,7 +8,7 @@ from airflow.decorators import task
 from components.Utils import select_multiple
 from components.Xcom import delete_all_xcom_tasks, delete_task_instances
 from components.DatabaseOperation.DatabaseTransformation import delete_table
-from components.S3Route import load_obj_to_s3
+from components.S3Route import load_control_to_s3
 from enums.OperationTypeEnum import OperationTypeEnum
 import logging

@@ -16,7 +16,7 @@ import logging
 logger = logging.getLogger()


-def validate_clean(control_params: Dict[str, Any], provider: str, **kwargs) -> None:
+def validate_clean(control_params: Dict[str, Any], provider: str, timezone: str, **kwargs) -> None:
     delete_task_instances()
     ti = kwargs["ti"]
     conf = ti.xcom_pull(task_ids="VALIDATE_GENERATOR", key="CONTROL-CONFIG")

@@ -27,7 +27,7 @@ def validate_clean(control_params: Dict[str, Any], provider: str, **kwargs) -> N
         prefix += "/"
     key = prefix + control_params["filename"]
     conf = json.dumps(conf, indent=2, default=str)
-    loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key, provider)
+    loaded = load_control_to_s3(bytes(conf.encode()), conn, bucket, key, provider, timezone)
     if loaded:
         logger.info(f"Cargado correctamente el archivo de control en {key}")
     delete_all_xcom_tasks()

@@ -58,10 +58,14 @@ def get_cleaners_from_xcom(**kwargs):
         final_selects.append(select)
     logger.info(f"Final selects: {final_selects}")
     Variable.set(key='CLEANS', value=final_selects, serialize_json=True)
-    return [[item] for item in final_selects]
+    if len(final_selects) > 0:
+        return [[item] for item in final_selects]
+    else:
+        return [[None]]


-def get_cleaning_task_group(db_intern_conn, control_s3: Dict[str, Any], provider: str) -> TaskGroup or None:
+def get_cleaning_task_group(db_intern_conn, control_s3: Dict[str, Any], provider: str, timezone: str) -> TaskGroup or None:
     group = None
     try:
         with TaskGroup(group_id="LimpiezaDelProceso", prefix_group_id=False) as group:

@@ -76,7 +80,7 @@ def get_cleaning_task_group(db_intern_conn, control_s3: Dict[str, Any], provider
             validate_task = PythonOperator(
                 task_id="VALIDATE_CLEANER",
                 python_callable=validate_clean,
-                op_kwargs={'control_params': control_s3, 'provider': provider},
+                op_kwargs={'control_params': control_s3, 'provider': provider, 'timezone': timezone},
                 trigger_rule='none_skipped'
             )
             cleaners >> tasks >> validate_task
dags/components/Control.py

 from components.Timezone import datetime_by_tzone
 from enums.ProcessStatusEnum import ProcessStatusEnum
 import logging
-from typing import Dict, Any, List, Tuple
+import json
+from io import StringIO
+from typing import Dict, Any, List
+from components.S3Route import get_file_from_prefix

 logger = logging.getLogger()


 def get_tasks_from_control(conf: List[Dict[str, Any]], type_task: str) -> Dict[str, Any]:
-    response = {'status': ProcessStatusEnum.SUCCESS.value, 'tasks': []}
+    response = {'status': ProcessStatusEnum.SUCCESS.value, 'tasks': [], 'reset': False}
     try:
         conf = conf[-1]
         logger.info(f"Último proceso ejecutado: {conf}")
         status = conf["status"]
+        if "reset_by_user" in conf.keys():
+            response["reset"] = True
         if status == ProcessStatusEnum.FAIL.value:
             response["status"] = ProcessStatusEnum.FAIL.value
             success_tasks = conf["tasks"]

@@ -33,18 +38,45 @@ def get_tasks_from_control(conf: List[Dict[str, Any]], type_task: str) -> Dict[s
 def update_new_process(conf: List[Dict[str, Any]], status: str, tasks: Dict[str, Any],
-                       timezone: str, delete_last: bool = False) -> List[Dict[str, Any]]:
+                       timezone: str, task, delete_last: bool = False, frequency: str = "montly") -> List[Dict[str, Any]]:
     try:
+        print("PREV:", conf)
+        format_date = "%Y-%m" if frequency == "montly" else "%Y-%W"
+        current_period = str(datetime_by_tzone(timezone, format_date))[:7]
+        processed_period = task.xcom_pull(task_ids="SCRIPTS-EXTRACTOR", key="PROCESSED_PERIOD")
         if delete_last:
+            print("CONF-TRANS:", conf)
             last_tasks = conf[-1]["tasks"]
             tasks.update(last_tasks)
+            print("LAST:", tasks)
             conf.pop(-1)
         current_datetime = str(datetime_by_tzone(timezone, '%Y-%m-%d %H:%M:%S'))
         new_process = {"date": current_datetime, "status": status, "tasks": tasks}
-        conf.append(new_process)
+        if current_period == processed_period:
+            conf.append(new_process)
+        else:
+            conf = [new_process]
+        print("LAS:", conf)
     except Exception as e:
         logger.error(f"Error actualizando archivo de control. {e}")
     finally:
         return conf
+
+
+def extract_last_control(conn_id: str, bucket: str, prefix: str, provider: str, timezone: str, **kwargs):
+    try:
+        task = kwargs['ti']
+        if not prefix.endswith("/"):
+            prefix += "/"
+        logger.info(f"BUSCANDO Y EXTRAYENDO ARCHIVO DE CONTROL DESDE {prefix}")
+        control, control_key = get_file_from_prefix(conn_id, bucket, prefix, provider, timezone, task)
+        if control:
+            str_data = str(control.getvalue(), encoding='UTF-8', errors='ignore')
+            data = StringIO(str_data)
+            if not data:
+                raise AssertionError("Archivo de control no tiene un formato correcto")
+            else:
+                data = json.load(data)
+                task.xcom_push(key="CONTROL-CONFIG", value=[control_key, data])
+        else:
+            raise AssertionError("Archivo de control no encontrado")
+    except Exception as e:
+        logger.error(f"Error general de descarga de archivo de control. {e}")
\ No newline at end of file
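The control file these helpers read and write is a JSON list of process entries: each entry records the run date, an overall status, a per-task result map, and, after a RESET_PROCESS run, a "reset_by_user" flag. The following is only a sketch of that structure reconstructed from the fields the diff dereferences; the concrete task names, status strings and messages are illustrative assumptions, not values taken from the repository.

# Illustrative shape of the control file consumed by get_tasks_from_control /
# update_new_process. Task names, statuses and messages are assumed examples.
control_example = [
    {
        "date": "2023-07-30 10:15:00",
        "status": "success",                    # a ProcessStatusEnum value (assumed spelling)
        "tasks": {
            "EXTRACTORS_0": {
                "description": "SOME_TABLE",    # table, script or output handled by the task
                "status": "success",
                "message": ""
            }
        }
    },
    {
        "date": "2023-07-30 18:40:00",
        "status": "fail",
        "reset_by_user": True,                  # appended by the new RESET_PROCESS DAG
        "tasks": {}
    }
]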
dags/components/DatabaseOperation/DatabaseExtraction.py

@@ -8,7 +8,7 @@ def get_steps(sql_command: str, chunksize: int, connection, is_tablename: bool =
     final_steps = 0
     try:
         if is_tablename:
-            count_command = f'SELECT COUNT(*) FROM "{sql_command}"'
+            count_command = f'SELECT COUNT(*) FROM {sql_command}'
         else:
             count_command = f"SELECT COUNT(*) FROM ({sql_command}) BCOM"
         with connection.connect() as conn:
dags/components/DatabaseOperation/DatabaseTransformation.py

@@ -18,13 +18,13 @@ def execute_transformations(commands: List[str], engine):
 def delete_table(tablename: str, engine) -> bool:
     delete = False
     try:
-        command = f'DROP TABLE "{tablename}"'
+        command = f'DROP TABLE {tablename}'
         start_time = time.time()
         with engine.connect() as conn:
             try:
                 _ = conn.execute(command)
-            except Exception:
-                logger.error(f"Tabla no encontrada")
+            except Exception as e:
+                logger.error(f"Tabla no encontrada. {e}")
         delete = True
         logger.debug(f"Duración de borrado: {time.time() - start_time}")
     except Exception as e:
dags/components/Databases/Database.py

@@ -84,3 +84,12 @@ class Database:
             logger.error(f"Error generando comando sql para procedure. Comando: {command}. {e}")
         finally:
             return response
+
+    def get_all_tablenames(self) -> List[str]:
+        tablenames = []
+        try:
+            tablenames = self.factory.get_all_tablenames()
+        except Exception as e:
+            logger.error(f"Error obteniendo los nombres de tablas. {e}")
+        finally:
+            return tablenames
dags/components/Databases/Mysql.py

 from typing import List, Tuple
 from sqlalchemy import create_engine
+from sqlalchemy.orm import Session
 from enums.DatabaseDialectEnum import DatabaseDialectEnum
 from components.Model.InsumoModel import InsumoModel
 from components.Databases.Enums.MysqlDataTypeEnum import MysqlDataTypeEnum
 from components.Databases.Enums.MysqlDataTypeORMEnum import MysqlDataTypeORMEnum
-from sqlalchemy import Table, Column, MetaData
+from sqlalchemy import Column
 import logging

 logger = logging.getLogger()

@@ -69,3 +70,14 @@ class Mysql:
             logger.error(f"Error generando comando sql para procedure Mysql. Comando: {command}. {e}")
         finally:
             return response
+
+    def get_all_tablenames(self) -> List[str]:
+        tablenames = []
+        try:
+            command = f"SELECT table_name FROM information_schema.tables WHERE table_schema='{self.database}'"
+            with self.engine.connect() as conn:
+                tablenames = conn.execute(command).all()
+        except Exception as e:
+            logger.error(f"Error obteniendo los nombres de tablas. {e}")
+        finally:
+            return tablenames
dags/components/Databases/Oracle.py

@@ -92,3 +92,14 @@ class Oracle:
             logger.error(f"Error generando comando sql para procedure Oracle. Comando: {command}. {e}")
         finally:
             return response
+
+    def get_all_tablenames(self) -> List[str]:
+        tablenames = []
+        try:
+            command = f"SELECT table_name FROM all_tables WHERE OWNER='{self.user}'"
+            with self.engine.connect() as conn:
+                tablenames = conn.execute(command).all()
+        except Exception as e:
+            logger.error(f"Error obteniendo los nombres de tablas. {e}")
+        finally:
+            return tablenames
dags/components/Databases/Postgres.py

@@ -70,3 +70,14 @@ class Postgres:
             logger.error(f"Error generando comando sql para procedure Postgres. Comando: {command}. {e}")
         finally:
             return response
+
+    def get_all_tablenames(self) -> List[str]:
+        tablenames = []
+        try:
+            command = f"SELECT pg_tables.tablename FROM pg_catalog.pg_tables WHERE schemaname='{self.schema}'"
+            with self.engine.connect() as conn:
+                tablenames = conn.execute(command).all()
+        except Exception as e:
+            logger.error(f"Error obteniendo los nombres de tablas. {e}")
+        finally:
+            return tablenames
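Each dialect class now exposes get_all_tablenames() with the catalog query appropriate to that engine (information_schema for MySQL, all_tables for Oracle, pg_catalog.pg_tables for Postgres), and Database.get_all_tablenames() delegates to whichever dialect object it wraps. Since conn.execute(command).all() returns SQLAlchemy Row objects rather than bare strings, callers index into each row. A minimal consumption sketch, assuming intern_db is a Database instance whose engine has already been created:

# Sketch: drop every table the internal database reports (mirrors the new RESET_PROCESS logic).
for row in intern_db.get_all_tablenames():
    tablename = row[0]                      # Row tuple -> first column holds the table name
    delete_table(tablename, intern_db.engine)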
dags/components/Extractor.py

@@ -9,7 +9,7 @@ from components.DatabaseOperation.DatabaseExtraction import get_iterator, get_st
 from components.DatabaseOperation.DatabaseLoad import save_from_dataframe
 from components.DatabaseOperation.DatabaseTransformation import delete_table
 from components.Control import get_tasks_from_control, update_new_process
-from components.S3Route import load_obj_to_s3
+from components.S3Route import load_control_to_s3
 from components.Xcom import delete_all_xcom_tasks, delete_task_instances
 from enums.OperationTypeEnum import OperationTypeEnum

@@ -40,7 +40,7 @@ def validate_extractor(control_params: Dict[str, Any], timezone: str, provider:
     for failed_task in failed_tasks:
         task = ti.xcom_pull(task_ids="EXTRACTORS", key=failed_task)[0]
         final_dict.update({failed_task: task})
-    conf = update_new_process(conf, status, final_dict, timezone)
+    conf = update_new_process(conf, status, final_dict, timezone, ti)
     if status == ProcessStatusEnum.FAIL.value:
         conn = control_params["connection_id"]
         bucket = control_params["bucket"]

@@ -49,7 +49,7 @@ def validate_extractor(control_params: Dict[str, Any], timezone: str, provider:
             prefix += "/"
         key = prefix + control_params["filename"]
         conf = json.dumps(conf, indent=2, default=str)
-        loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key, provider)
+        loaded = load_control_to_s3(bytes(conf.encode()), conn, bucket, key, provider, timezone)
         if loaded:
             logger.info(f"Cargado correctamente el archivo de control en {key}")
     delete_all_xcom_tasks()

@@ -83,7 +83,9 @@ def on_success_extractor(context) -> None:
     ti.xcom_push(key="SUCCESS_TASKS", value=task_name)


-def extract_from_source(command: str, source_conn, intern_conn, chunksize: int, **kwargs):
+def extract_from_source(command, source_conn, intern_conn, chunksize: int, **kwargs):
+    if isinstance(command, type(None)):
+        raise AirflowSkipException
     task = kwargs['ti']
     extract_type = command[0].split("|")[0]
     command = command[1]

@@ -178,11 +180,14 @@ def get_select_from_xcom(**kwargs):
         logger.info(f"Trayendo comandos {xcom_selects}")
         for select in xcom_selects:
             tablename = select_multiple(select)["tablename"]
-            if tasks["status"] == ProcessStatusEnum.SUCCESS.value or tablename not in success_tasks:
+            if tasks["reset"] or tasks["status"] == ProcessStatusEnum.SUCCESS.value or tablename not in success_tasks:
                 final_selects.append((key, select,))
     logger.info(f"Comandos para la extracción: {final_selects}")
     Variable.set(key='SELECTS', value=final_selects, serialize_json=True)
-    return [[item] for item in final_selects]
+    if len(final_selects) > 0:
+        return [[item] for item in final_selects]
+    else:
+        return [[None]]


 def get_extract_task_group(db_source_conn, db_intern_conn, chunksize: int, timezone: str,

@@ -192,6 +197,12 @@ def get_extract_task_group(db_source_conn, db_intern_conn, chunksize: int, timez
         with TaskGroup(group_id="ExtraccionDeDatos", prefix_group_id=False) as group:
             selects = get_select_from_xcom()
+            validate_task = PythonOperator(
+                task_id="VALIDATE_EXTRACTION",
+                python_callable=validate_extractor,
+                op_kwargs={'control_params': control_s3, 'timezone': timezone, 'provider': provider},
+                trigger_rule='all_done'
+            )
             tasks = PythonOperator.partial(
                 task_id="EXTRACTORS",
                 python_callable=extract_from_source,

@@ -200,12 +211,6 @@ def get_extract_task_group(db_source_conn, db_intern_conn, chunksize: int, timez
                 on_success_callback=on_success_extractor
             ).expand(op_args=selects)
-            validate_task = PythonOperator(
-                task_id="VALIDATE_EXTRACTION",
-                python_callable=validate_extractor,
-                op_kwargs={'control_params': control_s3, 'timezone': timezone, 'provider': provider},
-                trigger_rule='all_done'
-            )
             selects >> tasks >> validate_task
     except Exception as e:
         logger.error(f"Error creando taskGroup de extracción. {e}")
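A pattern repeated across Extractor.py, Generation.py, Transformation.py and Cleaning.py in this commit: the get_*_from_xcom() helpers now return [[None]] instead of an empty list when nothing is pending, and the mapped callables raise AirflowSkipException when handed that None, presumably because expanding a mapped task over an empty list is not handled cleanly on the Airflow version in use. A minimal standalone sketch of the pattern; the names here are illustrative, not the project's:

from airflow.exceptions import AirflowSkipException
from airflow.operators.python import PythonOperator  # used via .partial().expand() in a DAG

def work(item, **kwargs):
    if item is None:
        # Nothing to process this run: skip this mapped instance instead of failing it.
        raise AirflowSkipException
    print(f"processing {item}")

def pending_items():
    items = []  # e.g. empty after filtering out already-successful tasks
    return [[i] for i in items] if len(items) > 0 else [[None]]

# Inside a DAG definition (sketch):
# tasks = PythonOperator.partial(task_id="WORKERS", python_callable=work).expand(op_args=pending_items())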
dags/components/Generation.py

@@ -8,7 +8,7 @@ from airflow.decorators import task
 from airflow.exceptions import AirflowSkipException
 from enums.ProcessStatusEnum import ProcessStatusEnum
-from components.S3Route import save_df_to_s3, load_obj_to_s3
+from components.S3Route import save_df_to_s3, load_control_to_s3
 from components.Utils import select_multiple, create_temp_file, delete_temp_dir
 from components.Control import get_tasks_from_control, update_new_process
 from components.Xcom import delete_all_xcom_tasks, delete_task_instances

@@ -37,7 +37,7 @@ def validate_generate(control_params: Dict[str, Any], timezone: str, provider: s
     for failed_task in failed_tasks:
         task = ti.xcom_pull(task_ids="GENERATORS", key=failed_task)[0]
         final_dict.update({failed_task: task})
-    conf = update_new_process(conf, status, final_dict, timezone, True)
+    conf = update_new_process(conf, status, final_dict, timezone, ti, True)
     if status == ProcessStatusEnum.FAIL.value:
         conn = control_params["connection_id"]
         bucket = control_params["bucket"]

@@ -46,7 +46,7 @@ def validate_generate(control_params: Dict[str, Any], timezone: str, provider: s
             prefix += "/"
         key = prefix + control_params["filename"]
         conf = json.dumps(conf, indent=2, default=str)
-        loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key, provider)
+        loaded = load_control_to_s3(bytes(conf.encode()), conn, bucket, key, provider, timezone)
         if loaded:
             logger.info(f"Cargado correctamente el archivo de control en {key}")
     delete_all_xcom_tasks()

@@ -60,6 +60,7 @@ def on_failure_generator(context) -> None:
     task_name = f"{ti.task_id}_{ti.map_index}"
     selects = Variable.get('GENERATES', default_var=[], deserialize_json=True)
     table = selects[ti.map_index]
+    table = select_multiple(table)["tablename"]
     exception = str(context["exception"])
     status = ProcessStatusEnum.FAIL.value
     task_result = {"description": table, "status": status, "message": exception}

@@ -72,6 +73,7 @@ def on_success_generator(context) -> None:
     task_name = f"{ti.task_id}_{ti.map_index}"
     selects = Variable.get('GENERATES', default_var=[], deserialize_json=True)
     table = selects[ti.map_index]
+    table = select_multiple(table)["tablename"]
     status = ProcessStatusEnum.SUCCESS.value
     task_result = {"description": table, "status": status, "message": ""}
     ti.xcom_push(key=task_name, value=task_result)

@@ -80,6 +82,8 @@ def on_success_generator(context) -> None:
 def generate_and_deploy(command: str, intern_conn, params: Dict[str, Any], timezone: str,
                         provider: str, chunksize=10000):
+    if isinstance(command, type(None)):
+        raise AirflowSkipException
     engine = intern_conn.engine
     logger.debug(f"COMANDO: {command}")
     tablename = select_multiple(command)["tablename"]

@@ -137,11 +141,14 @@ def get_generate_from_xcom(**kwargs):
         xcom_outputs = task.xcom_pull(task_ids="SCRIPTS-EXTRACTOR", key=key)
         logger.info(f"Trayendo tablas {xcom_outputs}")
         for select in xcom_outputs:
-            if tasks["status"] == ProcessStatusEnum.SUCCESS.value or select not in success_tasks:
+            if tasks["reset"] or tasks["status"] == ProcessStatusEnum.SUCCESS.value or select not in success_tasks:
                 final_outputs.append(select)
     logger.info(f"Final outputs: {final_outputs}")
     Variable.set(key='GENERATES', value=final_outputs, serialize_json=True)
-    return [[item] for item in final_outputs]
+    if len(final_outputs) > 0:
+        return [[item] for item in final_outputs]
+    else:
+        return [[None]]


 def get_generate_task_group(db_intern_conn, parameters: Dict[str, Any], control_s3: Dict[str, Any],
dags/components/S3Route.py

@@ -9,6 +9,7 @@ from components.Utils import get_type_file
 from enums.FileTypeEnum import FileTypeEnum
 from enums.ScriptFileTypeEnum import ScriptFileTypeEnum
 from enums.ProviderTypeEnum import ProviderTypeEnum
+from components.Timezone import datetime_by_tzone
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
 from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

@@ -176,35 +177,94 @@ def get_files_from_prefix(conn: str, bucket: str, prefix: str, provider: str) ->
     return result


-def get_file_from_key(conn: str, bucket: str, key: str, provider: str) -> Any:
-    result = BytesIO()
+def get_file_from_prefix(conn: str, bucket: str, key: str, provider: str, timezone: str, task,
+                         frequency: str = "montly") -> Any:
+    result, key_result = BytesIO(), ''
     try:
+        format_date = "%Y-%m" if frequency == "montly" else "%Y-%W"
+        period = str(datetime_by_tzone(timezone, format_date))[:7]
+        logger.info(f"Periodo actual: {period}.")
+        files, file = [], key
+        cloud_gcp = False
         if provider == ProviderTypeEnum.AMAZON.value or provider == ProviderTypeEnum.MINIO.value:
             s3_hook = S3Hook(conn)
-            data = s3_hook.get_key(key, bucket)
-            data.download_fileobj(result)
+            files = s3_hook.list_keys(bucket, key)
         elif provider == ProviderTypeEnum.GOOGLE.value:
+            cloud_gcp = True
             gcp_hook = GoogleCloudStorageHook(conn)
-            result = gcp_hook.download(bucket, key)
-            print("RESULT:", result)
+            if not key.endswith("/"):
+                key += "/"
+            files = gcp_hook.list(bucket, prefix=key)
+        files_with_period = []
+        for file in files:
+            if file.endswith("/"):
+                continue
+            file_period = file[file.rfind("_") + 1: file.rfind(".")]
+            files_with_period.append((file, file_period))
+        files_with_period.sort(key=lambda x: x[1])
+        if len(files_with_period) > 0:
+            file = files_with_period[-1][0]
+            task.xcom_push(key="PROCESSED_PERIOD", value=files_with_period[-1][1])
+        logger.info(f"Descargando archivo de control: {file}")
+        if file != key:
+            if cloud_gcp:
+                result = gcp_hook.download(bucket, file)
+                result = BytesIO(result)
+            else:
+                data = s3_hook.get_key(file, bucket)
+                data.download_fileobj(result)
+            key_result = file
     except Exception as e:
         result = None
         logger.error(f"Error extrayendo archivo {key}. {e}")
     finally:
-        return result
+        return result, key_result
+
+
+def load_control_to_s3(obj, conn: str, bucket: str, key: str, provider: str, timezone: str,
+                       frequency: str = "montly") -> bool:
+    load = False
+    try:
+        format_date = "%Y-%m" if frequency == "montly" else "%Y-%W"
+        period = str(datetime_by_tzone(timezone, format_date))[:7]
+        key = key.replace("<period>", period)
+        load = upload_file(obj, conn, bucket, key, provider)
+    except Exception as e:
+        logger.error(f"Error subiendo archivo de control a bucket {bucket} y key {key}. {e}")
+    finally:
+        return load


-def load_obj_to_s3(obj, conn: str, bucket: str, key: str, provider: str, replace=True) -> bool:
+def load_report_to_s3(conn: str, bucket: str, key: str, filename: str, provider: str, timezone: str,
+                      pattern: str) -> bool:
     load = False
     try:
+        current_datetime = str(datetime_by_tzone(timezone, pattern))
+        key = key.replace("<datetime>", current_datetime)
         if provider == ProviderTypeEnum.AMAZON.value or provider == ProviderTypeEnum.MINIO.value:
             s3_hook = S3Hook(conn)
-            s3_hook.load_bytes(obj, key, bucket, replace)
+            s3_hook.load_file(filename, key, bucket, True)
         elif provider == ProviderTypeEnum.GOOGLE.value:
             gcp_hook = GoogleCloudStorageHook(conn)
-            gcp_hook.upload(bucket, key, data=obj)
+            gcp_hook.upload(bucket, key, filename)
         load = True
     except Exception as e:
-        logger.error(f"Error subiendo archivo de control a bucket {bucket} y key {key}. {e}")
+        logger.error(f"Error subiendo reporte a bucket {bucket} y key {key}. {e}")
     finally:
         return load
+
+
+def upload_file(obj, connection: str, bucket: str, key: str, provider: str) -> bool:
+    upload = False
+    try:
+        if provider == ProviderTypeEnum.AMAZON.value or provider == ProviderTypeEnum.MINIO.value:
+            s3_hook = S3Hook(connection)
+            s3_hook.load_bytes(obj, key, bucket, True)
+        elif provider == ProviderTypeEnum.GOOGLE.value:
+            gcp_hook = GoogleCloudStorageHook(connection)
+            gcp_hook.upload(bucket, key, data=obj)
+        upload = True
+    except Exception as e:
+        logger.error(f"Error subiendo archivo a {provider}. bucket: {bucket}, key: {key}. {e}")
+    finally:
+        return upload
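The new load_control_to_s3() wrapper resolves a <period> placeholder in the destination key before delegating to upload_file(), using the current month (or ISO week when frequency is not "montly") rendered in the configured timezone. A small sketch of that key substitution; the key template and timezone shown are assumptions, since the real values come from the DAG configuration file:

# Assumed example values; in the DAGs these come from dag_conf.yml.
key_template = "bcom/control/control_<period>.json"
period = str(datetime_by_tzone("GMT-5", "%Y-%m"))[:7]     # e.g. "2023-07"
key = key_template.replace("<period>", period)
# -> "bcom/control/control_2023-07.json", then upload_file(obj, conn, bucket, key, provider)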
dags/components/Timezone.py

@@ -7,7 +7,7 @@ import logging
 logger = logging.getLogger()


-def datetime_by_tzone(tzone: str, pattern: str):
+def datetime_by_tzone(tzone: str, pattern: str = "%Y-%m-%d"):
     offset = None
     # Algunos casos donde el timezone es de la forma 4:30 y no se encuentra en timezones de pytz (GMT)
     if ":" in tzone:
dags/components/Transformation.py

@@ -7,7 +7,7 @@ from airflow.decorators import task
 from airflow.exceptions import AirflowSkipException
 from components.Control import get_tasks_from_control, update_new_process
-from components.S3Route import load_obj_to_s3
+from components.S3Route import load_control_to_s3
 from components.Xcom import delete_all_xcom_tasks, delete_task_instances
 from enums.ProcessStatusEnum import ProcessStatusEnum
 from enums.OperationTypeEnum import OperationTypeEnum

@@ -34,7 +34,7 @@ def validate_transform(control_params: Dict[str, Any], timezone: str, provider:
     for failed_task in failed_tasks:
         task = ti.xcom_pull(task_ids="TRANSFORMATIONS", key=failed_task)[0]
         final_dict.update({failed_task: task})
-    conf = update_new_process(conf, status, final_dict, timezone, True)
+    conf = update_new_process(conf, status, final_dict, timezone, ti, True)
     if status == ProcessStatusEnum.FAIL.value:
         conn = control_params["connection_id"]
         bucket = control_params["bucket"]

@@ -43,7 +43,7 @@ def validate_transform(control_params: Dict[str, Any], timezone: str, provider:
             prefix += "/"
         key = prefix + control_params["filename"]
         conf = json.dumps(conf, indent=2, default=str)
-        loaded = load_obj_to_s3(bytes(conf.encode()), conn, bucket, key, provider)
+        loaded = load_control_to_s3(bytes(conf.encode()), conn, bucket, key, provider, timezone)
         if loaded:
             logger.info(f"Cargado correctamente el archivo de control en {key}")
     delete_all_xcom_tasks()

@@ -76,6 +76,8 @@ def on_success_transform(context) -> None:
 def transformations(xcom_commands: str, intern_conn):
+    if isinstance(xcom_commands, type(None)):
+        raise AirflowSkipException
     engine = intern_conn.engine
     script_name = xcom_commands[0]
     commands = xcom_commands[1]

@@ -103,13 +105,16 @@ def get_trans_from_xcom(**kwargs):
             continue
         xcom_transforms = task.xcom_pull(task_ids="SCRIPTS-EXTRACTOR", key=key)
         logger.info(f"Trayendo comandos {xcom_transforms}")
-        if tasks["status"] == ProcessStatusEnum.SUCCESS.value or key not in success_tasks:
+        if tasks["reset"] or tasks["status"] == ProcessStatusEnum.SUCCESS.value or key not in success_tasks:
             for transform in xcom_transforms:
                 final_transforms.append(transform)
             transforms_per_file.append((key, final_transforms))
     logger.info(f"Scripts para la transformación: {transforms_per_file}")
     Variable.set(key='TRANSFORMS', value=transforms_per_file, serialize_json=True)
-    return [[item] for item in transforms_per_file]
+    if len(transforms_per_file) > 0:
+        return [[item] for item in transforms_per_file]
+    else:
+        return [[None]]


 def get_transform_task_group(db_intern_conn, timezone: str, control_s3: Dict[str, Any],
dags/components/Utils.py

@@ -173,7 +173,7 @@ def create_temp_file(tmp_path: str, filename_mask: str, file_type: str, tablenam
     if not tmp_path.endswith("/"):
         tmp_path += "/"
     path_dir = tmp_path + dir_name
-    os.mkdir(path_dir)
+    os.makedirs(path_dir)
     current_datetime = str(datetime_by_tzone(timezone, pattern))
     filename_mask = filename_mask.replace("<source_name>", tablename)
     filename_mask = filename_mask.replace("<datetime>", current_datetime)

@@ -229,3 +229,21 @@ def generateModel(tablename: str, attributes: List[Dict[str, Any]], indexes: Lis
         logger.error(f"Error creando modelo dinámico. {e}")
     finally:
         return model
+
+
+def delete_temp_dirs(tmp_dir: str) -> bool:
+    delete = False
+    try:
+        dirs = list(os.listdir(tmp_dir))
+        for directory in dirs:
+            full_path = tmp_dir + "/" + directory
+            if os.path.isdir(full_path):
+                _ = delete_temp_dir(full_path)
+                logger.debug(f"Se borró el directorio {directory}")
+        if not os.path.exists(tmp_dir):
+            os.makedirs(tmp_dir)
+        delete = True
+    except Exception as e:
+        logger.error(f"Error borrando archivos temporales en {tmp_dir}. {e}")
+    finally:
+        return delete
dags/components/Xcom.py

@@ -55,13 +55,14 @@ def delete_all_xcom_tasks() -> None:
     try:
         @provide_session
         def cleanup_xcom(session=None):
-            session.query(XCom).filter(or_(XCom.task_id == "SCRIPTS-EXTRACTORS", XCom.task_id == "EXTRACTORS",
+            session.query(XCom).filter(or_(XCom.task_id == "SCRIPTS-EXTRACTOR", XCom.task_id == "EXTRACTORS",
                                            XCom.task_id == "GENERATORS", XCom.task_id == "TRANSFORMATIONS",
                                            XCom.task_id == "VALIDATE_EXTRACTION", XCom.task_id == "VALIDATE_GENERATOR",
-                                           XCom.task_id == "VALIDATE_TRANSFORMATION")).delete()
+                                           XCom.task_id == "VALIDATE_TRANSFORMATION", XCom.task_id == "CONTROL-EXTRACTOR")).delete()
             session.query(Variable).filter(or_(Variable.key == "SELECTS", Variable.key == "TRANSFORMS",
                                                Variable.key == "GENERATES", Variable.key == "CLEANS")).delete()
             delete_task_instances()
+            session.query(XCom).filter(XCom.dag_id == "BCOM_DAG_TRANSFORMACIONES4").delete()
         cleanup_xcom()
     except Exception as e:
         logger.error(f"Error borrando todas las variables xcom del DAG actual. {e}")
dags/dag_inform_process.py (new file, 0 → 100644)

from datetime import datetime
from typing import Any, Dict
from airflow import DAG
import pandas as pd
import os
import uuid
from airflow.operators.python import PythonOperator
from components.Control import extract_last_control
from components.S3Route import load_report_to_s3
from enums.ProcessStatusEnum import ProcessStatusEnum
from components.Utils import delete_temp_dir

import logging

logger = logging.getLogger()

DAG_NAME = "INFORM_PROCESS"

# Change this path if is deployed in prod or dev
MAIN_PATH = "/root/airflow/dags/"

DEFAULT_ARGS = {
    'owner': 'BCOM',
    "start_date": datetime(2023, 7, 29, 22, 9),
    'depends_on_past': False,
    'email': 'caguirre@bytesw.com',
    'retries': 0,
    'email_on_retry': False,
    'email_on_failure': False
}


def upload_report(report_params: Dict[str, Any], provider: str, timezone: str, **kwargs) -> None:
    try:
        task = kwargs["ti"]
        report_path = task.xcom_pull(task_ids="CREATE-REPORT", key="REPORT_PATH")
        pattern = report_params["datetime_pattern"]
        conn = report_params["s3_params"]["connection_id"]
        bucket = report_params["s3_params"]["bucket"]
        prefix = report_params["s3_params"]["prefix"]
        key = report_params["s3_params"]["filename"]
        if not prefix.endswith("/"):
            key = prefix + "/" + key
        else:
            key = prefix + key
        upload = load_report_to_s3(conn, bucket, key, report_path, provider, timezone, pattern)
        if upload:
            logger.info(f"Se subio correctamente el reporte a {key}. bucket: {bucket}")
            delete_temp_dir(report_path)
    except Exception as e:
        logger.error(f"Error subiendo reporte a . {e}")


def create_report(tmp_path: str, **kwargs) -> None:
    try:
        task = kwargs["ti"]
        data = task.xcom_pull(task_ids="GET-DATA-REPORT", key="REPORT-DATA")
        status, execution_date = data["PROCESS_STATUS"], data["PROCESS_EXECUTION"]
        _, _ = data.pop("PROCESS_STATUS"), data.pop("PROCESS_EXECUTION")
        dir_name = str(uuid.uuid4())
        if not tmp_path.endswith("/"):
            tmp_path += "/"
        path_dir = tmp_path + dir_name
        os.makedirs(path_dir)
        excel_tmp_path = path_dir + "/tmp_excel.xlsx"
        with pd.ExcelWriter(excel_tmp_path, engine="xlsxwriter") as writer:
            workbook = writer.book
            worksheet = workbook.add_worksheet("report")
            worksheet.set_zoom(90)

            title = "Reporte de último proceso ejecutado"
            title_format = workbook.add_format()
            title_format.set_font_size(20)
            title_format.set_font_color("#333333")

            header = f"Reporte ejecutado el día {execution_date}"
            if status == ProcessStatusEnum.SUCCESS.value:
                status = "EXITOSO"
            elif status == ProcessStatusEnum.FAIL.value:
                status = "FALLIDO"
            elif status == ProcessStatusEnum.RESET.value:
                status = "RESETEADO POR EL USUARIO"
            status = f"Estado de último proceso ejecutado: {status}"
            header_format = workbook.add_format()
            header_format.set_font_size(10)
            header_format.set_font_color("#080606")
            worksheet.merge_range('A1:N1', title, title_format)
            worksheet.merge_range('A2:N2', header, header_format)
            worksheet.merge_range('A3:N3', status, header_format)
            row_format = workbook.add_format()
            row_format.set_font_size(8)
            row_format.set_font_color("#000000")
            base_index = 5
            for index, key in enumerate(data.keys()):
                index = base_index + index
                worksheet.merge_range('A' + str(index) + ':B' + str(index), key, row_format)
                if data[key]["TYPE"] == "EXTRACTION":
                    worksheet.merge_range('C' + str(index) + ':G' + str(index),
                                          f"TABLA DE EXTRACCIÓN: {data[key]['DESCRIPTION']}", row_format)
                elif data[key]["TYPE"] == "TRANSFORMATION":
                    script = data[key]["DESCRIPTION"].split("|")[1]
                    worksheet.merge_range('C' + str(index) + ':G' + str(index),
                                          f"SCRIPT DE TRANSFORMACIÓN: {script}", row_format)
                elif data[key]["TYPE"] == "GENERATION":
                    worksheet.merge_range('C' + str(index) + ':G' + str(index),
                                          f"ARCHIVO GENERADO DESDE LA TABLA: {data[key]['DESCRIPTION']}", row_format)
                worksheet.merge_range('H' + str(index) + ':I' + str(index),
                                      f"ESTADO: {data[key]['STATUS']}", row_format)
                worksheet.merge_range('J' + str(index) + ':N' + str(index), data[key]['MESSAGE'], row_format)
        task.xcom_push(key="REPORT_PATH", value=excel_tmp_path)
    except Exception as e:
        logger.error(f"Error creando reporte. {e}")


def get_data_report(**kwargs) -> None:
    try:
        report_data = {}
        task = kwargs['ti']
        control_config = task.xcom_pull(task_ids="CONTROL-EXTRACTOR", key="CONTROL-CONFIG")
        control_key, control = control_config[0], control_config[1]
        if not control:
            logger.info("Archivo de control no encontrado. No se actualizará ningún archivo de control")
        else:
            last_process = control[-1]
            if "reset_by_user" in last_process.keys():
                report_data["PROCESS_EXECUTION"] = ProcessStatusEnum.RESET.value
            else:
                total_tasks = [last_process["tasks"]]
                current_status = last_process["status"]
                control.reverse()
                for process in control:
                    if process["status"] == ProcessStatusEnum.SUCCESS.value:
                        break
                    total_tasks.append(process["tasks"])
                final_key_tasks, final_key_desc, final_key_message = {}, {}, {}
                for tasks in total_tasks:
                    for key in tasks.keys():
                        this_status = tasks[key]["status"]
                        this_desc = tasks[key]["description"]
                        this_message = tasks[key]["message"]
                        if key in final_key_tasks.keys():
                            task_status = final_key_tasks[key]
                            if this_status == ProcessStatusEnum.SUCCESS.value and \
                                    task_status == ProcessStatusEnum.FAIL.value:
                                final_key_tasks.update({key: this_status})
                                final_key_desc.update({key: this_desc})
                                final_key_message.update({key: ''})
                        else:
                            final_key_tasks.update({key: this_status})
                            final_key_desc.update({key: this_desc})
                            final_key_message.update({key: this_message})
                for item in final_key_tasks.keys():
                    if item.lower().startswith("extract"):
                        type_task = "EXTRACTION"
                    elif item.lower().startswith("transform"):
                        type_task = "TRANSFORMATION"
                    else:
                        type_task = "GENERATION"
                    report_data.update({item: {"STATUS": final_key_tasks[item], "TYPE": type_task,
                                               "DESCRIPTION": final_key_desc[item],
                                               'MESSAGE': final_key_message[item]}})
                report_data.update({"PROCESS_STATUS": current_status, "PROCESS_EXECUTION": last_process["date"]})
        task.xcom_push(key="REPORT-DATA", value=report_data)
        logger.info(f"Diccionario de datos para el reporte: {report_data}")
    except Exception as e:
        logger.error(f"Error general creando reporte. {e}")


def set_dag():
    """ DAG that reset the last process Airflow have"""
    import yaml
    from yaml.loader import SafeLoader

    # Cambiar conf_path dependiendo del ambiente, en prod usando k8 y contenedores usar /opt/airflow/dags/app_conf.yml
    # En desarrollo, cualquiera que apunte a su carpeta dags
    conf_path = MAIN_PATH + "dag_conf.yml"
    with open(conf_path) as f:
        data = yaml.load(f, Loader=SafeLoader)
        logger.info(f"CONFIGURACIÓN: {data}")
        conf = data["app"]
    with DAG(DAG_NAME, default_args=DEFAULT_ARGS, description="Proceso que informa del último proceso ejecutado",
             schedule_interval=conf["inform_dag_schedule"], tags=["DAG BCOM - INFORM PROCESS"], catchup=True) as dag:
        control_s3 = conf["control"]["s3_params"]
        timezone = conf["timezone"]
        control_extractor = PythonOperator(
            task_id="CONTROL-EXTRACTOR",
            python_callable=extract_last_control,
            op_kwargs={'conn_id': control_s3["connection_id"], 'bucket': control_s3["bucket"],
                       'prefix': control_s3["prefix"], 'provider': conf["cloud_provider"], 'timezone': timezone},
            trigger_rule="all_success"
        )

        create = PythonOperator(
            task_id="GET-DATA-REPORT",
            python_callable=get_data_report,
            op_kwargs={},
            trigger_rule="all_success"
        )

        tmp_dir = conf["outputs"]["tmp_path"]
        report = PythonOperator(
            task_id="CREATE-REPORT",
            python_callable=create_report,
            op_kwargs={'tmp_path': tmp_dir},
            trigger_rule="all_success"
        )

        report_params = conf["report"]
        upload = PythonOperator(
            task_id="UPLOAD-REPORT",
            python_callable=upload_report,
            op_kwargs={'report_params': report_params, 'provider': conf["cloud_provider"], 'timezone': timezone},
            trigger_rule="all_success"
        )

        control_extractor >> create >> report >> upload
    return dag


globals()["0"] = set_dag()
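In this new DAG, GET-DATA-REPORT condenses the control file into a flat dictionary that CREATE-REPORT turns into the Excel rows: one entry per task name plus the two PROCESS_* keys, each task entry carrying STATUS, TYPE (derived from the task-name prefix), DESCRIPTION and MESSAGE. A sketch of that XCom payload follows; the task names, statuses, descriptions and messages are assumed examples, not values from a real run:

# Illustrative REPORT-DATA payload pushed by get_data_report (all values assumed).
report_data = {
    "EXTRACTORS_0": {
        "STATUS": "success",
        "TYPE": "EXTRACTION",                       # task name starts with "extract..."
        "DESCRIPTION": "SOME_SOURCE_TABLE",         # table that was extracted
        "MESSAGE": ""
    },
    "TRANSFORMATIONS_0": {
        "STATUS": "fail",
        "TYPE": "TRANSFORMATION",
        "DESCRIPTION": "transform|some_script.sql", # create_report keeps the part after "|"
        "MESSAGE": "example error message"
    },
    "PROCESS_STATUS": "fail",
    "PROCESS_EXECUTION": "2023-07-30 18:40:00"
}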
dags/dag_reset_process.py
0 → 100644
View file @
3176aa51
from
datetime
import
datetime
from
typing
import
Any
,
Dict
from
airflow
import
DAG
import
json
from
airflow.operators.python
import
PythonOperator
from
components.Databases.Database
import
Database
from
components.DatabaseOperation.DatabaseTransformation
import
delete_table
from
components.Xcom
import
delete_all_xcom_tasks
,
delete_task_instances
from
components.S3Route
import
upload_file
from
components.Control
import
extract_last_control
from
components.Utils
import
delete_temp_dirs
import
logging
logger
=
logging
.
getLogger
()
DAG_NAME
=
"RESET_PROCESS"
# Change this path if is deployed in prod or dev
MAIN_PATH
=
"/root/airflow/dags/"
DEFAULT_ARGS
=
{
'owner'
:
'BCOM'
,
"start_date"
:
datetime
(
2023
,
7
,
28
,
22
,
9
),
'depends_on_past'
:
False
,
'email'
:
'caguirre@bytesw.com'
,
'retries'
:
0
,
'email_on_retry'
:
False
,
'email_on_failure'
:
False
}
def
update_control
(
control_params
:
Dict
[
str
,
Any
], provider: str, **kwargs) -> None:
    try:
        task = kwargs['ti']
        control_config = task.xcom_pull(task_ids="CONTROL-EXTRACTOR", key="CONTROL-CONFIG")
        control_key, control = control_config[0], control_config[1]
        if not control:
            logger.info("Archivo de control no encontrado. No se actualizará ningún archivo de control")
        else:
            last_process = control[-1]
            last_process.update({'reset_by_user': True})
            control.pop(-1)
            control.append(last_process)
            control = json.dumps(control).encode('utf-8')
            load = upload_file(control, control_params["connection_id"], control_params["bucket"],
                               control_key, provider)
            if load:
                logger.info(f"Subido correctamente el archivo de control en bucket {control_params['bucket']} y "
                            f"key: {control_key}")
        # Borrar las variables que hayan quedado
        delete_all_xcom_tasks()
        delete_task_instances()
    except Exception as e:
        logger.error(f"Error actualizando archivo de control. {e}")


def reset_process(intern_db, output_tmp_dir: str) -> None:
    try:
        # Borrando tablas
        tablenames = intern_db.get_all_tablenames()
        if len(tablenames) == 0:
            logger.info("No se encontraron tablas para su limpieza")
        else:
            for tablename in tablenames:
                tablename = tablename[0]
                delete = delete_table(tablename, intern_db.engine)
                if delete:
                    logger.info(f"Borrado correctamente la tabla {tablename}")
        # Borrando archivos temporales que hayan quedado sueltos
        delete = delete_temp_dirs(output_tmp_dir)
        if delete:
            logger.info("Se borraron todos los archivos temporales")
    except Exception as e:
        logger.error(f"Error procesando archivo de control. {e}")


def set_dag():
    """DAG that resets the last process executed by Airflow"""
    import yaml
    from yaml.loader import SafeLoader

    # Cambiar conf_path dependiendo del ambiente, en prod usando k8 y contenedores usar /opt/airflow/dags/app_conf.yml
    # En desarrollo, cualquiera que apunte a su carpeta dags
    conf_path = MAIN_PATH + "dag_conf.yml"
    with open(conf_path) as f:
        data = yaml.load(f, Loader=SafeLoader)
        logger.info(f"CONFIGURACIÓN: {data}")
        conf = data["app"]
    with DAG(DAG_NAME, default_args=DEFAULT_ARGS, description="Proceso que resetea el último proceso ejecutado",
             schedule_interval=conf["reset_dag_schedule"], tags=["DAG BCOM - RESET PROCESS"], catchup=True) as dag:
        control_s3 = conf["control"]["s3_params"]
        timezone = conf["timezone"]

        control_extractor = PythonOperator(
            task_id="CONTROL-EXTRACTOR",
            python_callable=extract_last_control,
            op_kwargs={'conn_id': control_s3["connection_id"], 'bucket': control_s3["bucket"],
                       'prefix': control_s3["prefix"], 'provider': conf["cloud_provider"], 'timezone': timezone},
            trigger_rule="all_success"
        )

        # Intern Database configuration.
        intern_params = conf["database"]["transformation"]
        intern_db = Database(intern_params["type"], intern_params["host"], int(intern_params["port"]),
                             intern_params["username"], intern_params["password"], intern_params["database"],
                             intern_params["service"], intern_params["schema"])
        intern_db.create_engine()

        tmp_dir = conf["outputs"]["tmp_path"]
        control_process = PythonOperator(
            task_id="RESET-PROCESS",
            python_callable=reset_process,
            op_kwargs={'intern_db': intern_db, 'output_tmp_dir': tmp_dir},
            trigger_rule="all_success"
        )

        update = PythonOperator(
            task_id="UPDATE-CONTROL",
            python_callable=update_control,
            op_kwargs={'control_params': control_s3, 'provider': conf["cloud_provider"]},
            trigger_rule="all_success"
        )

        control_extractor >> control_process >> update

    return dag


globals()["0"] = set_dag()
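Note: reset_process relies on a delete_temp_dirs helper whose definition is not visible in this excerpt. A minimal sketch of what such a helper could look like — assuming it only has to wipe whatever is left under the temporary output path and report whether the cleanup succeeded — would fit the call site roughly like this:

import os
import shutil
import logging

logger = logging.getLogger()


def delete_temp_dirs(tmp_dir: str) -> bool:
    """Illustrative sketch only: remove leftover files/directories under tmp_dir."""
    try:
        if not os.path.isdir(tmp_dir):
            # Nothing to clean up; treat as success so the DAG can continue.
            return True
        for entry in os.listdir(tmp_dir):
            path = os.path.join(tmp_dir, entry)
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
        return True
    except OSError as e:
        logger.error(f"Error borrando archivos temporales. {e}")
        return False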
dags/dag_transformacion_bcom.py  View file @ 3176aa51

@@ -8,7 +8,7 @@ from airflow.operators.python import PythonOperator
 from airflow.utils.task_group import TaskGroup
 from components.Utils import update_sql_commands
 from components.Xcom import save_commands_to_xcom
-from components.S3Route import get_files_from_prefix, get_file_from_key
+from components.S3Route import get_files_from_prefix, get_file_from_prefix
 from components.Sensor import create_sensor
 from components.Extractor import get_extract_task_group
 from components.Transformation import get_transform_task_group
@@ -22,10 +22,10 @@ import logging
 logger = logging.getLogger()

-DAG_NAME = "BCOM_DAG_TRANSFORMACIONES3"
+DAG_NAME = "BCOM_DAG_TRANSFORMACIONES4"

 # Change this path if is deployed in prod or dev
-MAIN_PATH = "/opt/airflow/dags/"
+MAIN_PATH = "/root/airflow/dags/"
 JSON_PROCEDURE_PATH = MAIN_PATH + "procedure_definition2.json"

 DEFAULT_ARGS = {
@@ -39,10 +39,10 @@ DEFAULT_ARGS = {
 }

-def cleaning(intern_conn, control_s3: Dict[str, Any], provider: str) -> TaskGroup:
+def cleaning(intern_conn, control_s3: Dict[str, Any], provider: str, timezone: str) -> TaskGroup:
     groups = None
     try:
-        groups = get_cleaning_task_group(intern_conn, control_s3, provider)
+        groups = get_cleaning_task_group(intern_conn, control_s3, provider, timezone)
     except Exception as e:
         logger.error(f"Error general de transformación de datos. {e}")
     finally:
@@ -92,20 +92,22 @@ def save_procedure_json(json_path: str, task) -> None:
         logger.error(f"Error leyendo y guardando archivo descriptor de procedure. {e}")

-def extract_control(conn_id: str, bucket: str, prefix: str, filename: str, task, provider: str):
+def extract_control(conn_id: str, bucket: str, prefix: str, task, provider: str, timezone: str):
     try:
         if not prefix.endswith("/"):
             prefix += "/"
-        key = prefix + filename
-        logger.info(f"EXTRAYENDO ARCHIVO DE CONTROL DESDE {key}")
-        control = get_file_from_key(conn_id, bucket, key, provider)
+        logger.info(f"BUSCANDO Y EXTRAYENDO ARCHIVO DE CONTROL DESDE {prefix}")
+        control, _ = get_file_from_prefix(conn_id, bucket, prefix, provider, timezone, task)
         if control:
             str_data = str(control.getvalue(), encoding='UTF-8', errors='ignore')
             data = StringIO(str_data)
-            data = json.load(data)
+            if not data:
+                data = [{"status": ProcessStatusEnum.SUCCESS.value, "tasks": {}}]
+            else:
+                data = json.load(data)
         else:
-            logger.info(f"Json Procedure descargado: {control}")
-            data = [{"status": ProcessStatusEnum.SUCCESS.value, "tasks": []}]
+            logger.info(f"Json de control creado: {control}")
+            data = [{"status": ProcessStatusEnum.SUCCESS.value, "tasks": {}}]
         task.xcom_push(key="CONTROL-CONFIG", value=data)
     except Exception as e:
         logger.error(f"Error general de descarga de archivo de control. {e}")
@@ -113,10 +115,10 @@ def extract_control(conn_id: str, bucket: str, prefix: str, filename: str, task,

 def extract_scripts(conn_id: str, bucket: str, prefix: str, source_mask: str, transform_mask: str,
                     order_delimiter: str, procedure_mask: str, label_tablename: str, control_params: Dict[str, Any],
-                    provider: str, **kwargs):
+                    provider: str, timezone: str, **kwargs):
     try:
         extract_control(control_params["connection_id"], control_params["bucket"], control_params["prefix"],
-                        control_params["filename"], kwargs['ti'], provider)
+                        kwargs['ti'], provider, timezone)
         save_procedure_json(JSON_PROCEDURE_PATH, kwargs['ti'])
         start_time = time.time()
         logger.info(f"EXTRAYENDO SCRIPTS DESDE {bucket}/{prefix}")
@@ -152,6 +154,7 @@ def set_dag():
     sensor_scripts = create_sensor("SCRIPTS-SENSOR", scripts_s3["connection_id"], scripts_s3["bucket"],
                                    wildcard_scripts, conf["cloud_provider"])
     control_s3 = conf["control"]["s3_params"]
+    timezone = conf["timezone"]

     # Scripts extraction
     extract_mask = conf["source_mask"]
     transform_mask = conf["transformation_mask"]
@@ -164,7 +167,7 @@ def set_dag():
                    'prefix': scripts_s3["prefix"], 'source_mask': extract_mask, 'transform_mask': transform_mask,
                    'procedure_mask': procedure_mask, 'order_delimiter': order_delimiter,
                    'label_tablename': conf["label_multiple_select"], 'control_params': control_s3,
-                   'provider': conf["cloud_provider"]},
+                   'provider': conf["cloud_provider"], 'timezone': timezone},
         trigger_rule="all_success"
     )
@@ -184,7 +187,6 @@ def set_dag():
     # Creación de grupo de tasks para las extracciones
     chunksize = conf["chunksize"]
-    timezone = conf["timezone"]
     extractions = extraction(source_db, intern_db, timezone, control_s3, conf["cloud_provider"], chunksize)

     # Creación de grupo de tasks para las transformaciones
@@ -195,7 +197,7 @@ def set_dag():
     result = generate_and_deploy_results(intern_db, outputs_conf, timezone, control_s3, conf["cloud_provider"])

     # Creación de tasks de limpiadores
-    cleaners = cleaning(intern_db, control_s3, conf["cloud_provider"])
+    cleaners = cleaning(intern_db, control_s3, conf["cloud_provider"], timezone)

     sensor_scripts >> script_extractor >> extractions >> transformations >> result >> cleaners

     return dag
 ...
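The functional core of this diff is that the control file is no longer fetched by an exact key built from prefix + filename (get_file_from_key); instead get_file_from_prefix searches the configured prefix, which is why the filename argument disappears and a timezone is threaded through the call chain. As an illustration of that kind of prefix-based lookup — assuming an S3 bucket accessed through boto3, not the project's actual helper in components/S3Route.py — the idea is roughly:

from io import BytesIO
from typing import Optional

import boto3


def get_latest_object_by_prefix(bucket: str, prefix: str) -> Optional[BytesIO]:
    """Illustrative sketch: download the most recently modified object under a prefix."""
    s3 = boto3.client("s3")
    listing = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    objects = [o for o in listing.get("Contents", []) if not o["Key"].endswith("/")]
    if not objects:
        # No control file yet: extract_control falls back to a default structure.
        return None
    latest = max(objects, key=lambda o: o["LastModified"])
    body = s3.get_object(Bucket=bucket, Key=latest["Key"])["Body"].read()
    return BytesIO(body)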
dags/dag_transformacion_tacomventas_promoresidencial.py  View file @ 3176aa51

@@ -310,5 +310,5 @@ def set_dag_1():
     return dag

-globals()["0"] = set_dag_1()
+# globals()["0"] = set_dag_1()
dags/enums/DataTypeOrmEnum.py  View file @ 3176aa51

 import enum
-from sqlalchemy import Integer, String, Date, DateTime, DECIMAL, BOOLEAN, TEXT
+from sqlalchemy import Integer, String, Date, DateTime, DECIMAL, BOOLEAN, Text

 class DataTypeOrmEnum(enum.Enum):
@@ -9,4 +9,4 @@ class DataTypeOrmEnum(enum.Enum):
     DATE = Date
     DATETIME = DateTime
     BOOLEAN = BOOLEAN
-    LONGTEXT = TEXT
+    LONGTEXT = Text
dags/enums/ProcessStatusEnum.py  View file @ 3176aa51

@@ -4,3 +4,4 @@ from enum import Enum
 class ProcessStatusEnum(Enum):
     SUCCESS = "success"
     FAIL = "failed"
+    RESET = "reset"
dags/procedure_definition2.json  View file @ 3176aa51

 [
   {
-    "identifier": "TABLA1",
+    "identifier": "tabla2",
     "fields": [
       {
-        "name": "columna1",
+        "name": "column1",
         "datatype": "TEXT",
-        "maxLength": 50
+        "maxLength": 100
       },
       {
-        "name": "columna2",
+        "name": "column2",
         "datatype": "NUMBER"
       },
       {
-        "name": "columna3",
-        "datatype": "BOOLEAN"
+        "name": "column3",
+        "datatype": "NUMBER"
       },
       {
-        "name": "columna4",
-        "datatype": "NUMBER"
+        "name": "column4",
+        "datatype": "DATE"
       },
       {
-        "name": "columna5",
-        "datatype": "DECIMAL"
+        "name": "column5",
+        "datatype": "DECIMAL",
+        "decimal_precision": 5
       }
     ]
   }
 ...
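Each object in this descriptor defines one target table: "identifier" names the table and every item in "fields" becomes a column whose "datatype" is resolved through DataTypeOrmEnum, with "maxLength" and the new "decimal_precision" refining TEXT and DECIMAL columns. A hedged sketch of that mapping — it assumes the enum also exposes TEXT, NUMBER and DECIMAL members for the datatypes used above, and it is not the project's actual table builder:

from sqlalchemy import Column, DECIMAL, MetaData, String, Table

from enums.DataTypeOrmEnum import DataTypeOrmEnum


def build_table(definition: dict, metadata: MetaData) -> Table:
    """Illustrative sketch: turn one procedure_definition2.json entry into a Table."""
    columns = []
    for field in definition["fields"]:
        sa_type = DataTypeOrmEnum[field["datatype"]].value
        if field["datatype"] == "TEXT" and "maxLength" in field:
            sa_type = String(field["maxLength"])                # e.g. column1 -> VARCHAR(100)
        elif field["datatype"] == "DECIMAL" and "decimal_precision" in field:
            sa_type = DECIMAL(38, field["decimal_precision"])   # decimal_precision used as the scale here
        columns.append(Column(field["name"], sa_type))
    return Table(definition["identifier"], metadata, *columns)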