Project: general / bcom-tp-etl-transformation-pipelines

Commit b1377db9, authored Jul 05, 2023 by Cristian Aguirre
Parent commit: 39a51800

    Update 04-07-23. Update DAG process. Initial process completed.

22 changed files, with 604 additions and 114 deletions (+604 / -114).
Changed files:

    dags/components/Cleaning.py                                   +61   -0
    dags/components/DatabaseOperation/DatabaseExtraction.py       +38   -0
    dags/components/DatabaseOperation/DatabaseLoad.py             +16   -0
    dags/components/DatabaseOperation/DatabaseTransformation.py   +33   -0
    dags/components/Databases/Database.py                         +0    -1
    dags/components/Databases/Enums/MysqlDataTypeORMEnum.py       +11   -11
    dags/components/Databases/Enums/OracleDataTypeEnum.py         +1    -0
    dags/components/Databases/Enums/OracleDataTypeORMEnum.py      +3    -2
    dags/components/Databases/Enums/PostgresDataTypeORMEnum.py    +3    -3
    dags/components/Databases/Mysql.py                            +4    -1
    dags/components/Databases/Oracle.py                           +16   -2
    dags/components/Databases/Postgres.py                         +4    -1
    dags/components/Extractor.py                                  +41   -36
    dags/components/Generation.py                                 +91   -0
    dags/components/Model/InsumoModel.py                          +1    -1
    dags/components/S3Route.py                                    +16   -9
    dags/components/Timezone.py                                   +34   -0
    dags/components/Transformation.py                             +58   -0
    dags/components/Utils.py                                      +66   -21
    dags/components/Xcom.py                                       +31   -4
    dags/dag_conf.yml                                             +9    -5
    dags/dag_transformacion_bcom.py                               +67   -17
dags/components/Cleaning.py (new file, 0 → 100644)

from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator
from airflow.models import Variable

from components.Utils import select_multiple
from components.Xcom import delete_all_xcom_tasks
from components.DatabaseOperation.DatabaseTransformation import delete_table

import logging

logger = logging.getLogger()


def clean(command: str, intern_conn):
    engine = intern_conn.engine
    tablename = select_multiple(command)["tablename"]
    logger.info(f"Borrando tabla {tablename}")
    delete = delete_table(tablename, engine)
    if delete:
        logger.info(f"Borrado correctamente la tabla {tablename}")
    delete_all_xcom_tasks()
    logger.info(f"Borrado todas las variables xcom")


def get_generate_from_xcom(**kwargs):
    final_selects = []
    task = kwargs['ti']
    xcom_keys = task.xcom_pull(task_ids="SCRIPTS-EXTRACTOR", key="XCOM-EXTRACTION-NAMES")
    logger.debug(xcom_keys)
    for key in xcom_keys:
        if not key.startswith("SELECT"):
            continue
        xcom_outputs = task.xcom_pull(task_ids="SCRIPTS-EXTRACTOR", key=key)
        logger.info(f"Trayendo comandos {xcom_outputs}")
        for select in xcom_outputs:
            final_selects.append(select)
    Variable.set(key='CLEANS', value=final_selects, serialize_json=True)


def get_cleaning_task_group(db_intern_conn) -> TaskGroup or None:
    group = None
    try:
        with TaskGroup(group_id="LimpiezaDelProceso", prefix_group_id=False) as group:
            init_task = PythonOperator(
                task_id="MASTER_CLEANING",
                python_callable=get_generate_from_xcom,
                trigger_rule='all_success'
            )

            cleaners = Variable.get('CLEANS', default_var=[], deserialize_json=True)
            if cleaners:
                tasks = PythonOperator.partial(
                    task_id="CLEANERS",
                    python_callable=clean,
                    op_kwargs={'intern_conn': db_intern_conn}
                ).expand(op_args=[[item] for item in cleaners])
                init_task >> tasks
    except Exception as e:
        logger.error(f"Error creando taskGroup de limpiadores. {e}")
    finally:
        return group
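Every task group added in this commit (CLEANERS, EXTRACTORS, TRANSFORMATIONS, GENERATORS) relies on the same dynamic task-mapping idiom: PythonOperator.partial(...).expand(op_args=...). Illustrative only, not part of the commit — a minimal, self-contained sketch of that idiom, assuming Airflow 2.3+; the DAG id, callable, and values are hypothetical.

from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator


def show(item, label):
    # Each mapped task instance receives one element of op_args plus the shared op_kwargs.
    print(f"{label}: {item}")


with DAG("mapping_example", start_date=datetime(2023, 1, 1),
         schedule_interval=None, catchup=False) as dag:
    # .partial() fixes the common arguments; .expand() creates one task instance per element.
    PythonOperator.partial(
        task_id="SHOW",
        python_callable=show,
        op_kwargs={"label": "command"},          # shared by every mapped instance
    ).expand(op_args=[["cmd-1"], ["cmd-2"]])     # two mapped instances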
dags/components/DatabaseOperation/DatabaseExtraction.py (new file, 0 → 100644)

import pandas as pd
import logging

logger = logging.getLogger()


def get_steps(sql_command: str, chunksize: int, connection, is_tablename: bool = False) -> int:
    final_steps = 0
    try:
        if is_tablename:
            count_command = f"SELECT COUNT(*) FROM {sql_command}"
        else:
            count_command = f"SELECT COUNT(*) FROM ({sql_command}) BCOM"
        with connection.connect() as conn:
            result = conn.execute(count_command).first()
            if result:
                total_rows = int(result[0])
                logger.info(f"Total de filas: {total_rows}")
                steps = int(total_rows / chunksize)
                if steps >= final_steps:
                    final_steps = steps + 1
    except Exception as e:
        logger.error(f"Error calculando el total de N° de filas desde el comando: {sql_command}. {e}")
    finally:
        return final_steps


def get_iterator(command: str, chunksize: int, connection) -> iter:
    iterator = None
    try:
        connection = connection.execution_options(stream_results=True)
        iterator = pd.read_sql(command, connection, index_col=None, chunksize=chunksize)
        iterator = iter(iterator)
    except Exception as e:
        logger.error(f"Error trayendo iterator. {e}")
    finally:
        return iterator
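Illustrative only, not part of the commit: a minimal sketch of how get_steps() and get_iterator() pair up to walk a source in chunks. It assumes an in-memory SQLite engine as a stand-in for the real source connection and the SQLAlchemy 1.x behaviour the module relies on (executing plain SQL strings on a connection).

import pandas as pd
from sqlalchemy import create_engine

from components.DatabaseOperation.DatabaseExtraction import get_steps, get_iterator

engine = create_engine("sqlite://")                                   # stand-in source connection
pd.DataFrame({"id": range(25)}).to_sql("insumo", engine, index=False)

chunksize = 10
steps = get_steps("insumo", chunksize, engine, is_tablename=True)     # 25 rows / 10 -> 3 steps
iterator = get_iterator("SELECT * FROM insumo", chunksize, engine)
for step in range(steps):
    try:
        chunk = next(iterator)        # a DataFrame with at most `chunksize` rows
    except StopIteration:
        break
    print(step, len(chunk))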
dags/components/DatabaseOperation/DatabaseLoad.py (new file, 0 → 100644)

import pandas as pd
import logging

logger = logging.getLogger()


def save_from_dataframe(df: pd.DataFrame, tablename: str, connection) -> bool:
    save = True
    try:
        with connection.connect() as conn:
            df.to_sql(tablename, conn, if_exists='append', index=False, chunksize=500)
    except Exception as e:
        logger.error(f"Error guardando resultados desde dataframe. {e}")
    finally:
        return save
dags/components/DatabaseOperation/DatabaseTransformation.py (new file, 0 → 100644)

import logging
import time
from typing import List

logger = logging.getLogger()


def execute_transformations(commands: List[str], engine):
    try:
        with engine.connect() as connection:
            for command in commands:
                logger.debug(f"Ejecutando comando de transformación: {command}")
                _ = connection.execute(command)
    except Exception as e:
        logger.error(f"Error ejecutando comando de transformación. {e}")


def delete_table(tablename: str, engine) -> bool:
    delete = False
    try:
        command = f"DROP TABLE {tablename}"
        start_time = time.time()
        with engine.connect() as conn:
            try:
                _ = conn.execute(command)
            except Exception:
                logger.error(f"Tabla no encontrada")
            delete = True
        logger.debug(f"Duración de borrado: {time.time() - start_time}")
    except Exception as e:
        logger.error(f"Error borrando tabla {tablename}. {e}")
    finally:
        return delete
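Illustrative only, not part of the commit: a throwaway sketch of these two helpers against an in-memory SQLite engine, again assuming the SQLAlchemy 1.x plain-string execute the module uses.

from sqlalchemy import create_engine

from components.DatabaseOperation.DatabaseTransformation import execute_transformations, delete_table

engine = create_engine("sqlite://")
execute_transformations(
    ["CREATE TABLE demo (id INTEGER)", "INSERT INTO demo VALUES (1)"],
    engine,
)
print(delete_table("demo", engine))   # True; failures are logged rather than raised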
dags/components/Database.py → dags/components/Databases/Database.py (renamed)

from typing import List, Tuple
from sqlalchemy import Table, Column, MetaData

from components.Model.InsumoModel import InsumoModel
from components.Databases.Mysql import Mysql
from components.Databases.Mariadb import Mariadb
from components.Databases.Postgres import Postgres
...
dags/components/Databases/Enums/MysqlDataTypeORMEnum.py

from enum import Enum
-from sqlalchemy import Integer, String, Float, Date, TIMESTAMP, Numeric, CHAR, Text, Boolean, BLOB, JSON, BigInteger, \
-    DateTime, Time, DECIMAL
+from sqlalchemy import INTEGER, String, Float, DATE, TIMESTAMP, NUMERIC, CHAR, TEXT, BLOB, JSON, BIGINT, \
+    DATETIME, TIME, DECIMAL


class MysqlDataTypeORMEnum(Enum):
    VARCHAR = String
-    TINY_INT = Integer
-    INT = Integer
-    TEXT = Text
+    TINY_INT = INTEGER
+    INT = INTEGER
+    TEXT = TEXT
    DECIMAL = DECIMAL
    SHORT = String
    TIMESTAMP = TIMESTAMP
    JSON = JSON
-    BIGINT = BigInteger
-    FLOAT = Float
-    DATETIME = DateTime
-    DATE = Date
-    TIME = Time
-    DOUBLE = Float
+    BIGINT = BIGINT
+    FLOAT = NUMERIC
+    DATETIME = DATETIME
+    DATE = DATE
+    TIME = TIME
+    DOUBLE = NUMERIC
dags/components/Databases/Enums/OracleDataTypeEnum.py

...
@@ -5,6 +5,7 @@ class OracleDataTypeEnum(Enum):
    NUMBER = "DB_TYPE_NUMBER"
    VARCHAR2 = "DB_TYPE_VARCHAR"
    DATE = "DB_TYPE_DATE"
    TIMESTAMP = "DB_TYPE_TIMESTAMP"
...
dags/components/Databases/Enums/OracleDataTypeORMEnum.py

from enum import Enum
-from sqlalchemy import Integer, String, Float, Date, TIMESTAMP, Numeric, CHAR, Text, Boolean, BLOB
+from sqlalchemy import Integer, String, Float, Date, TIMESTAMP, NUMERIC, CHAR, Text, Boolean, BLOB


class OracleDataTypeORMEnum(Enum):
-    NUMBER = Integer
+    NUMBER = NUMERIC
    VARCHAR2 = String
    DATE = Date
    TIMESTAMP = TIMESTAMP
dags/components/Databases/Enums/PostgresDataTypeORMEnum.py

from enum import Enum
-from sqlalchemy import Integer, String, Float, Date, TIMESTAMP, Numeric, CHAR, Text, Boolean, BLOB
+from sqlalchemy import Integer, String, Float, Date, TIMESTAMP, NUMERIC, CHAR, Text, Boolean, BLOB


class PostgresDataTypeORMEnum(Enum):
    VARCHAR = String
    INTEGER = Integer
-    NUMERIC = Numeric
+    NUMERIC = NUMERIC
    TIMESTAMP = TIMESTAMP
    TEXT = Text
    BOOLEAN = Boolean
    BYTEA = BLOB
    CHAR = CHAR
-    FLOAT = Float
+    FLOAT = NUMERIC
    DATE = Date
    INTEGER8 = Integer
...
dags/components/Databases/Mysql.py

...
@@ -44,6 +44,9 @@ class Mysql:
            if field[2] != -1:
                size = int(field[2] / 4)
                try:
                    if not isinstance(field[3], type(None)) and field[3] > 0:
                        data_type = MysqlDataTypeORMEnum[MysqlDataTypeEnum(field[1]).name].value(precision=field[2], scale=field[3])
                    else:
                        data_type = MysqlDataTypeORMEnum[MysqlDataTypeEnum(field[1]).name].value(size)
                except TypeError:
                    data_type = MysqlDataTypeORMEnum[MysqlDataTypeEnum(field[1]).name].value()
...
dags/components/Databases/Oracle.py

...
@@ -40,17 +40,31 @@ class Oracle:
            for field in fields:
                logger.info(f"Attribute: {field}")
                name = field[0]
                # Default precision for Integer Oracle : 38
                if not isinstance(field[2], type(None)):
                    size = int(field[2] / 4)
                    try:
                        if not isinstance(field[3], type(None)) and field[3] > 0:
                            data_type = OracleDataTypeORMEnum[OracleDataTypeEnum(field[1].name).name].value(precision=field[3], scale=field[3])
                        elif not isinstance(field[3], type(None)):
                            data_type = OracleDataTypeORMEnum[OracleDataTypeEnum(field[1].name).name].value(precision=38, scale=field[3])
                        else:
                            data_type = OracleDataTypeORMEnum[OracleDataTypeEnum(field[1].name).name].value(size)
                    except TypeError:
                        data_type = OracleDataTypeORMEnum[OracleDataTypeEnum(field[1].name).name].value()
                elif not isinstance(field[3], type(None)) and field[3] < 0:
                    data_type = OracleDataTypeORMEnum[OracleDataTypeEnum(field[1].name).name].value(precision=38, scale=16)
                elif not isinstance(field[3], type(None)) and field[3] >= 0:
                    try:
                        data_type = OracleDataTypeORMEnum[OracleDataTypeEnum(field[1].name).name].value(precision=38)
                    except TypeError:
                        data_type = OracleDataTypeORMEnum[OracleDataTypeEnum(field[1].name).name].value()
                else:
                    data_type = OracleDataTypeORMEnum[OracleDataTypeEnum(field[1].name).name].value()
                setattr(model, name, Column(data_type))
            model = model.__table__
        except Exception as e:
-            logger.error(f"Error creando modelo dinámico en Oracle con nombre {tablename}. {e}")
+            logger.error(f"Error creando modelo dinámico en Oracle con nombre {tablename}. {type(e)}.{e}")
        finally:
            return model
dags/components/Databases/Postgres.py

...
@@ -44,6 +44,9 @@ class Postgres:
            name = field[0]
            if field[2] != -1:
                try:
                    if not isinstance(field[3], type(None)) and field[3] > 0:
                        data_type = PostgresDataTypeORMEnum[PostgresDataTypeEnum(field[1]).name].value(precision=field[2], scale=field[3])
                    else:
                        data_type = PostgresDataTypeORMEnum[PostgresDataTypeEnum(field[1]).name].value(field[2])
                except TypeError:
                    data_type = PostgresDataTypeORMEnum[PostgresDataTypeEnum(field[1]).name].value()
...
dags/components/Extractor.py

from typing import List, Tuple
from enums.DatabaseTypeEnum import DatabaseTypeEnum
from components.Utils import select_multiple
from components.DatabaseOperation.DatabaseExtraction import get_iterator, get_steps
from components.DatabaseOperation.DatabaseLoad import save_from_dataframe
from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator
...
@@ -11,63 +11,77 @@ import logging
logger = logging.getLogger()


-def extract_from_source(command: str, source_conn, intern_conn):
-    engine = source_conn.engine
+def extract_from_source(command: str, source_conn, intern_conn, chunksize: int, **kwargs):
+    source_engine = source_conn.engine
+    command_for_create = command
    if source_conn.db_type == DatabaseTypeEnum.ORACLE.value:
-        command += " OFFSET 1 ROWS FETCH NEXT 1 ROWS ONLY"
+        command_for_create += " OFFSET 1 ROWS FETCH NEXT 1 ROWS ONLY"
    else:
-        command_words = command.split(" ")
+        command_words = command_for_create.split(" ")
        if command_words[-2].lower() != "limit":
-            command += " limit 1"
+            command_for_create += " limit 1"
    columns = []
-    with engine.connect() as connection:
-        result = connection.execute(command)
+    with source_engine.connect() as connection:
+        final_command = command_for_create
+        if final_command.replace(" ", "").lower().find("|select"):
+            final_command = final_command[final_command.find("select"):]
+        result = connection.execute(final_command)
        fields = result.cursor.description
        for field in fields:
-            name = field[0]
-            type_code = field[1]
-            length = field[3]
-            column = (name, type_code, length)
-            columns.append(column)
+            name, type_code, length, presicion = field[0], field[1], field[3], field[5]
+            columns.append((name, type_code, length, presicion))
    logger.debug(f"Columnas procesadas: {columns}")
    multiple = select_multiple(command)
    if multiple["is_multiple"]:
        pass
    else:
        multiple = select_multiple(command_for_create)
    model = source_conn.create_model(multiple["tablename"], columns)
    create = intern_conn.create_table(model)
    print(create)
    logger.info(f"Creado correctamente la tabla {multiple['tablename']}. Creado?: {create}")
    # Se tiene que calcular el total de filas de la fuente
    if command.replace(" ", "").lower().find("|select"):
        command = command[command.find("select"):]
    steps = get_steps(command, chunksize, source_engine)
    # Traemos el iterator
    iterator = get_iterator(command, chunksize, source_engine)
    logger.info(f"Número de pasos para migrar datos: {steps}")
    for step in range(steps):
        dataframe = next(iterator)
        dataframe["INTERN_ID_BCOM"] = None
        logger.debug(dataframe)
        save = save_from_dataframe(dataframe, multiple["tablename"], intern_conn.engine)
        if save:
            logger.info(f"Guardado correctamente dataframe en el paso {step+1}")


def get_select_from_xcom(**kwargs):
    final_selects = []
    task = kwargs['ti']
-    xcom_keys = task.xcom_pull(task_ids="DAG-BCOM-EXTRACT-SCRIPTS", key="XCOM-EXTRACTION-NAMES")
+    xcom_keys = task.xcom_pull(task_ids="SCRIPTS-EXTRACTOR", key="XCOM-EXTRACTION-NAMES")
    logger.debug(xcom_keys)
    for key in xcom_keys:
        if not key.startswith("SELECT"):
            continue
-        xcom_selects = task.xcom_pull(task_ids="DAG-BCOM-EXTRACT-SCRIPTS", key=key)
+        xcom_selects = task.xcom_pull(task_ids="SCRIPTS-EXTRACTOR", key=key)
        logger.info(f"Trayendo comandos {xcom_selects}")
        for select in xcom_selects:
            final_selects.append(select)
    Variable.set(key='SELECTS', value=final_selects, serialize_json=True)


-def get_task_group(db_source_conn, db_intern_conn) -> TaskGroup or None:
+def get_extract_task_group(db_source_conn, db_intern_conn, chunksize: int) -> TaskGroup or None:
    group = None
    try:
        with TaskGroup(group_id="ExtraccionDeDatos", prefix_group_id=False) as group:
            init_task = PythonOperator(
                task_id="MASTER_EXTRACTOR",
-                python_callable=get_select_from_xcom
+                python_callable=get_select_from_xcom,
+                trigger_rule='all_success'
            )

            selects = Variable.get('SELECTS', default_var=[], deserialize_json=True)
            if selects:
                tasks = PythonOperator.partial(
                    task_id="EXTRACTORS",
                    python_callable=extract_from_source,
-                    op_kwargs={'source_conn': db_source_conn, 'intern_conn': db_intern_conn}
+                    op_kwargs={'source_conn': db_source_conn, 'intern_conn': db_intern_conn,
+                               'chunksize': chunksize}
                ).expand(op_args=[[item] for item in selects])
                init_task >> tasks
...
@@ -76,12 +90,3 @@ def get_task_group(db_source_conn, db_intern_conn) -> TaskGroup or None:
    finally:
        return group


-def create_table(db_type: str, tablename: str, fields: List[Tuple[str]]) -> bool:
-    create = False
-    try:
-        pass
-    except Exception as e:
-        raise AssertionError(f"Error creando tabla {tablename}. {e}")
-    finally:
-        return create
dags/components/Generation.py (new file, 0 → 100644)

from typing import Any, Dict
import os

from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator
from airflow.models import Variable

from components.S3Route import save_df_to_s3
from components.Utils import select_multiple, create_temp_file, delete_temp_dir
from components.DatabaseOperation.DatabaseExtraction import get_iterator, get_steps

import logging

logger = logging.getLogger()


def generate_and_deploy(command: str, intern_conn, params: Dict[str, Any], timezone: str, chunksize=10000):
    engine = intern_conn.engine
    tablename = select_multiple(command)["tablename"]
    logger.info(f"Generando resultado de la tabla {tablename}")
    # Creamos el archivo temporal
    filename_mask = params["filename_mask"]
    file_type = params["file_type"]
    pattern = params["datetime_pattern"]
    tmp_file = create_temp_file(filename_mask, file_type, tablename, timezone, pattern)
    logger.info(tmp_file)
    steps = get_steps(tablename, chunksize, engine, True)
    iterator = get_iterator(tablename, chunksize, engine)
    logger.info(f"Total de pasos para generar archivo resultado: {steps}")
    for step in range(steps):
        logger.debug(f"STEP: {step}")
        header = True if step == 0 else False
        try:
            dataframe = next(iterator)
            dataframe = dataframe.drop("INTERN_ID_BCOM", axis=1)
            logger.debug(dataframe)
            dataframe.to_csv(tmp_file, index=False, mode='a', header=header)
        except StopIteration:
            break

    bucket = params["s3_params"]["bucket"]
    prefix = params["s3_params"]["prefix"]
    conn_id = params["s3_params"]["connection_id"]
    if not prefix.endswith("/"):
        prefix += "/"
    file_key = prefix + tmp_file[tmp_file.rfind("/") + 1:]
    # Se sube el archivo al S3
    logger.info(f"Tamaño del archivo a subir: {os.path.getsize(tmp_file)} bytes")
    save_df_to_s3(tmp_file, conn_id, bucket, file_key, in_memory=False)
    # Se borra el archivo al finalizar el upload
    delete_temp_dir(tmp_file)


def get_generate_from_xcom(**kwargs):
    final_outputs = []
    task = kwargs['ti']
    xcom_keys = task.xcom_pull(task_ids="SCRIPTS-EXTRACTOR", key="XCOM-EXTRACTION-NAMES")
    logger.debug(xcom_keys)
    for key in xcom_keys:
        if not key.startswith("SELECT"):
            continue
        xcom_outputs = task.xcom_pull(task_ids="SCRIPTS-EXTRACTOR", key=key)
        logger.info(f"Trayendo comandos {xcom_outputs}")
        for select in xcom_outputs:
            final_outputs.append(select)
    Variable.set(key='GENERATES', value=final_outputs, serialize_json=True)


def get_generate_task_group(db_intern_conn, parameters: Dict[str, Any], timezone) -> TaskGroup or None:
    group = None
    try:
        with TaskGroup(group_id="GeneracionyDespliegueDeResultados", prefix_group_id=False) as group:
            init_task = PythonOperator(
                task_id="MASTER_GENERATION",
                python_callable=get_generate_from_xcom,
                trigger_rule='all_success'
            )

            outputs = Variable.get('GENERATES', default_var=[], deserialize_json=True)
            if outputs:
                tasks = PythonOperator.partial(
                    task_id="GENERATORS",
                    python_callable=generate_and_deploy,
                    op_kwargs={'intern_conn': db_intern_conn, 'params': parameters, 'timezone': timezone}
                ).expand(op_args=[[item] for item in outputs])
                init_task >> tasks
    except Exception as e:
        logger.error(f"Error creando taskGroup de generadores. {e}")
    finally:
        return group
dags/components/Model/InsumoModel.py

...
@@ -8,4 +8,4 @@ class InsumoModel(Base):

    __abstract__ = True

-    INTERN_UUID_BCOM = Column(Integer, primary_key=True)
+    INTERN_ID_BCOM = Column(Integer, primary_key=True, autoincrement=True)
dags/components/S3Route.py

...
@@ -90,21 +90,28 @@ def get_base_date(conn: str, bucket: str, key: str) -> datetime.date:
    return last_date


-def save_df_to_s3(df: pd.DataFrame, conn: str, bucket: str, key: str, delimiter: str = ","):
+def save_df_to_s3(data: pd.DataFrame or str, conn: str, bucket: str, key: str, delimiter: str = ",",
+                  in_memory: bool = True):
    try:
        logger.info(f"SUBIENDO A NUBE KEY {key}")
        file_type = get_type_file(key)
        s3_hook = S3Hook(conn)
        if file_type == FileTypeEnum.EXCEL or file_type == FileTypeEnum.OLD_EXCEL:
+            if in_memory:
                with BytesIO() as buffer:
                    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
-                        df.to_excel(writer, index=None)
+                        data.to_excel(writer, index=None)
                    s3_hook.load_bytes(buffer.getvalue(), key, bucket, True)
+            else:
+                pass
        elif file_type == FileTypeEnum.CSV or file_type == FileTypeEnum.TEXT:
+            if in_memory:
                csv_buffer = BytesIO()
-                df.to_csv(csv_buffer, header=True, index=False, sep=delimiter, na_rep='None')
+                data.to_csv(csv_buffer, header=True, index=False, sep=delimiter, na_rep='None')
                csv_buffer.seek(0)
                s3_hook.load_bytes(csv_buffer.getvalue(), key, bucket, True)
+            else:
+                s3_hook.load_file(data, key, bucket)
    except Exception as e:
        logger.error(f"Error guardando archivos a S3. key: {key}. {e}")
...
dags/components/Timezone.py (new file, 0 → 100644)

import pytz
from datetime import datetime
from dateutil.parser import parse
import logging

logger = logging.getLogger()


def datetime_by_tzone(tzone: str, pattern: str):
    offset = None
    # Algunos casos donde el timezone es de la forma 4:30 y no se encuentra en timezones de pytz (GMT)
    if ":" in tzone:
        offset = tzone.split(":")[1]
        tzone = tzone.split(":")[0]
        if "+" in tzone:
            tzone = tzone.replace(tzone[-1], str(int(tzone[-1]) + 1))
    timezones_list = pytz.all_timezones
    tzones = [x if tzone in x else None for x in timezones_list]
    tzones = list(filter(None, tzones))
    server_timezone = pytz.timezone(tzones[0])
    logger.debug("Zona Horaria : {}".format(server_timezone))
    server_time = server_timezone.localize(datetime.utcnow())
    current_time = parse(server_time.strftime('%Y-%m-%d %H:%M:%S.%f %Z'))
    if offset:
        offset = pytz.FixedOffset((current_time.utcoffset().total_seconds() / 60 + float(offset)) * -1)
        offset = offset.utcoffset(datetime.utcnow())
        current_time = datetime.utcnow() + offset
    else:
        current_time = current_time.replace(tzinfo=None) - current_time.utcoffset()
    current_time = parse(current_time.strftime(pattern))
    logger.debug("Hora actual: {}".format(current_time))
    return current_time
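Illustrative only, not part of the commit: how the DAG would call this helper with the values dags/dag_conf.yml now carries (timezone: 'GMT-5', datetime_pattern: '%Y-%m-%d %H:%M:%S'). 'GMT-5' is matched by substring against pytz's zone names, so it resolves to a fixed-offset zone such as Etc/GMT-5.

from components.Timezone import datetime_by_tzone

# The pattern is the same one substituted for <datetime> in the output filename mask.
stamp = datetime_by_tzone("GMT-5", "%Y-%m-%d %H:%M:%S")
print(stamp)   # a naive datetime shifted to the matched pytz zone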
dags/components/Transformation.py (new file, 0 → 100644)

from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator
from airflow.models import Variable

import logging

logger = logging.getLogger()


def transformations(xcom_commands: str, intern_conn):
    engine = intern_conn.engine
    script_name = xcom_commands[0]
    commands = xcom_commands[1]
    logger.info(f"Ejecutando transformaciones del script {script_name}")
    with engine.connect() as connection:
        for command in commands:
            logger.debug(f"Ejecutando comando de transformación: {command}")
            _ = connection.execute(command)


def get_trans_from_xcom(**kwargs):
    transforms_per_file = []
    final_transforms = []
    task = kwargs['ti']
    xcom_keys = task.xcom_pull(task_ids="SCRIPTS-EXTRACTOR", key="XCOM-EXTRACTION-NAMES")
    for key in xcom_keys:
        if not key.startswith("TRANSFORM"):
            continue
        xcom_transforms = task.xcom_pull(task_ids="SCRIPTS-EXTRACTOR", key=key)
        logger.info(f"Trayendo comandos {xcom_transforms}")
        for transform in xcom_transforms:
            final_transforms.append(transform)
        transforms_per_file.append((key, final_transforms))
    Variable.set(key='TRANSFORMS', value=transforms_per_file, serialize_json=True)


def get_transform_task_group(db_intern_conn) -> TaskGroup or None:
    group = None
    try:
        with TaskGroup(group_id="TransformacionDeDatos", prefix_group_id=False) as group:
            init_task = PythonOperator(
                task_id="MASTER_TRANSFORMATION",
                python_callable=get_trans_from_xcom,
                trigger_rule='all_success'
            )

            transforms = Variable.get('TRANSFORMS', default_var=[], deserialize_json=True)
            if transforms:
                tasks = PythonOperator.partial(
                    task_id="TRANSFORMATIONS",
                    python_callable=transformations,
                    op_kwargs={'intern_conn': db_intern_conn}
                ).expand(op_args=[[item] for item in transforms])
                init_task >> tasks
    except Exception as e:
        logger.error(f"Error creando taskGroup de transformación. {e}")
    finally:
        return group
dags/components/Utils.py

from typing import List, Any, Dict, Tuple
import uuid
import os
import shutil
import pandas as pd

from enums.CatalogConfigurationEnum import CatalogConfigurationEnum
from enums.FileTypeEnum import FileTypeEnum
from enums.CommentsScriptEnum import CommentsScriptEnum
from components.Timezone import datetime_by_tzone

import logging
...
@@ -83,50 +86,92 @@ def update_dict_with_catalogs(data_dict: Dict[str, Any], data: Dict[str, Any], c
    return data_dict


-def update_sql_commands(dataset: List[Tuple[str, str]]) -> List[Tuple[str, List[str]]]:
+def update_sql_commands(dataset: List[Tuple[str, str]], label_tablename: str) -> List[Tuple[str, List[str]]]:
    result = []
    comments = [CommentsScriptEnum[item].value for item in CommentsScriptEnum._member_names_]
    try:
        for row in dataset:
-            data = row[1].split("\n")
-            data = [item.replace("\r", "").replace(";", "") for item in data]
-            data = [item for item in data if item != "" and item[:2] not in comments]
-            result.append((row[0], data))
+            data = row[1].split(";")
+            data = [item.replace("\r", "").replace(";", "").replace("\n", " ") for item in data]
+            add_next = False
+            final_data = []
+            table_name = ""
+            for item in data:
+                final_item = item
+                if item.lower().strip().find(label_tablename.lower().strip() + ":") != -1:
+                    init_index = item.lower().strip().index(label_tablename.lower().strip() + ":")
+                    table_name = item.replace(" ", "").strip()[init_index + 5:].strip()
+                    add_next = True
+                elif item != "":
+                    if add_next:
+                        item = table_name + "|" + item
+                        add_next = False
+                    final_item = item.strip()
+                    table_name = ""
+                if final_item.strip()[:2] in comments and ("update " in final_item.lower() or "delete " in final_item.lower()):
+                    trans_index = final_item.lower().index("update")
+                    final_item = final_item[trans_index:]
+                final_data.append(final_item)
+            final_data = [item.replace("\t", "") for item in final_data
+                          if item != "" and ("select" in item or "update" in item or "delete" in item)]
+            result.append((row[0], final_data))
        logger.info(f"Lista de comandos: {result}")
    except Exception as e:
        raise AssertionError(f"Error extrayendo comandos sql. {e}")
    finally:
        return result


-def get_selects_from_xcom(task) -> List[str]:
-    results = []
-    try:
-        selects = task.xcom_pull(key=None, task_ids="DAG-BCOM-EXTRACT-SCRIPTS")
-        print(selects)
-    except Exception as e:
-        raise AssertionError(f"Error obteniendo Xcom. {e}")
-    finally:
-        return results


def select_multiple(command: str) -> Dict[str, Any]:
    response = {'is_multiple': False, 'tablename': ''}
    tablename = ""
    try:
-        if "join" in command.lower():
+        if command.lower().replace(" ", "").find("|select") != -1:
+            response["is_multiple"] = True
+            tablename = command[:command.index("|")].strip()
        init_index = command.lower().find("from")
        if init_index == -1:
            raise AssertionError("Query malformed")
        else:
            from_command = command[init_index + 4:]
            tablename_base = from_command.strip().split(" ")
-            if len(tablename_base) > 0:
+            if len(tablename_base) > 0 and tablename == "":
                tablename = tablename_base[0]
            else:
                raise AssertionError("Query malformed")
        response["tablename"] = tablename
    except Exception as e:
        raise AssertionError(f"Error validando si es múltiple select y nombre de tabla. {e}")
    finally:
        return response


+def create_temp_file(filename_mask: str, file_type: str, tablename: str, timezone: str, pattern: str) -> str:
+    """ Create an output result as a file with a mask in the name
+    """
+    fullpath = ""
+    try:
+        dir_name = str(uuid.uuid4())
+        path_dir = "/tmp/" + dir_name
+        os.mkdir(path_dir)
+        current_datetime = str(datetime_by_tzone(timezone, pattern))
+        filename_mask = filename_mask.replace("<source_name>", tablename)
+        filename_mask = filename_mask.replace("<datetime>", current_datetime)
+        filename = filename_mask + "." + file_type
+        fullpath = path_dir + "/" + filename
+        open(fullpath, mode='a').close()
+    except Exception as e:
+        logger.error(f"Error creando directorio y archivo temporal.{e}")
+    finally:
+        return fullpath


+def delete_temp_dir(module_name: str) -> bool:
+    drop = False
+    try:
+        if os.path.exists(module_name):
+            directory = os.path.dirname(module_name)
+            shutil.rmtree(directory, ignore_errors=True)
+    except Exception as e:
+        raise AssertionError(f"Error borrando modulo temporal. {e}")
+    finally:
+        return drop
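Illustrative only, not part of the commit: how the outputs block of dags/dag_conf.yml (filename_mask: <source_name>_<datetime>, file_type: txt, timezone, datetime_pattern) would flow through create_temp_file(); the table name used here is hypothetical.

from components.Utils import create_temp_file, delete_temp_dir

# <source_name> and <datetime> are replaced with the table name and the
# timezone-adjusted timestamp from components.Timezone.datetime_by_tzone.
tmp_file = create_temp_file("<source_name>_<datetime>", "txt", "clientes",
                            "GMT-5", "%Y-%m-%d %H:%M:%S")
print(tmp_file)             # e.g. /tmp/<uuid4>/clientes_2023-07-05 10:00:00.txt
delete_temp_dir(tmp_file)   # removes the whole temporary directory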
dags/components/Xcom.py

from airflow.utils.db import provide_session
from airflow.models import XCom
import logging
from typing import List, Tuple

logger = logging.getLogger()


-def save_to_xcom(dataset: List[Tuple[str, List[str]]], task, extract_mask: str, transform_mask: str,
+def save_commands_to_xcom(dataset: List[Tuple[str, List[str]]], task, extract_mask: str, transform_mask: str,
                  order_delimiter: str) -> None:
    final_names_xcom = []
    try:
...
@@ -27,4 +28,30 @@ def save_to_xcom(dataset: List[Tuple[str, List[str]]], task, extract_mask: str,
            final_names_xcom.append(name)
        task.xcom_push(key="XCOM-EXTRACTION-NAMES", value=final_names_xcom)
    except Exception as e:
-        raise AssertionError(f"Error guardando variables en xcom. {e}")
+        raise AssertionError(f"Error guardando comandos en variables en xcom. {e}")


+def save_names_to_xcom(value: str, task, task_name: str, key: str) -> None:
+    final_names_xcom = []
+    try:
+        xcom_names = task.xcom_pull(task_ids=task_name, key=key)
+        if not xcom_names:
+            final_names_xcom.append(value)
+        else:
+            final_names_xcom = xcom_names
+            final_names_xcom.append(value)
+        task.xcom_push(key=key, value=final_names_xcom)
+        logger.info(f"Guardado correctamente los nombres de las tablas")
+    except Exception as e:
+        logger.error(f"Error guardando nombres en variables xcom. {e}")


+def delete_all_xcom_tasks() -> None:
+    try:
+        @provide_session
+        def cleanup_xcom(session=None):
+            session.query(XCom).filter(XCom.task_id == "SCRIPTS-EXTRACTORS").delete()
+        cleanup_xcom()
+    except Exception as e:
+        logger.error(f"Error borrando todas las variables xcom del DAG actual. {e}")
dags/dag_conf.yml

...
@@ -8,7 +8,7 @@ app:
    port: 21521
    username: PRUEBABCOM2
    password: admin
-   database: airflow
+   database: bd_tp_qa
    service: ORCLPDB1
    schema: public
  transformation:
...
@@ -20,6 +20,8 @@ app:
    database: prueba_bcom
    service:
    schema:
+  chunksize: 50000
+  label_multiple_select: TABLE
  source_mask: select
  transformation_mask: transform
  prefix_order_delimiter: .
...
@@ -28,11 +30,13 @@ app:
    bucket: prueba1234568
    prefix: bcom_scripts
    connection_id: conn_script
  timezone: 'GMT-5'
  outputs:
-   filename_mask: <source_name>_<date>_<time>
+   filename_mask: <source_name>_<datetime>
+   datetime_pattern: '%Y-%m-%d %H:%M:%S'
    file_type: txt
    s3_params:
-     bucket: ejemplo
-     prefix: bcom/inputs
-     connection_id: conn_result
+     bucket: prueba1234568
+     prefix: bcom_results
+     connection_id: conn_script
dags/dag_transformacion_bcom.py

-from datetime import datetime, timedelta
+from datetime import datetime
import time
from typing import Any, Dict

from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.task_group import TaskGroup

-from components.Utils import update_sql_commands, get_selects_from_xcom
-from components.Xcom import save_to_xcom
+from components.Utils import update_sql_commands
+from components.Xcom import save_commands_to_xcom
from components.S3Route import get_files_from_prefix
from components.Sensor import create_s3_sensor
-from components.Extractor import get_task_group
+from components.Extractor import get_extract_task_group
+from components.Transformation import get_transform_task_group
+from components.Generation import get_generate_task_group
+from components.Cleaning import get_cleaning_task_group
-from components.Database import Database
+from components.Databases.Database import Database
import logging

logger = logging.getLogger()

-DAG_NAME = "BCOM_DAG_TRANSFORMACIONES"
+DAG_NAME = "BCOM_DAG_TRANSFORMACIONES2"

+# TASK_NAMES = {'SENSORS': 'SCRIPTS_SENSORS', 'SCRIPT-EXTRACTION': 'SCRIPTS_EXTRACTOR', 'EXTRACTION': 'ExtraccionDeDatos',
+#               'TRANSFORMATION': 'TransformacionDeDatos', ''}

DEFAULT_ARGS = {
    'owner': 'BCOM',
    "start_date": datetime(2023, 5, 25, 22, 9),
    'depends_on_past': False,
    'email': 'caguirre@bytesw.com',
-    'retries': 1,
+    'retries': 0,
    'email_on_retry': False,
    'email_on_failure': False
}


-def extraction(source_conn, intern_conn) -> TaskGroup:
+def cleaning(intern_conn) -> TaskGroup:
    groups = None
    try:
-        groups = get_task_group(source_conn, intern_conn)
+        groups = get_cleaning_task_group(intern_conn)
    except Exception as e:
        logger.error(f"Error general de transformación de datos. {e}")
    finally:
        return groups


+def generate_and_deploy_results(intern_conn, parameters: Dict[str, Any], timezone: str) -> TaskGroup:
+    groups = None
+    try:
+        groups = get_generate_task_group(intern_conn, parameters, timezone)
+    except Exception as e:
+        logger.error(f"Error general de creación y despliegue de resultados. {e}")
+    finally:
+        return groups


+def transformation(intern_conn) -> TaskGroup:
+    groups = None
+    try:
+        groups = get_transform_task_group(intern_conn)
+    except Exception as e:
+        logger.error(f"Error general de transformación de datos. {e}")
+    finally:
+        return groups


+def extraction(source_conn, intern_conn, chunksize: int = 100000) -> TaskGroup:
+    groups = None
+    try:
+        groups = get_extract_task_group(source_conn, intern_conn, chunksize)
+    except Exception as e:
+        logger.error(f"Error general de extracción de datos. {e}")
+    finally:
...
@@ -39,13 +76,13 @@ def extraction(source_conn, intern_conn) -> TaskGroup:
def extract_scripts(conn_id: str, bucket: str, prefix: str, source_mask: str, transform_mask: str,
-                    order_delimiter: str, **kwargs):
+                    order_delimiter: str, label_tablename: str, **kwargs):
    try:
        start_time = time.time()
        logger.info(f"EXTRAYENDO SCRIPTS DESDE {bucket}/{prefix}")
        scripts = get_files_from_prefix(conn_id, bucket, prefix)
-        scripts = update_sql_commands(scripts)
-        save_to_xcom(scripts, kwargs['ti'], source_mask, transform_mask, order_delimiter)
+        scripts = update_sql_commands(scripts, label_tablename)
+        save_commands_to_xcom(scripts, kwargs['ti'], source_mask, transform_mask, order_delimiter)
        logger.debug(f"Script cargados en Xcom: {scripts}")
        logger.info(f"Tiempo del Task de descarga de scripts: {round(time.time() - start_time, 3)} segundos")
    except Exception as e:
...
@@ -72,17 +109,17 @@ def set_dag():
        wildcard_scripts = scripts_s3["prefix"] + "?*"
    else:
        wildcard_scripts = scripts_s3["prefix"] + "/?*"
-    sensor_scripts = create_s3_sensor("sensor_for_scripts", scripts_s3["connection_id"], scripts_s3["bucket"],
+    sensor_scripts = create_s3_sensor("SCRIPTS-SENSOR", scripts_s3["connection_id"], scripts_s3["bucket"],
                                      wildcard_scripts)
    extract_mask = conf["source_mask"]
    transform_mask = conf["transformation_mask"]
    order_delimiter = conf["prefix_order_delimiter"]
    script_extractor = PythonOperator(
-        task_id="DAG-BCOM-EXTRACT-SCRIPTS",
+        task_id="SCRIPTS-EXTRACTOR",
        python_callable=extract_scripts,
        op_kwargs={'conn_id': scripts_s3["connection_id"], 'bucket': scripts_s3["bucket"],
                   'prefix': scripts_s3["prefix"], 'source_mask': extract_mask,
-                   'transform_mask': transform_mask, 'order_delimiter': order_delimiter},
+                   'transform_mask': transform_mask, 'order_delimiter': order_delimiter,
+                   'label_tablename': conf["label_multiple_select"]},
        trigger_rule="all_success"
    )

    # Source Database configuration. Only we use 1 source
...
@@ -99,9 +136,22 @@ def set_dag():
                       intern_params["service"], intern_params["schema"])
    intern_db.create_engine()

-    extractions = extraction(source_db, intern_db)
+    # Creación de grupo de tasks para las extracciones
+    chunksize = conf["chunksize"]
+    extractions = extraction(source_db, intern_db, chunksize)
+
+    # Creación de grupo de tasks para las transformaciones
+    transformations = transformation(intern_db)
+
+    # Creación de grupo de tasks para la generación y despliegue de archivos resultados
+    outputs_conf = conf["outputs"]
+    timezone = conf["timezone"]
+    result = generate_and_deploy_results(intern_db, outputs_conf, timezone)
+
+    # Creación de tasks de limpiadores
+    cleaners = cleaning(intern_db)

-    sensor_scripts >> script_extractor >> extractions
+    sensor_scripts >> script_extractor >> extractions >> transformations >> result >> cleaners
    return dag
...