Proyectos-Innovacion-2024 / BCOM-Components-Innovation-Tests / Commits / 805f468e

Commit 805f468e, authored Apr 02, 2024 by Cristian Aguirre
Parent: a78f4f94

fix bug commission

Showing 10 changed files with 242 additions and 125 deletions (+242 / -125)
Enum/DatabaseTypeEnum.py        +8    -0
Pipeline/CommissionProcess.py   +84   -59
Pipeline/ETLProcess.py          +39   -16
Utils/SparkUtils.py             +12   -9
commission.py                   +7    -2
commission_2.py                 +12   -8
config.json                     +18   -10
config2.json                    +10   -2
etl.py                          +34   -14
etl_2.py                        +18   -5
Enum/DatabaseTypeEnum.py (new file, 0 → 100644)

from enum import Enum


class DatabaseTypeEnum(Enum):
    MONGODB = "mongodb"
    MYSQL = "mysql"
    REDSHIFT = "redshift"
    STARROKS = "starroks"
Pipeline/CommissionProcess.py

This diff is collapsed.
Pipeline/ETLProcess.py

from typing import Dict, Any
import logging

from pyspark.sql.functions import col, when, lit, to_date, date_format, date_add
from pyspark.sql.types import StructType, StructField, StringType
from prefect import task

from Enum.DataTypeEnum import DataTypeEnum
from Enum.DatabaseTypeEnum import DatabaseTypeEnum
from Enum.InputTypeEnum import InputTypeEnum
from Utils.SparkUtils import createSession
from Input.Source import Input

...
@@ -20,8 +21,8 @@ class ETLProcess:

        self.inputs = {}

-    def init(self, spark_jars: Dict[str, str]) -> None:
-        self.session = createSession(self.identifier, spark_jars)
+    def init(self, spark_jars: Dict[str, str], source_type: InputTypeEnum = InputTypeEnum.BUCKET) -> None:
+        self.session = createSession(self.identifier, spark_jars, source_type)

    @task
    def reader(self) -> None:

...
@@ -124,19 +125,41 @@ class ETLProcess:

        return success

    @task
-    def write(self, identifier: str, starroks_jdbc: str, starroks_fe: str, prev_status: bool = True) -> None:
+    def write(self, identifier: str, starroks_jdbc: str, starroks_fe: str, prev_status: bool = True,
+              db_type: DatabaseTypeEnum = DatabaseTypeEnum.REDSHIFT, redshift_url: str = "",
+              mysql_url: str = "") -> None:
        try:
-            database = starroks_jdbc[starroks_jdbc.rfind("/") + 1:]
-            starroks_user = self.conf["starroks"]["user"]
-            starroks_pass = self.conf["starroks"]["password"]
-            self.inputs[identifier].write.format("starrocks") \
-                .option("starrocks.fe.http.url", starroks_fe) \
-                .option("starrocks.fe.jdbc.url", starroks_jdbc) \
-                .option("starrocks.table.identifier", database + "." + identifier) \
-                .option("starrocks.user", starroks_user) \
-                .option("starrocks.password", starroks_pass) \
-                .mode("append") \
-                .save()
+            if db_type == DatabaseTypeEnum.REDSHIFT:
+                self.inputs[identifier].coalesce(45).write \
+                    .format("jdbc") \
+                    .option("driver", "com.amazon.redshift.jdbc42.Driver") \
+                    .option("url", redshift_url) \
+                    .option("dbtable", identifier) \
+                    .option("user", "awsuser") \
+                    .option("password", "Awsuser123") \
+                    .mode("append") \
+                    .save()
+            elif db_type == DatabaseTypeEnum.MYSQL:
+                self.inputs[identifier].write \
+                    .format("jdbc") \
+                    .option("driver", "com.mysql.cj.jdbc.Driver") \
+                    .option("url", mysql_url) \
+                    .option("dbtable", identifier) \
+                    .option("user", "root") \
+                    .option("password", "root") \
+                    .mode("append") \
+                    .save()
+            else:
+                database = starroks_jdbc[starroks_jdbc.rfind("/") + 1:]
+                starroks_user = self.conf["starroks"]["user"]
+                starroks_pass = self.conf["starroks"]["password"]
+                self.inputs[identifier].write.format("starrocks") \
+                    .option("starrocks.fe.http.url", starroks_fe) \
+                    .option("starrocks.fe.jdbc.url", starroks_jdbc) \
+                    .option("starrocks.table.identifier", database + "." + identifier) \
+                    .option("starrocks.user", starroks_user) \
+                    .option("starrocks.password", starroks_pass) \
+                    .mode("append") \
+                    .save()
        except Exception as e:
            logger.error(f"Error guardando resultados. {e}")
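
A minimal call sketch for the widened write signature (not taken from the diff): it assumes etl_process is an already-initialised ETLProcess with a "VENTAS" dataframe registered in self.inputs, and the URLs are placeholders. As in etl.py, the instance is passed explicitly because write is a Prefect task.

from Enum.DatabaseTypeEnum import DatabaseTypeEnum

etl_process.write(
    etl_process,                                   # instance passed explicitly, as the flows do
    "VENTAS",
    "jdbc:mysql://192.168.1.37:9030/bcom_spark",   # StarRocks JDBC URL (placeholder)
    "192.168.1.37:8030",                           # StarRocks FE node (placeholder)
    prev_status=True,
    db_type=DatabaseTypeEnum.MYSQL,                # route this write to MySQL instead of Redshift
    mysql_url="jdbc:mysql://localhost:13306/bcom_spark",
)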
Utils/SparkUtils.py

from typing import Dict
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType
import logging

from Enum.InputTypeEnum import InputTypeEnum

logger = logging.getLogger()
-def createSession(name: str, spark_jars: Dict[str, str]) -> SparkSession:
+def createSession(name: str, spark_jars: Dict[str, str], source_type: InputTypeEnum) -> SparkSession:
    session = None
    try:
        jars = list(spark_jars.values())

...
@@ -18,17 +18,20 @@ def createSession(name: str, spark_jars: Dict[str, str]) -> SparkSession:

            .appName(name) \
            .config("spark.jars", jars) \
            .config("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.4-s_2.12") \
            .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain") \
            .config("spark.executor.extraClassPath", jars) \
            .config("spark.driver.extraClassPath", jars) \
            .config("spark.starrocks.driver", "com.starroks.jdbc.Driver") \
            .config("spark.sql.catalogImplementation", "in-memory") \
            .getOrCreate()
        session._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
-        session._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "http://192.168.21.47:9000")
-        session._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "false")
-        session._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
-        session._jsc.hadoopConfiguration().set("fs.s3a.access.key", "minioadmin")
-        session._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "minioadmin")
+        if source_type == InputTypeEnum.LOCAL:
+            session._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "http://192.168.21.47:9000")
+            session._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "false")
+            session._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
+            session._jsc.hadoopConfiguration().set("fs.s3a.access.key", "minioadmin")
+            session._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "minioadmin")
    except Exception as e:
        logger.error(f"Error creando sesion. {e}")
    finally:

...

@@ -70,5 +73,5 @@ def find_related_vertices(graph):

            dfs(vertex_id, related_vertices)
            # Agregar los vértices relacionados al diccionario
            related_vertices_dict[vertex_id] = list(related_vertices)
            related_vertices_dict[vertex_id].remove(vertex_id)

    return related_vertices_dict
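
A short usage sketch of the new source_type parameter (not part of the commit): with InputTypeEnum.LOCAL the MinIO endpoint and credentials are applied on top of the base S3A setting, while other values leave only the defaults from the builder config. The SPARK_JARS dict below is a trimmed placeholder for the one defined in the flows.

from Enum.InputTypeEnum import InputTypeEnum
from Utils.SparkUtils import createSession

SPARK_JARS = {"MYSQL": "/opt/spark-jars/mysql-connector-java-8.0.30.jar"}  # trimmed placeholder

# Hypothetical: session pointed at the local MinIO endpoint
local_session = createSession("etl-local-test", SPARK_JARS, InputTypeEnum.LOCAL)

# Hypothetical: session for a real bucket; MinIO-specific overrides are skipped
bucket_session = createSession("etl-bucket", SPARK_JARS, InputTypeEnum.BUCKET)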
commission.py

...

@@ -5,6 +5,7 @@ from typing import Any, Dict

from prefect import flow, get_run_logger
from Pipeline.CommissionProcess import CommissionProcess
+from Enum.DatabaseTypeEnum import DatabaseTypeEnum

SPARK_JARS = {
    "STARROK": "/opt/spark-jars/starrocks-spark-connector-3.2_2.12-1.1.2.jar",

...
@@ -14,6 +15,10 @@ SPARK_JARS = {

STARROK_JDBC = "jdbc:mysql://192.168.1.37:9030/bcom_spark"
STARROK_FE_NODE = "192.168.1.37:8030"
+REDSHIFT_JDBC = "jdbc:redshift://redshift-cluster-1.cumpswji5bs3.us-east-1.redshift.amazonaws.com:5439/dev?currentSchema=prueba_ca"
+
+DB_TYPE = DatabaseTypeEnum.REDSHIFT

@flow()
def run_commission(config: Dict[str, Any]) -> None:

...
@@ -29,7 +34,7 @@ def run_commission(config: Dict[str, Any]) -> None:

    # Primer task - Extraer la data - RECORDAR: SPARK ES LAZY!!!
    start_reader = time.time()
-    commission_process.get_inputs(commission_process, STARROK_JDBC, STARROK_FE_NODE)
+    commission_process.get_inputs(commission_process, DB_TYPE, STARROK_JDBC, STARROK_FE_NODE, REDSHIFT_JDBC)
    logger.info(f"Duración de extracción de datos desde la BD: {time.time() - start_reader}")

    # Tercer task - Obtener metas

...
@@ -37,7 +42,7 @@ def run_commission(config: Dict[str, Any]) -> None:

    goals = commission_process.get_goals(commission_process, "VENTAS", "GOALS")

    # Quinto task - Obtener ejecutados - ¿Aplicar tmb filtro de FLAG_COMISIONABLE y ACTIVE_USER_TRAFFIC?
-    executes = commission_process.get_executed(commission_process, "VENTAS", "TEAMS")
+    executes = commission_process.get_executed(commission_process, "VENTAS", "DEVICES")

    # Sexo task - Obtener monto origen
    base = commission_process.get_source_value(commission_process, "VENTAS", "COMERCIAL_BASE")

...
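
Pipeline/CommissionProcess.py is collapsed above, so the updated get_inputs body is not visible here; as a rough sketch only, the db_type dispatch it now receives presumably mirrors the branching added to ETLProcess.write. Every name below except DatabaseTypeEnum is an assumption for illustration.

from Enum.DatabaseTypeEnum import DatabaseTypeEnum

def read_table(spark, db_type, identifier, starroks_jdbc, redshift_url="", mysql_url=""):
    # Hypothetical reader dispatch: choose the JDBC source per db_type (credentials omitted for brevity).
    if db_type == DatabaseTypeEnum.REDSHIFT:
        url, driver = redshift_url, "com.amazon.redshift.jdbc42.Driver"
    elif db_type == DatabaseTypeEnum.MYSQL:
        url, driver = mysql_url, "com.mysql.cj.jdbc.Driver"
    else:
        url, driver = starroks_jdbc, "com.mysql.cj.jdbc.Driver"  # StarRocks speaks the MySQL protocol
    return (spark.read.format("jdbc")
            .option("driver", driver)
            .option("url", url)
            .option("dbtable", identifier)
            .load())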
commission_2.py

...

@@ -5,6 +5,7 @@ from typing import Any, Dict

from prefect import flow, get_run_logger
from Pipeline.CommissionProcess import CommissionProcess
+from Enum.DatabaseTypeEnum import DatabaseTypeEnum

SPARK_JARS = {
    "STARROK": "/opt/spark-jars/starrocks-spark-connector-3.2_2.12-1.1.2.jar",

...
@@ -14,6 +15,12 @@ SPARK_JARS = {

STARROK_JDBC = "jdbc:mysql://192.168.1.37:9030/bcom_spark"
STARROK_FE_NODE = "192.168.1.37:8030"
+REDSHIFT_JDBC = "jdbc:redshift://redshift-cluster-1.cumpswji5bs3.us-east-1.redshift.amazonaws.com:5439/dev?currentSchema=prueba_ca"
+MYSQL_JDBC = "jdbc:mysql://localhost:13306/bcom_spark"
+
+DB_TYPE = DatabaseTypeEnum.MYSQL

@flow()
def run_commission(config: Dict[str, Any]) -> None:

...
@@ -29,7 +36,8 @@ def run_commission(config: Dict[str, Any]) -> None:

    # Primer task - Extraer la data - RECORDAR: SPARK ES LAZY!!!
    start_reader = time.time()
-    commission_process.get_inputs(commission_process, STARROK_JDBC, STARROK_FE_NODE)
+    commission_process.get_inputs(commission_process, DB_TYPE, STARROK_JDBC, STARROK_FE_NODE,
+                                  REDSHIFT_JDBC, MYSQL_JDBC)
    logger.info(f"Duración de extracción de datos desde la BD: {time.time() - start_reader}")

    # Tercer task - Obtener metas

...
@@ -37,20 +45,15 @@ def run_commission(config: Dict[str, Any]) -> None:

    goals = commission_process.get_goals_2(commission_process, "GOALS", "ESTRUCTURA_ORGANIZACIONAL")

    # Quinto task - Obtener ejecutados - ¿Aplicar tmb filtro de FLAG_COMISIONABLE y ACTIVE_USER_TRAFFIC?
-    executes = commission_process.get_executed_2(commission_process, "ESTRUCTURA_ORGANIZACIONAL", "TEAMS", "VENTAS")
+    # executes = commission_process.get_executed_2(commission_process, "ESTRUCTURA_ORGANIZACIONAL", "DEVICES", "VENTAS")

    # Sexo task - Obtener monto origen
    base = commission_process.get_source_value_2(commission_process, "ESTRUCTURA_ORGANIZACIONAL", "COMERCIAL_BASE")

    # Segundo task - Crear jerarquía
    start_process = time.time()
    # ["AGENTES", "ESTRUCTURA", "UO", "OGRANIZACIONES"]
    identifiers = ["INDIVIDUOS", "ESTRUCTURA_ORGANIZACIONAL", "UNIDAD", "ORGANIZACION"]
    jerarquia_graph = commission_process.create_jerarquia(commission_process, identifiers, goals, executes, base)
    logger.info(f"Duración de creación de dataframes con grafos (jerarquía): {time.time() - start_process}")

    result = commission_process.update_executes(commission_process, jerarquia_graph, goals, executes, base)
    result = commission_process.get_commission_per_agent_2(commission_process, result)

...
@@ -58,7 +61,8 @@ def run_commission(config: Dict[str, Any]) -> None:

    # Task de escritura
    start_load = time.time()
-    _ = commission_process.write_result(commission_process, result, "REPORT_SUMMARY", STARROK_JDBC, STARROK_FE_NODE)
+    _ = commission_process.write_result(commission_process, result, "REPORT_SUMMARY", DB_TYPE, STARROK_JDBC,
+                                        STARROK_FE_NODE, REDSHIFT_JDBC, MYSQL_JDBC)
    logger.info(f"Duración de carga del reporte a la BD: {time.time() - start_load}")

    logger.info(f"Duración de ejecución del proceso de comision: {time.time() - start_time}")

...
config.json

...

@@ -9,7 +9,7 @@

  "data": [
    {
      "identifier": "VENTAS",
-      "path": "s3a://prueba-id/inputs_spark/gross_202311.txt",
+      "path": "s3a://prueba-id2/bcom-tests/inputs/gross_202311.txt",
      "input_type": "txt",
      "separator": "|",
      "schema": {

...

@@ -29,8 +29,8 @@

      }
    },
    {
-      "identifier": "TEAMS",
-      "path": "s3a://prueba-id/inputs_spark/equipos_202311.txt",
+      "identifier": "DEVICES",
+      "path": "s3a://prueba-id2/bcom-tests/inputs/equipos_202311.txt",
      "input_type": "txt",
      "separator": "|",
      "schema": {

...
@@ -45,7 +45,7 @@

    },
    {
      "identifier": "GOALS",
-      "path": "s3a://prueba-id/inputs_spark/metas_202311.csv",
+      "path": "s3a://prueba-id2/bcom-tests/inputs/metas_202311.csv",
      "input_type": "csv",
      "separator": ";",
      "schema": {

...

@@ -58,7 +58,7 @@

    },
    {
      "identifier": "COMERCIAL_BASE",
-      "path": "s3a://prueba-id/inputs_spark/planta_comercial_202311.csv",
+      "path": "s3a://prueba-id2/bcom-tests/inputs/planta_comercial_202311.csv",
      "input_type": "csv",
      "separator": ";",
      "schema": {

...

@@ -70,7 +70,7 @@

    },
    {
      "identifier": "INDIVIDUOS",
-      "path": "s3a://prueba-id/inputs_spark/individuos_2023111813.csv",
+      "path": "s3a://prueba-id2/bcom-tests/inputs/individuos_2023111813.csv",
      "input_type": "csv",
      "separator": ";",
      "schema": {

...

@@ -89,7 +89,7 @@

    },
    {
      "identifier": "ROLES",
-      "path": "s3a://prueba-id/inputs_spark/roles_2023111812.csv",
+      "path": "s3a://prueba-id2/bcom-tests/inputs/roles_2023111812.csv",
      "input_type": "csv",
      "separator": ";",
      "schema": {

...

@@ -99,7 +99,7 @@

    },
    {
      "identifier": "ORGANIZACION",
-      "path": "s3a://prueba-id/inputs_spark/organizaciones_2023111813.csv",
+      "path": "s3a://prueba-id2/bcom-tests/inputs/organizaciones_2023111813.csv",
      "input_type": "csv",
      "separator": ";",
      "schema": {

...
@@ -121,7 +121,7 @@

    },
    {
      "identifier": "UNIDAD",
-      "path": "s3a://prueba-id/inputs_spark/unidades_organizacionales_2023111812.csv",
+      "path": "s3a://prueba-id2/bcom-tests/inputs/unidades_organizacionales_2023111812.csv",
      "input_type": "csv",
      "separator": ";",
      "schema": {

...

@@ -139,7 +139,7 @@

    },
    {
      "identifier": "ESTRUCTURA_ORGANIZACIONAL",
-      "path": "s3a://prueba-id/inputs_spark/estructura_organizacional_2023111812.csv",
+      "path": "s3a://prueba-id2/bcom-tests/inputs/estructura_organizacional_2023111812.csv",
      "input_type": "csv",
      "separator": ";",
      "schema": {

...

@@ -154,5 +154,13 @@

  "starroks": {
    "user": "root",
    "password": ""
  },
+  "redshift": {
+    "user": "awsuser",
+    "password": "Awsuser123"
+  },
+  "mysql": {
+    "user": "root",
+    "password": "root"
+  }
}
\ No newline at end of file
config2.json

...

@@ -8,7 +8,7 @@

  "data": [
    {
      "identifier": "FACTURACION",
-      "path": "s3a://prueba-id/bcom-tests/inputs/Facturacion_20240320.csv",
+      "path": "s3a://prueba-id2/bcom-tests/inputs/Facturacion_20240320.csv",
      "input_type": "csv",
      "separator": ";",
      "schema": {

...

@@ -24,7 +24,7 @@

    },
    {
      "identifier": "ENDING",
-      "path": "s3a://prueba-id/bcom-tests/inputs/Ending_20240320.csv",
+      "path": "s3a://prueba-id2/bcom-tests/inputs/Ending_20240320.csv",
      "input_type": "csv",
      "separator": ";",
      "schema": {

...

@@ -42,5 +42,13 @@

  "starroks": {
    "user": "root",
    "password": ""
  },
+  "redshift": {
+    "user": "awsuser",
+    "password": "Awsuser123"
+  },
+  "mysql": {
+    "user": "admin",
+    "password": "awsuser123"
+  }
}
\ No newline at end of file
etl.py

...

@@ -2,6 +2,8 @@ import time

import json
from typing import Any, Dict
from prefect import flow, get_run_logger
+from Enum.DatabaseTypeEnum import DatabaseTypeEnum
+from Enum.InputTypeEnum import InputTypeEnum
from Pipeline.ETLProcess import ETLProcess

...
@@ -12,12 +14,21 @@ SPARK_JARS = {

    "COMMON": "/opt/spark-jars/hadoop-common-3.3.4.jar",
    "AWS_CLIENT": "/opt/spark-jars/hadoop-client-3.3.4.jar",
    "STARROK": "/opt/spark-jars/starrocks-spark-connector-3.2_2.12-1.1.2.jar",
-    "MYSQL": "/opt/spark-jars/mysql-connector-java-8.0.30.jar"
+    "MYSQL": "/opt/spark-jars/mysql-connector-java-8.0.30.jar",
+    "REDSHIFT": "/opt/spark-jars/redshift-jdbc42-2.1.0.12.jar"
}

STARROK_JDBC = "jdbc:mysql://192.168.1.37:9030/bcom_spark"
STARROK_FE_NODE = "192.168.1.37:8030"
+REDSHIFT_JDBC = "jdbc:redshift://redshift-cluster-1.cumpswji5bs3.us-east-1.redshift.amazonaws.com:5439/dev?currentSchema=prueba_ca"
+MYSQL_JDBC = "jdbc:mysql://localhost:13306/bcom_spark"
+
+DB_TYPE = DatabaseTypeEnum.MYSQL
+SOURCE_TYPE = InputTypeEnum.BUCKET

@flow
def run_etl(config: Dict[str, Any]) -> None:

...
@@ -29,7 +40,7 @@ def run_etl(config: Dict[str, Any]) -> None:

    # Conexion a Spark (LocalMode, StandAlone or Clúster)
    start_init = time.time()
-    etl_process.init(SPARK_JARS)
+    etl_process.init(SPARK_JARS, SOURCE_TYPE)
    logger.info(f"Duración de creación de sesión Spark: {time.time() - start_init}")

    # Primer task - (Reader) - Extraer los ficheros

...
@@ -42,31 +53,40 @@ def run_etl(config: Dict[str, Any]) -> None:

    etl_process.set_schema(etl_process)

    # Process - Insumo Gross (Ventas)
-    ventas_flag = etl_process.process_gross.submit(etl_process, "VENTAS")
+    ventas_flag = etl_process.process_gross(etl_process, "VENTAS")
    # Process - Insumo Team (Equipos)
-    teams_flag = etl_process.process_teams.submit(etl_process, "TEAMS")
+    teams_flag = etl_process.process_teams(etl_process, "DEVICES")
    logger.info(f"Duración de transformación y limpieza de datos: {time.time() - start_transform}")

    # Write - Insumo GROSS
    start_load = time.time()
-    etl_process.write.submit(etl_process, "VENTAS", STARROK_JDBC, STARROK_FE_NODE, ventas_flag)
-    # Write - Insumo TEAMS
-    etl_process.write.submit(etl_process, "TEAMS", STARROK_JDBC, STARROK_FE_NODE, teams_flag)
+    etl_process.write(etl_process, "VENTAS", STARROK_JDBC, STARROK_FE_NODE, ventas_flag, DB_TYPE, REDSHIFT_JDBC, MYSQL_JDBC)
+    # Write - Insumo DEVICES
+    etl_process.write(etl_process, "DEVICES", STARROK_JDBC, STARROK_FE_NODE, teams_flag, DB_TYPE, REDSHIFT_JDBC, MYSQL_JDBC)
    # Write - Insumo GOALS
-    etl_process.write.submit(etl_process, "GOALS", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "GOALS", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
    # Write - Insumo PLANTA
-    etl_process.write.submit(etl_process, "COMERCIAL_BASE", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "COMERCIAL_BASE", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
    # Write - Insumo INDIVIDUOS
-    etl_process.write.submit(etl_process, "INDIVIDUOS", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "INDIVIDUOS", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
    # Write - Insumo ROLES
-    etl_process.write.submit(etl_process, "ROLES", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "ROLES", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
    # Write - Insumo ORGANIZACION
-    etl_process.write.submit(etl_process, "ORGANIZACION", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "ORGANIZACION", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
    # Write - Insumo UNIDADES
-    etl_process.write.submit(etl_process, "UNIDAD", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "UNIDAD", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
    # Write - Insumo ESTRUCTURA
-    etl_process.write.submit(etl_process, "ESTRUCTURA_ORGANIZACIONAL", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "ESTRUCTURA_ORGANIZACIONAL", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
    logger.info(f"Duración de carga de datos a la BD: {time.time() - start_load}")

    logger.info(f"Duración de ejecución del proceso ETL General: {time.time() - start_time}")

...
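
Side note on the .submit(...) to direct-call change above: under Prefect 2.x task semantics, calling a task directly inside a flow runs it sequentially and returns its value, while .submit() schedules it and returns a PrefectFuture. A standalone toy illustration (not repository code):

from prefect import flow, task

@task
def double(x: int) -> int:
    return x * 2

@flow
def demo() -> None:
    direct = double(2)              # runs inline, returns 4
    future = double.submit(3)       # returns a PrefectFuture
    print(direct, future.result())  # .result() blocks until the submitted run finishes

if __name__ == "__main__":
    demo()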
etl_2.py

...

@@ -4,6 +4,8 @@ from typing import Any, Dict

from prefect import flow, get_run_logger
from Pipeline.ETLProcess import ETLProcess
+from Enum.DatabaseTypeEnum import DatabaseTypeEnum
+from Enum.InputTypeEnum import InputTypeEnum

SPARK_JARS = {

...
@@ -12,12 +14,21 @@ SPARK_JARS = {

    "COMMON": "/opt/spark-jars/hadoop-common-3.3.4.jar",
    "AWS_CLIENT": "/opt/spark-jars/hadoop-client-3.3.4.jar",
    "STARROK": "/opt/spark-jars/starrocks-spark-connector-3.2_2.12-1.1.2.jar",
-    "MYSQL": "/opt/spark-jars/mysql-connector-java-8.0.30.jar"
+    "MYSQL": "/opt/spark-jars/mysql-connector-java-8.0.30.jar",
+    "REDSHIFT": "/opt/spark-jars/redshift-jdbc42-2.1.0.12.jar"
}

STARROK_JDBC = "jdbc:mysql://192.168.1.37:9030/bcom_spark"
STARROK_FE_NODE = "192.168.1.37:8030"
+REDSHIFT_JDBC = "jdbc:redshift://redshift-cluster-1.cumpswji5bs3.us-east-1.redshift.amazonaws.com:5439/dev?currentSchema=prueba_ca"
+MYSQL_JDBC = "jdbc:mysql://localhost:13306/bcom_spark"
+
+DB_TYPE = DatabaseTypeEnum.MYSQL
+SOURCE_TYPE = InputTypeEnum.BUCKET

@flow
def run_etl(config: Dict[str, Any]) -> None:

...
@@ -29,7 +40,7 @@ def run_etl(config: Dict[str, Any]) -> None:

    # Conexion a Spark (LocalMode, StandAlone or Clúster)
    start_init = time.time()
-    etl_process.init(SPARK_JARS)
+    etl_process.init(SPARK_JARS, SOURCE_TYPE)
    logger.info(f"Duración de creación de sesión Spark: {time.time() - start_init}")

    # Primer task - (Reader) - Extraer los ficheros

...
@@ -46,10 +57,12 @@ def run_etl(config: Dict[str, Any]) -> None:

    logger.info(f"Duración de transformación y limpieza de datos: {time.time() - start_transform}")

    start_load = time.time()
-    # Write - Insumo TEAMS
-    etl_process.write(etl_process, "FACTURACION", STARROK_JDBC, STARROK_FE_NODE, teams_fact)
+    # Write - Insumo DEVICES
+    etl_process.write(etl_process, "FACTURACION", STARROK_JDBC, STARROK_FE_NODE, teams_fact, DB_TYPE, REDSHIFT_JDBC, MYSQL_JDBC)
    # Write - Insumo GOALS
-    etl_process.write(etl_process, "ENDING", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "ENDING", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
    logger.info(f"Duración de carga de datos a la BD: {time.time() - start_load}")

    logger.info(f"Duración de ejecución del proceso ETL General: {time.time() - start_time}")

...