Proyectos-Innovacion-2024 / BCOM-Components-Innovation-Tests · Commits

Commit 805f468e, authored Apr 02, 2024 by Cristian Aguirre
fix bug commission
parent a78f4f94

Showing 10 changed files with 242 additions and 125 deletions (+242 −125)
Enum/DatabaseTypeEnum.py        +8   -0
Pipeline/CommissionProcess.py   +84  -59
Pipeline/ETLProcess.py          +39  -16
Utils/SparkUtils.py             +12  -9
commission.py                   +7   -2
commission_2.py                 +12  -8
config.json                     +18  -10
config2.json                    +10  -2
etl.py                          +34  -14
etl_2.py                        +18  -5
Enum/DatabaseTypeEnum.py (new file, 0 → 100644)

from enum import Enum


class DatabaseTypeEnum(Enum):
    MONGODB = "mongodb"
    MYSQL = "mysql"
    REDSHIFT = "redshift"
    STARROKS = "starroks"
Pipeline/CommissionProcess.py

@@ -3,12 +3,11 @@ from typing import Dict, Any, List
 from prefect import task
 import time
 from pyspark.sql import DataFrame
-from pyspark.sql.functions import when, lit, sum as py_sum, count, coalesce
+from pyspark.sql.functions import when, lit, sum as py_sum, count, coalesce, expr
 from graphframes import GraphFrame
-from pyspark.sql import functions as F
-from pyspark.sql.window import Window
 from Utils.SparkUtils import createSession, find_related_vertices
+from Enum.DatabaseTypeEnum import DatabaseTypeEnum

 logger = logging.getLogger()
...
@@ -25,23 +24,48 @@ class CommissionProcess:
         self.inter_results = {}

     def init(self, spark_jars: Dict[str, str]) -> None:
-        self.session = createSession(self.identifier, spark_jars)
+        self.session = createSession(self.identifier, spark_jars, "")

     @task
-    def get_inputs(self, starroks_jdbc: str, starroks_fe: str) -> None:
+    def get_inputs(self, db_type: DatabaseTypeEnum, starroks_jdbc: str, starroks_fe: str, redshift_url: str = "", mysql_url: str = "") -> None:
         try:
             inputs = self.conf["inputs"]
             database = starroks_jdbc[starroks_jdbc.rfind("/") + 1:]
-            starroks_user = self.conf["starroks"]["user"]
-            starroks_pass = self.conf["starroks"]["password"]
             for input_obj in inputs["data"]:
                 identifier = input_obj["identifier"]
-                df_input = self.session.read.format("starrocks"). \
-                    option("starrocks.table.identifier", database + "." + identifier). \
-                    option("starrocks.fe.http.url", starroks_fe). \
-                    option("starrocks.fe.jdbc.url", starroks_jdbc). \
-                    option("starrocks.user", starroks_user). \
-                    option("starrocks.password", starroks_pass). \
-                    load()
+                if db_type == DatabaseTypeEnum.REDSHIFT:
+                    redshift_user = self.conf["redshift"]["user"]
+                    redshift_pass = self.conf["redshift"]["password"]
+                    df_input = self.session.read \
+                        .format("jdbc") \
+                        .option("driver", "com.amazon.redshift.jdbc42.Driver") \
+                        .option("url", redshift_url) \
+                        .option("dbtable", identifier) \
+                        .option("user", redshift_user) \
+                        .option("password", redshift_pass) \
+                        .load()
+                elif db_type == DatabaseTypeEnum.MYSQL:
+                    mysql_user = self.conf["mysql"]["user"]
+                    mysql_pass = self.conf["mysql"]["password"]
+                    df_input = self.session.read \
+                        .format("jdbc") \
+                        .option("driver", "com.mysql.cj.jdbc.Driver") \
+                        .option("url", mysql_url) \
+                        .option("dbtable", identifier) \
+                        .option("user", mysql_user) \
+                        .option("password", mysql_pass) \
+                        .load()
+                else:
+                    starroks_user = self.conf["starroks"]["user"]
+                    starroks_pass = self.conf["starroks"]["password"]
+                    df_input = self.session.read.format("starrocks"). \
+                        option("starrocks.table.identifier", database + "." + identifier). \
+                        option("starrocks.fe.http.url", starroks_fe). \
+                        option("starrocks.fe.jdbc.url", starroks_jdbc). \
+                        option("starrocks.user", starroks_user). \
+                        option("starrocks.password", starroks_pass). \
+                        load()
                 self.inputs.update({identifier: df_input})
         except Exception as e:
             raise AssertionError(f"Error in function 'get_inputs'. {e}")
...
@@ -247,45 +271,24 @@ class CommissionProcess:
         try:
             start_process = time.time()
             graph_related = find_related_vertices(graph)
             graph_related = {k: v for k, v in graph_related.items() if len(v) != 0}
             merged = df_goals.join(df_executes, "AGENTE_COMISIONA", 'left')
             merged = merged.join(df_base, "AGENTE_COMISIONA", 'left')
             # for vertex in graph_related.keys():
             #     value = merged.filter(merged["AGENTE_COMISIONA"].isin(graph_related[vertex])).agg(py_sum("EJECUTADO_1"),
             #                                                                                       py_sum("EJECUTADO_2"),
             #                                                                                       py_sum("EJECUTADO_3")).collect()[0]
             #     value_1, value_2, value_3 = value[0], value[1], value[2]
             #     merged = merged.withColumn("EJECUTADO_1", when(merged["AGENTE_COMISIONA"] == vertex, value_1).otherwise(merged["EJECUTADO_1"]))
             #     merged = merged.withColumn("EJECUTADO_2", when(merged["AGENTE_COMISIONA"] == vertex, value_2).otherwise(merged["EJECUTADO_2"]))
             #     merged = merged.withColumn("EJECUTADO_3", when(merged["AGENTE_COMISIONA"] == vertex, value_3).otherwise(merged["EJECUTADO_3"]))
             graph_related_df = self.session.createDataFrame([(k, v) for k, v in graph_related.items()], ["vertex", "related_agents"])
             # Agregar los valores ejecutados por cada agente
-            agg_df = merged.join(graph_related_df, F.expr("array_contains(related_agents, AGENTE_COMISIONA)"), "inner") \
+            agg_df = merged.join(graph_related_df, expr("array_contains(related_agents, AGENTE_COMISIONA)"), "inner") \
                 .groupBy("vertex") \
-                .agg(F.sum("EJECUTADO_1").alias("sum_EJECUTADO_1"),
-                     F.sum("EJECUTADO_2").alias("sum_EJECUTADO_2"),
-                     F.sum("EJECUTADO_3").alias("sum_EJECUTADO_3"))
-            # Crear ventanas para particionar por "AGENTE_COMISIONA" y ordenar por alguna columna (aquí suponemos "order_column")
-            windowSpec1 = Window.partitionBy("AGENTE_COMISIONA").orderBy("AGENTE_COMISIONA")
-            # Utilizar la función lag() para obtener el valor ejecutado anteriormente por cada agente
-            for i in range(1, 4):
-                merged = merged.withColumn(f"EJECUTADO_{i}", F.coalesce(F.lag(f"EJECUTADO_{i}", 1).over(windowSpec1), F.lit(0)))
-            # Realizar una join con agg_df para actualizar los valores ejecutados
-            for i in range(1, 4):
-                merged = merged.join(agg_df, merged["AGENTE_COMISIONA"] == agg_df["vertex"], "left") \
-                    .withColumn(f"EJECUTADO_{i}", F.when(F.col("vertex").isNull(), merged[f"EJECUTADO_{i}"]).otherwise(F.col(f"sum_EJECUTADO_{i}"))) \
-                    .drop("vertex", "related_agents", f"sum_EJECUTADO_{i}")
-            merged.show()
+                .agg(py_sum("EJECUTADO_1").alias("sum_EJECUTADO_1"),
+                     py_sum("EJECUTADO_2").alias("sum_EJECUTADO_2"),
+                     py_sum("EJECUTADO_3").alias("sum_EJECUTADO_3"))
+            merged = merged.join(agg_df, merged["AGENTE_COMISIONA"] == agg_df["vertex"], 'left')
+            merged = merged.withColumn("EJECUTADO_1", when(merged["vertex"].isNotNull(), merged["sum_EJECUTADO_1"]).otherwise(merged["EJECUTADO_1"]))
+            merged = merged.withColumn("EJECUTADO_2", when(merged["vertex"].isNotNull(), merged["sum_EJECUTADO_2"]).otherwise(merged["EJECUTADO_2"]))
+            merged = merged.withColumn("EJECUTADO_3", when(merged["vertex"].isNotNull(), merged["sum_EJECUTADO_3"]).otherwise(merged["EJECUTADO_3"]))
+            merged = merged.drop("vertex", "sum_EJECUTADO_1", "sum_EJECUTADO_2", "sum_EJECUTADO_3")
             print(f"Duración de creación de dataframes con grafos (jerarquía): {time.time() - start_process}")
         except Exception as e:
             logger.error(f"Error actualizando ejecutados por jerarquía. {e}")
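
Editor's note: the new aggregation sums the EJECUTADO_* values of every agent related to a vertex via an array_contains join, then overwrites the vertex's own values. A minimal, self-contained sketch of that pattern on toy data (not part of the commit):

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, sum as py_sum, when

spark = SparkSession.builder.master("local[1]").appName("agg-sketch").getOrCreate()

# Toy stand-ins for `merged` and `graph_related_df`
merged = spark.createDataFrame(
    [("A1", 10), ("A2", 20), ("A3", 30)], ["AGENTE_COMISIONA", "EJECUTADO_1"])
graph_related_df = spark.createDataFrame(
    [("A1", ["A2", "A3"])], ["vertex", "related_agents"])

# Sum the executed amounts of all agents related to each vertex
agg_df = merged.join(graph_related_df,
                     expr("array_contains(related_agents, AGENTE_COMISIONA)"),
                     "inner") \
    .groupBy("vertex") \
    .agg(py_sum("EJECUTADO_1").alias("sum_EJECUTADO_1"))

# Replace the vertex's own EJECUTADO_1 with the aggregated total
joined = merged.join(agg_df, merged["AGENTE_COMISIONA"] == agg_df["vertex"], "left")
result = joined.withColumn(
    "EJECUTADO_1",
    when(joined["vertex"].isNotNull(), joined["sum_EJECUTADO_1"]).otherwise(joined["EJECUTADO_1"])
).drop("vertex", "sum_EJECUTADO_1")

result.show()  # A1 ends up with 50 (20 + 30); A2 and A3 keep their own values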
...
@@ -405,19 +408,41 @@ class CommissionProcess:
         return merged

     @task
-    def write_result(self, df: DataFrame, collection_name: str, starroks_jdbc: str, starroks_fe: str) -> None:
+    def write_result(self, df: DataFrame, table_name: str, db_type: DatabaseTypeEnum, starroks_jdbc: str, starroks_fe: str, redshift_url: str = "", mysql_url: str = "") -> None:
         try:
-            database = starroks_jdbc[starroks_jdbc.rfind("/") + 1:]
-            starroks_user = self.conf["starroks"]["user"]
-            starroks_pass = self.conf["starroks"]["password"]
-            df.write.format("starrocks") \
-                .option("starrocks.fe.http.url", starroks_fe) \
-                .option("starrocks.fe.jdbc.url", starroks_jdbc) \
-                .option("starrocks.table.identifier", database + "." + collection_name) \
-                .option("starrocks.user", starroks_user) \
-                .option("starrocks.password", starroks_pass) \
-                .mode("append") \
-                .save()
+            if db_type == DatabaseTypeEnum.REDSHIFT:
+                df.write \
+                    .format("jdbc") \
+                    .option("driver", "com.amazon.redshift.jdbc42.Driver") \
+                    .option("url", redshift_url) \
+                    .option("dbtable", table_name) \
+                    .option("user", "awsuser") \
+                    .option("password", "Awsuser123") \
+                    .mode("append") \
+                    .save()
+            elif db_type == DatabaseTypeEnum.MYSQL:
+                df.write \
+                    .format("jdbc") \
+                    .option("driver", "com.mysql.cj.jdbc.Driver") \
+                    .option("url", mysql_url) \
+                    .option("dbtable", table_name) \
+                    .option("user", "root") \
+                    .option("password", "root") \
+                    .mode("overwrite") \
+                    .save()
+            else:
+                database = starroks_jdbc[starroks_jdbc.rfind("/") + 1:]
+                starroks_user = self.conf["starroks"]["user"]
+                starroks_pass = self.conf["starroks"]["password"]
+                df.write.format("starrocks") \
+                    .option("starrocks.fe.http.url", starroks_fe) \
+                    .option("starrocks.fe.jdbc.url", starroks_jdbc) \
+                    .option("starrocks.table.identifier", database + "." + table_name) \
+                    .option("starrocks.user", starroks_user) \
+                    .option("starrocks.password", starroks_pass) \
+                    .mode("append") \
+                    .save()
         except Exception as e:
             logger.error(f"Error guardando datos en BD. {e}")
Pipeline/ETLProcess.py

 from typing import Dict, Any
 import logging
 from pyspark.sql.functions import col, when, lit, to_date, date_format, date_add
 from pyspark.sql.types import StructType, StructField, StringType
 from prefect import task
 from Enum.DataTypeEnum import DataTypeEnum
+from Enum.DatabaseTypeEnum import DatabaseTypeEnum
 from Enum.InputTypeEnum import InputTypeEnum
 from Utils.SparkUtils import createSession
 from Input.Source import Input
...
@@ -20,8 +21,8 @@ class ETLProcess:
         self.inputs = {}

-    def init(self, spark_jars: Dict[str, str]) -> None:
-        self.session = createSession(self.identifier, spark_jars)
+    def init(self, spark_jars: Dict[str, str], source_type: InputTypeEnum = InputTypeEnum.BUCKET) -> None:
+        self.session = createSession(self.identifier, spark_jars, source_type)

     @task
     def reader(self) -> None:
...
@@ -124,19 +125,41 @@ class ETLProcess:
         return success

     @task
-    def write(self, identifier: str, starroks_jdbc: str, starroks_fe: str, prev_status: bool = True) -> None:
+    def write(self, identifier: str, starroks_jdbc: str, starroks_fe: str, prev_status: bool = True, db_type: DatabaseTypeEnum = DatabaseTypeEnum.REDSHIFT, redshift_url: str = "", mysql_url: str = "") -> None:
         try:
-            database = starroks_jdbc[starroks_jdbc.rfind("/") + 1:]
-            starroks_user = self.conf["starroks"]["user"]
-            starroks_pass = self.conf["starroks"]["password"]
-            self.inputs[identifier].write.format("starrocks") \
-                .option("starrocks.fe.http.url", starroks_fe) \
-                .option("starrocks.fe.jdbc.url", starroks_jdbc) \
-                .option("starrocks.table.identifier", database + "." + identifier) \
-                .option("starrocks.user", starroks_user) \
-                .option("starrocks.password", starroks_pass) \
-                .mode("append") \
-                .save()
+            if db_type == DatabaseTypeEnum.REDSHIFT:
+                self.inputs[identifier].coalesce(45).write \
+                    .format("jdbc") \
+                    .option("driver", "com.amazon.redshift.jdbc42.Driver") \
+                    .option("url", redshift_url) \
+                    .option("dbtable", identifier) \
+                    .option("user", "awsuser") \
+                    .option("password", "Awsuser123") \
+                    .mode("append") \
+                    .save()
+            elif db_type == DatabaseTypeEnum.MYSQL:
+                self.inputs[identifier].write \
+                    .format("jdbc") \
+                    .option("driver", "com.mysql.cj.jdbc.Driver") \
+                    .option("url", mysql_url) \
+                    .option("dbtable", identifier) \
+                    .option("user", "root") \
+                    .option("password", "root") \
+                    .mode("append") \
+                    .save()
+            else:
+                database = starroks_jdbc[starroks_jdbc.rfind("/") + 1:]
+                starroks_user = self.conf["starroks"]["user"]
+                starroks_pass = self.conf["starroks"]["password"]
+                self.inputs[identifier].write.format("starrocks") \
+                    .option("starrocks.fe.http.url", starroks_fe) \
+                    .option("starrocks.fe.jdbc.url", starroks_jdbc) \
+                    .option("starrocks.table.identifier", database + "." + identifier) \
+                    .option("starrocks.user", starroks_user) \
+                    .option("starrocks.password", starroks_pass) \
+                    .mode("append") \
+                    .save()
         except Exception as e:
             logger.error(f"Error guardando resultados. {e}")
Utils/SparkUtils.py

 from typing import Dict
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import col, udf
 from pyspark.sql.types import ArrayType, StringType
 import logging
+from Enum.InputTypeEnum import InputTypeEnum

 logger = logging.getLogger()

-def createSession(name: str, spark_jars: Dict[str, str]) -> SparkSession:
+def createSession(name: str, spark_jars: Dict[str, str], source_type: InputTypeEnum) -> SparkSession:
     session = None
     try:
         jars = list(spark_jars.values())
...
@@ -18,17 +18,20 @@ def createSession(name: str, spark_jars: Dict[str, str]) -> SparkSession:
             .appName(name) \
             .config("spark.jars", jars) \
             .config("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.4-s_2.12") \
             .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain") \
             .config("spark.executor.extraClassPath", jars) \
             .config("spark.driver.extraClassPath", jars) \
             .config("spark.starrocks.driver", "com.starroks.jdbc.Driver") \
             .config("spark.sql.catalogImplementation", "in-memory") \
             .getOrCreate()
         session._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
-        session._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "http://192.168.21.47:9000")
-        session._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "false")
-        session._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
-        session._jsc.hadoopConfiguration().set("fs.s3a.access.key", "minioadmin")
-        session._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "minioadmin")
+        if source_type == InputTypeEnum.LOCAL:
+            session._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "http://192.168.21.47:9000")
+            session._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "false")
+            session._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
+            session._jsc.hadoopConfiguration().set("fs.s3a.access.key", "minioadmin")
+            session._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "minioadmin")
     except Exception as e:
         logger.error(f"Error creando sesion. {e}")
     finally:
...
@@ -70,5 +73,5 @@ def find_related_vertices(graph):
             dfs(vertex_id, related_vertices)
             # Agregar los vértices relacionados al diccionario
             related_vertices_dict[vertex_id] = list(related_vertices)
+            related_vertices_dict[vertex_id].remove(vertex_id)
     return related_vertices_dict
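
Editor's note: the practical effect of the new source_type parameter is that the MinIO-style fs.s3a overrides are applied only for local runs. A hedged usage sketch, assuming the repo's Utils and Enum packages are on the import path:

from Enum.InputTypeEnum import InputTypeEnum
from Utils.SparkUtils import createSession

SPARK_JARS = {"STARROK": "/opt/spark-jars/starrocks-spark-connector-3.2_2.12-1.1.2.jar"}

# With InputTypeEnum.LOCAL the session also gets the MinIO-style fs.s3a settings
# (endpoint, path-style access, minioadmin credentials); with InputTypeEnum.BUCKET
# it relies on DefaultAWSCredentialsProviderChain only.
session = createSession("etl-local", SPARK_JARS, InputTypeEnum.LOCAL)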
commission.py

...
@@ -5,6 +5,7 @@ from typing import Any, Dict
 from prefect import flow, get_run_logger
 from Pipeline.CommissionProcess import CommissionProcess
+from Enum.DatabaseTypeEnum import DatabaseTypeEnum

 SPARK_JARS = {
     "STARROK": "/opt/spark-jars/starrocks-spark-connector-3.2_2.12-1.1.2.jar",
...
@@ -14,6 +15,10 @@ SPARK_JARS = {
 STARROK_JDBC = "jdbc:mysql://192.168.1.37:9030/bcom_spark"
 STARROK_FE_NODE = "192.168.1.37:8030"
+REDSHIFT_JDBC = "jdbc:redshift://redshift-cluster-1.cumpswji5bs3.us-east-1.redshift.amazonaws.com:5439/dev?currentSchema=prueba_ca"
+
+DB_TYPE = DatabaseTypeEnum.REDSHIFT

 @flow()
 def run_commission(config: Dict[str, Any]) -> None:
...
@@ -29,7 +34,7 @@ def run_commission(config: Dict[str, Any]) -> None:
     # Primer task - Extraer la data - RECORDAR: SPARK ES LAZY!!!
     start_reader = time.time()
-    commission_process.get_inputs(commission_process, STARROK_JDBC, STARROK_FE_NODE)
+    commission_process.get_inputs(commission_process, DB_TYPE, STARROK_JDBC, STARROK_FE_NODE, REDSHIFT_JDBC)
     logger.info(f"Duración de extracción de datos desde la BD: {time.time() - start_reader}")
     # Tercer task - Obtener metas
...
@@ -37,7 +42,7 @@ def run_commission(config: Dict[str, Any]) -> None:
     goals = commission_process.get_goals(commission_process, "VENTAS", "GOALS")
     # Quinto task - Obtener ejecutados - ¿Aplicar tmb filtro de FLAG_COMISIONABLE y ACTIVE_USER_TRAFFIC?
-    executes = commission_process.get_executed(commission_process, "VENTAS", "TEAMS")
+    executes = commission_process.get_executed(commission_process, "VENTAS", "DEVICES")
     # Sexo task - Obtener monto origen
     base = commission_process.get_source_value(commission_process, "VENTAS", "COMERCIAL_BASE")
...
commission_2.py

...
@@ -5,6 +5,7 @@ from typing import Any, Dict
 from prefect import flow, get_run_logger
 from Pipeline.CommissionProcess import CommissionProcess
+from Enum.DatabaseTypeEnum import DatabaseTypeEnum

 SPARK_JARS = {
     "STARROK": "/opt/spark-jars/starrocks-spark-connector-3.2_2.12-1.1.2.jar",
...
@@ -14,6 +15,12 @@ SPARK_JARS = {
 STARROK_JDBC = "jdbc:mysql://192.168.1.37:9030/bcom_spark"
 STARROK_FE_NODE = "192.168.1.37:8030"
+REDSHIFT_JDBC = "jdbc:redshift://redshift-cluster-1.cumpswji5bs3.us-east-1.redshift.amazonaws.com:5439/dev?currentSchema=prueba_ca"
+MYSQL_JDBC = "jdbc:mysql://localhost:13306/bcom_spark"
+
+DB_TYPE = DatabaseTypeEnum.MYSQL

 @flow()
 def run_commission(config: Dict[str, Any]) -> None:
...
@@ -29,7 +36,8 @@ def run_commission(config: Dict[str, Any]) -> None:
     # Primer task - Extraer la data - RECORDAR: SPARK ES LAZY!!!
     start_reader = time.time()
-    commission_process.get_inputs(commission_process, STARROK_JDBC, STARROK_FE_NODE)
+    commission_process.get_inputs(commission_process, DB_TYPE, STARROK_JDBC, STARROK_FE_NODE, REDSHIFT_JDBC, MYSQL_JDBC)
     logger.info(f"Duración de extracción de datos desde la BD: {time.time() - start_reader}")
     # Tercer task - Obtener metas
...
@@ -37,20 +45,15 @@ def run_commission(config: Dict[str, Any]) -> None:
     goals = commission_process.get_goals_2(commission_process, "GOALS", "ESTRUCTURA_ORGANIZACIONAL")
     # Quinto task - Obtener ejecutados - ¿Aplicar tmb filtro de FLAG_COMISIONABLE y ACTIVE_USER_TRAFFIC?
-    executes = commission_process.get_executed_2(commission_process, "ESTRUCTURA_ORGANIZACIONAL", "TEAMS", "VENTAS")
+    executes = commission_process.get_executed_2(commission_process, "ESTRUCTURA_ORGANIZACIONAL", "DEVICES", "VENTAS")
     # Sexo task - Obtener monto origen
     base = commission_process.get_source_value_2(commission_process, "ESTRUCTURA_ORGANIZACIONAL", "COMERCIAL_BASE")
     # Segundo task - Crear jerarquía
     start_process = time.time()
     # ["AGENTES", "ESTRUCTURA", "UO", "OGRANIZACIONES"]
     identifiers = ["INDIVIDUOS", "ESTRUCTURA_ORGANIZACIONAL", "UNIDAD", "ORGANIZACION"]
     jerarquia_graph = commission_process.create_jerarquia(commission_process, identifiers, goals, executes, base)
     logger.info(f"Duración de creación de dataframes con grafos (jerarquía): {time.time() - start_process}")
     result = commission_process.update_executes(commission_process, jerarquia_graph, goals, executes, base)
     result = commission_process.get_commission_per_agent_2(commission_process, result)
...
@@ -58,7 +61,8 @@ def run_commission(config: Dict[str, Any]) -> None:
     # Task de escritura
     start_load = time.time()
-    _ = commission_process.write_result(commission_process, result, "REPORT_SUMMARY", STARROK_JDBC, STARROK_FE_NODE)
+    _ = commission_process.write_result(commission_process, result, "REPORT_SUMMARY", DB_TYPE, STARROK_JDBC, STARROK_FE_NODE, REDSHIFT_JDBC, MYSQL_JDBC)
     logger.info(f"Duración de carga del reporte a la BD: {time.time() - start_load}")
     logger.info(f"Duración de ejecución del proceso de comision: {time.time() - start_time}")
...
config.json

...
@@ -9,7 +9,7 @@
     "data": [
       {
         "identifier": "VENTAS",
-        "path": "s3a://prueba-id/inputs_spark/gross_202311.txt",
+        "path": "s3a://prueba-id2/bcom-tests/inputs/gross_202311.txt",
         "input_type": "txt",
         "separator": "|",
         "schema": {
...
@@ -29,8 +29,8 @@
         }
       },
       {
-        "identifier": "TEAMS",
-        "path": "s3a://prueba-id/inputs_spark/equipos_202311.txt",
+        "identifier": "DEVICES",
+        "path": "s3a://prueba-id2/bcom-tests/inputs/equipos_202311.txt",
         "input_type": "txt",
         "separator": "|",
         "schema": {
...
@@ -45,7 +45,7 @@
       {
         "identifier": "GOALS",
-        "path": "s3a://prueba-id/inputs_spark/metas_202311.csv",
+        "path": "s3a://prueba-id2/bcom-tests/inputs/metas_202311.csv",
         "input_type": "csv",
         "separator": ";",
         "schema": {
...
@@ -58,7 +58,7 @@
       {
         "identifier": "COMERCIAL_BASE",
-        "path": "s3a://prueba-id/inputs_spark/planta_comercial_202311.csv",
+        "path": "s3a://prueba-id2/bcom-tests/inputs/planta_comercial_202311.csv",
         "input_type": "csv",
         "separator": ";",
         "schema": {
...
@@ -70,7 +70,7 @@
       {
         "identifier": "INDIVIDUOS",
-        "path": "s3a://prueba-id/inputs_spark/individuos_2023111813.csv",
+        "path": "s3a://prueba-id2/bcom-tests/inputs/individuos_2023111813.csv",
         "input_type": "csv",
         "separator": ";",
         "schema": {
...
@@ -89,7 +89,7 @@
       {
         "identifier": "ROLES",
-        "path": "s3a://prueba-id/inputs_spark/roles_2023111812.csv",
+        "path": "s3a://prueba-id2/bcom-tests/inputs/roles_2023111812.csv",
         "input_type": "csv",
         "separator": ";",
         "schema": {
...
@@ -99,7 +99,7 @@
       {
         "identifier": "ORGANIZACION",
-        "path": "s3a://prueba-id/inputs_spark/organizaciones_2023111813.csv",
+        "path": "s3a://prueba-id2/bcom-tests/inputs/organizaciones_2023111813.csv",
         "input_type": "csv",
         "separator": ";",
         "schema": {
...
@@ -121,7 +121,7 @@
       {
         "identifier": "UNIDAD",
-        "path": "s3a://prueba-id/inputs_spark/unidades_organizacionales_2023111812.csv",
+        "path": "s3a://prueba-id2/bcom-tests/inputs/unidades_organizacionales_2023111812.csv",
         "input_type": "csv",
         "separator": ";",
         "schema": {
...
@@ -139,7 +139,7 @@
       {
         "identifier": "ESTRUCTURA_ORGANIZACIONAL",
-        "path": "s3a://prueba-id/inputs_spark/estructura_organizacional_2023111812.csv",
+        "path": "s3a://prueba-id2/bcom-tests/inputs/estructura_organizacional_2023111812.csv",
         "input_type": "csv",
         "separator": ";",
         "schema": {
...
@@ -154,5 +154,13 @@
   "starroks": {
     "user": "root",
     "password": ""
   },
+  "redshift": {
+    "user": "awsuser",
+    "password": "Awsuser123"
+  },
+  "mysql": {
+    "user": "root",
+    "password": "root"
+  }
 }
\ No newline at end of file
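
Editor's note: the new "redshift" and "mysql" blocks are read by the pipeline classes through self.conf (for example self.conf["redshift"]["user"] in CommissionProcess.get_inputs). A minimal sketch of that lookup, assuming the file is loaded with the standard json module:

import json

# Load the pipeline configuration and pick the credentials for the chosen backend
with open("config.json") as f:
    conf = json.load(f)

db_section = "redshift"  # or "mysql" / "starroks", matching the config keys above
user = conf[db_section]["user"]
password = conf[db_section]["password"]
print(db_section, user)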
config2.json

...
@@ -8,7 +8,7 @@
     "data": [
       {
         "identifier": "FACTURACION",
-        "path": "s3a://prueba-id/bcom-tests/inputs/Facturacion_20240320.csv",
+        "path": "s3a://prueba-id2/bcom-tests/inputs/Facturacion_20240320.csv",
         "input_type": "csv",
         "separator": ";",
         "schema": {
...
@@ -24,7 +24,7 @@
       {
         "identifier": "ENDING",
-        "path": "s3a://prueba-id/bcom-tests/inputs/Ending_20240320.csv",
+        "path": "s3a://prueba-id2/bcom-tests/inputs/Ending_20240320.csv",
         "input_type": "csv",
         "separator": ";",
         "schema": {
...
@@ -42,5 +42,13 @@
   "starroks": {
     "user": "root",
     "password": ""
   },
+  "redshift": {
+    "user": "awsuser",
+    "password": "Awsuser123"
+  },
+  "mysql": {
+    "user": "admin",
+    "password": "awsuser123"
+  }
 }
\ No newline at end of file
etl.py

...
@@ -2,6 +2,8 @@ import time
 import json
 from typing import Any, Dict
 from prefect import flow, get_run_logger
+from Enum.DatabaseTypeEnum import DatabaseTypeEnum
+from Enum.InputTypeEnum import InputTypeEnum
 from Pipeline.ETLProcess import ETLProcess
...
@@ -12,12 +14,21 @@ SPARK_JARS = {
     "COMMON": "/opt/spark-jars/hadoop-common-3.3.4.jar",
     "AWS_CLIENT": "/opt/spark-jars/hadoop-client-3.3.4.jar",
     "STARROK": "/opt/spark-jars/starrocks-spark-connector-3.2_2.12-1.1.2.jar",
-    "MYSQL": "/opt/spark-jars/mysql-connector-java-8.0.30.jar"
+    "MYSQL": "/opt/spark-jars/mysql-connector-java-8.0.30.jar",
+    "REDSHIFT": "/opt/spark-jars/redshift-jdbc42-2.1.0.12.jar"
 }

 STARROK_JDBC = "jdbc:mysql://192.168.1.37:9030/bcom_spark"
 STARROK_FE_NODE = "192.168.1.37:8030"
+REDSHIFT_JDBC = "jdbc:redshift://redshift-cluster-1.cumpswji5bs3.us-east-1.redshift.amazonaws.com:5439/dev?currentSchema=prueba_ca"
+MYSQL_JDBC = "jdbc:mysql://localhost:13306/bcom_spark"
+
+DB_TYPE = DatabaseTypeEnum.MYSQL
+SOURCE_TYPE = InputTypeEnum.BUCKET

 @flow
 def run_etl(config: Dict[str, Any]) -> None:
...
@@ -29,7 +40,7 @@ def run_etl(config: Dict[str, Any]) -> None:
     # Conexion a Spark (LocalMode, StandAlone or Clúster)
     start_init = time.time()
-    etl_process.init(SPARK_JARS)
+    etl_process.init(SPARK_JARS, SOURCE_TYPE)
     logger.info(f"Duración de creación de sesión Spark: {time.time() - start_init}")
     # Primer task - (Reader) - Extraer los ficheros
...
@@ -42,31 +53,40 @@ def run_etl(config: Dict[str, Any]) -> None:
     etl_process.set_schema(etl_process)
     # Process - Insumo Gross (Ventas)
-    ventas_flag = etl_process.process_gross.submit(etl_process, "VENTAS")
+    ventas_flag = etl_process.process_gross(etl_process, "VENTAS")
     # Process - Insumo Team (Equipos)
-    teams_flag = etl_process.process_teams.submit(etl_process, "TEAMS")
+    teams_flag = etl_process.process_teams(etl_process, "DEVICES")
     logger.info(f"Duración de transformación y limpieza de datos: {time.time() - start_transform}")
     # Write - Insumo GROSS
     start_load = time.time()
-    etl_process.write.submit(etl_process, "VENTAS", STARROK_JDBC, STARROK_FE_NODE, ventas_flag)
-    # Write - Insumo TEAMS
-    etl_process.write.submit(etl_process, "TEAMS", STARROK_JDBC, STARROK_FE_NODE, teams_flag)
+    etl_process.write(etl_process, "VENTAS", STARROK_JDBC, STARROK_FE_NODE, ventas_flag, DB_TYPE, REDSHIFT_JDBC, MYSQL_JDBC)
+    # Write - Insumo DEVICES
+    etl_process.write(etl_process, "DEVICES", STARROK_JDBC, STARROK_FE_NODE, teams_flag, DB_TYPE, REDSHIFT_JDBC, MYSQL_JDBC)
     # Write - Insumo GOALS
-    etl_process.write.submit(etl_process, "GOALS", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "GOALS", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
     # Write - Insumo PLANTA
-    etl_process.write.submit(etl_process, "COMERCIAL_BASE", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "COMERCIAL_BASE", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
     # Write - Insumo INDIVIDUOS
-    etl_process.write.submit(etl_process, "INDIVIDUOS", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "INDIVIDUOS", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
     # Write - Insumo ROLES
-    etl_process.write.submit(etl_process, "ROLES", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "ROLES", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
     # Write - Insumo ORGANIZACION
-    etl_process.write.submit(etl_process, "ORGANIZACION", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "ORGANIZACION", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
     # Write - Insumo UNIDADES
-    etl_process.write.submit(etl_process, "UNIDAD", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "UNIDAD", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
     # Write - Insumo ESTRUCTURA
-    etl_process.write.submit(etl_process, "ESTRUCTURA_ORGANIZACIONAL", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "ESTRUCTURA_ORGANIZACIONAL", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
     logger.info(f"Duración de carga de datos a la BD: {time.time() - start_load}")
     logger.info(f"Duración de ejecución del proceso ETL General: {time.time() - start_time}")
...
etl_2.py

...
@@ -4,6 +4,8 @@ from typing import Any, Dict
 from prefect import flow, get_run_logger
 from Pipeline.ETLProcess import ETLProcess
+from Enum.DatabaseTypeEnum import DatabaseTypeEnum
+from Enum.InputTypeEnum import InputTypeEnum

 SPARK_JARS = {
...
@@ -12,12 +14,21 @@ SPARK_JARS = {
     "COMMON": "/opt/spark-jars/hadoop-common-3.3.4.jar",
     "AWS_CLIENT": "/opt/spark-jars/hadoop-client-3.3.4.jar",
     "STARROK": "/opt/spark-jars/starrocks-spark-connector-3.2_2.12-1.1.2.jar",
-    "MYSQL": "/opt/spark-jars/mysql-connector-java-8.0.30.jar"
+    "MYSQL": "/opt/spark-jars/mysql-connector-java-8.0.30.jar",
+    "REDSHIFT": "/opt/spark-jars/redshift-jdbc42-2.1.0.12.jar"
 }

 STARROK_JDBC = "jdbc:mysql://192.168.1.37:9030/bcom_spark"
 STARROK_FE_NODE = "192.168.1.37:8030"
+REDSHIFT_JDBC = "jdbc:redshift://redshift-cluster-1.cumpswji5bs3.us-east-1.redshift.amazonaws.com:5439/dev?currentSchema=prueba_ca"
+MYSQL_JDBC = "jdbc:mysql://localhost:13306/bcom_spark"
+
+DB_TYPE = DatabaseTypeEnum.MYSQL
+SOURCE_TYPE = InputTypeEnum.BUCKET

 @flow
 def run_etl(config: Dict[str, Any]) -> None:
...
@@ -29,7 +40,7 @@ def run_etl(config: Dict[str, Any]) -> None:
     # Conexion a Spark (LocalMode, StandAlone or Clúster)
     start_init = time.time()
-    etl_process.init(SPARK_JARS)
+    etl_process.init(SPARK_JARS, SOURCE_TYPE)
     logger.info(f"Duración de creación de sesión Spark: {time.time() - start_init}")
     # Primer task - (Reader) - Extraer los ficheros
...
@@ -46,10 +57,12 @@ def run_etl(config: Dict[str, Any]) -> None:
     logger.info(f"Duración de transformación y limpieza de datos: {time.time() - start_transform}")
     start_load = time.time()
-    # Write - Insumo TEAMS
-    etl_process.write(etl_process, "FACTURACION", STARROK_JDBC, STARROK_FE_NODE, teams_fact)
+    # Write - Insumo DEVICES
+    etl_process.write(etl_process, "FACTURACION", STARROK_JDBC, STARROK_FE_NODE, teams_fact, DB_TYPE, REDSHIFT_JDBC, MYSQL_JDBC)
     # Write - Insumo GOALS
-    etl_process.write(etl_process, "ENDING", STARROK_JDBC, STARROK_FE_NODE)
+    etl_process.write(etl_process, "ENDING", STARROK_JDBC, STARROK_FE_NODE, db_type=DB_TYPE, redshift_url=REDSHIFT_JDBC, mysql_url=MYSQL_JDBC)
     logger.info(f"Duración de carga de datos a la BD: {time.time() - start_load}")
     logger.info(f"Duración de ejecución del proceso ETL General: {time.time() - start_time}")
...
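
Editor's note: the entry points above are plain Prefect flows that take the parsed JSON config. A hedged sketch of invoking one locally; the __main__ guard and the module/file names are illustrative assumptions, not shown in the diff:

import json

from etl import run_etl  # the @flow defined in etl.py above

if __name__ == "__main__":
    with open("config.json") as f:
        config = json.load(f)
    run_etl(config)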