Proyectos-Innovacion-2024 / BCOM-Components-Innovation-Tests

Commit 9c72a0a6 — authored Mar 25, 2024 by Cristian Aguirre
Add starroks.py
parent 15b44e0a

Showing 7 changed files with 318 additions and 53 deletions (+318 / -53):

Pipeline/CommissionProcess.py   +208   -0
Pipeline/ETLProcess.py           +13   -6
Pipeline/Process.py               +0  -33
Utils/SparkUtils.py              +33   -3
commission.py                    +42   -4
config.json                       +2   -0
etl.py                           +20   -7
Pipeline/CommissionProcess.py — new file (mode 100644), +208 -0. Diff collapsed; contents not shown here.
Pipeline/ETLProcess.py (+13 -6)

@@ -19,8 +19,8 @@ class ETLProcess:
         self.inputs = {}

-    def init(self, spark_jars: Dict[str, str], mongodb_uri: str = "") -> None:
-        self.session = createSession(self.identifier, spark_jars, mongodb_uri)
+    def init(self, spark_jars: Dict[str, str], mongodb_uri: str = "", starrok_uri: str = "") -> None:
+        self.session = createSession(self.identifier, spark_jars, mongodb_uri, starrok_uri)

     @task
     def reader(self) -> None:

@@ -79,7 +79,7 @@ class ETLProcess:
             self.inputs[identifier] = self.inputs[identifier].withColumn("TIPO_CANAL", lit("DIRECT"))
             success = True
         except Exception as e:
-            raise AssertionError(f"Error transformando archivo gross. {e}")
+            logger.error(f"Error transformando archivo gross. {e}")
         finally:
             return success

@@ -100,8 +100,15 @@ class ETLProcess:
     @task
     def write(self, identifier: str, prev_status: bool = True) -> None:
         try:
             self.inputs[identifier].printSchema()
-            self.inputs[identifier].write.format("com.mongodb.spark.sql.DefaultSource"). \
-                option("collection", identifier).mode("append").save()
+            # self.inputs[identifier].write.format("starrocks"). \
+            #     option("dbtable", identifier).mode("overwrite").save()
+            self.inputs[identifier].write.format("starrocks") \
+                .option("starrocks.fe.http.url", "ec2-34-231-243-52.compute-1.amazonaws.com:8030") \
+                .option("starrocks.fe.jdbc.url", "jdbc:mysql://ec2-34-231-243-52.compute-1.amazonaws.com:9030/bcom_spark") \
+                .option("starrocks.table.identifier", "bcom_spark." + identifier) \
+                .option("starrocks.user", "root") \
+                .option("starrocks.password", "") \
+                .mode("append") \
+                .save()
         except Exception as e:
             logger.error(f"Erro guardando resultados. {e}")
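
Note: the write() task above switches the sink from MongoDB to the StarRocks Spark connector. As a rough illustration of that write path in isolation — a minimal sketch with placeholder host names and a toy DataFrame, assuming the connector jar is on the Spark classpath, not the project's actual cluster or data — the same option set can be exercised like this:

# Sketch only: writing a Spark DataFrame to StarRocks through the connector.
# FE host, database and table below are placeholders, not the cluster from this commit.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("starrocks-write-sketch").getOrCreate()
df = spark.createDataFrame([(1, "DIRECT"), (2, "DIRECT")], ["ID", "TIPO_CANAL"])

df.write.format("starrocks") \
    .option("starrocks.fe.http.url", "starrocks-fe:8030") \
    .option("starrocks.fe.jdbc.url", "jdbc:mysql://starrocks-fe:9030") \
    .option("starrocks.table.identifier", "bcom_spark.VENTAS") \
    .option("starrocks.user", "root") \
    .option("starrocks.password", "") \
    .mode("append") \
    .save()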
Pipeline/Process.py — deleted (mode 100644 → 0), -33. Removed file contents:

import logging
from typing import Dict, Any

from pyspark.sql import SparkSession, DataFrame
from prefect import flow, task

from Input.Source import Input

logger = logging.getLogger()


class Process:

    def __init__(self, config: Dict[str, Any]) -> None:
        self.conf = config
        self.identifier = self.conf["identifier"]
        self.session = None
        self.inputs = {}

    def init(self) -> None:
        self._createSession()

    def get_inputs(self) -> None:
        try:
            pass
        except Exception as e:
            raise AssertionError(f"Error in function 'get_inputs'. {e}")

    def run(self) -> None:
        # Get inputs
        self.get_inputs()
Utils/SparkUtils.py (+33 -3)

 from typing import Dict
-from pyspark.sql import SparkSession
+from pyspark.sql import SparkSession, DataFrame
 import logging

 logger = logging.getLogger()


-def createSession(name: str, spark_jars: Dict[str, str], mongodb_uri: str = "") -> SparkSession:
+def createSession(name: str, spark_jars: Dict[str, str], mongodb_uri: str, starrok_uri: str) -> SparkSession:
     session = None
     try:
         jars = list(spark_jars.values())
         jars = ",".join(jars)
-        print(jars)
         session = SparkSession.builder \
             .appName(name) \
             .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
...
@@ -20,10 +19,41 @@ def createSession(name: str, spark_jars: Dict[str, str], mongodb_uri: str = "")
             .config("spark.jars", jars) \
             .config("spark.executor.extraClassPath", jars) \
             .config("spark.driver.extraClassPath", jars) \
             .config("spark.mongodb.input.uri", mongodb_uri) \
             .config("spark.mongodb.output.uri", mongodb_uri) \
             .getOrCreate()
+        # .config("spark.starrocks.url", starrok_uri) \
+        # .config("spark.starrocks.driver", "com.starroks.jdbc.Driver") \
+        # .config("spark.sql.catalogImplementation", "in-memory") \
+        # .getOrCreate()
         session._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
     except Exception as e:
         logger.error(f"Error creando sesion. {e}")
     finally:
         return session


+def get_goal_by_kpi(df: DataFrame, agent: str, period: str, kpi: str) -> float:
+    result = 0.0
+    try:
+        df = df.filter((df["CEDULA"] == agent) & (df["PERIODO_PROCESO_CODIGO"] == period) & (df["KPI"] == kpi)). \
+            select("META_FINAL")
+        if df.count() != 0:
+            results = [row[0] for row in df.select("META_FINAL").collect()]
+            result = results[0]
+    except Exception as e:
+        logger.error(f"Error obteniendo meta por kpi. {e}")
+    finally:
+        return result
+
+
+def get_execute_by_service(df: DataFrame, agent: str, period: str, segment: str) -> int:
+    result = 0
+    try:
+        df = df.filter((df["AGENTE_COMISIONA"] == agent) & (df["PERIODO_PROCESO_CODIGO"] == period) & (df["SEGMENTO"] == segment))
+        result = df.count()
+    except Exception as e:
+        logger.error(f"Error obteniendo meta por segmento. {e}")
+    finally:
+        return result
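
Note: the two helpers added above filter a DataFrame by agent, period and KPI/segment and reduce it to a single goal value or a row count. A quick, self-contained usage sketch for get_goal_by_kpi — toy in-memory rows invented for illustration, assuming the repository root is on PYTHONPATH — would look like this:

# Sketch: exercising get_goal_by_kpi from Utils/SparkUtils.py with toy data.
# Column names come from the diff; the rows below are invented for illustration.
from pyspark.sql import SparkSession
from Utils.SparkUtils import get_goal_by_kpi

spark = SparkSession.builder.appName("goal-lookup-sketch").getOrCreate()
goals_df = spark.createDataFrame(
    [("123", "202311", "GROSS", 10.0), ("123", "202311", "REVENUE", 2500.0)],
    ["CEDULA", "PERIODO_PROCESO_CODIGO", "KPI", "META_FINAL"],
)

# Prints 10.0 for the matching row; the helper returns 0.0 when nothing matches.
print(get_goal_by_kpi(goals_df, "123", "202311", "GROSS"))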
commission.py (+42 -4)

 import time
 import json
+import logging
 from typing import Any, Dict

-from prefect import flow
+from prefect import flow, get_run_logger
+
+from Pipeline.CommissionProcess import CommissionProcess
+
+logger = logging.getLogger()
+
+SPARK_JARS = {
+    "MONGO_CORE": "/opt/spark-jars/mongodb-driver-core-4.0.4.jar",
+    "MONGO_CLIENT": "/opt/spark-jars/mongodb-driver-sync-4.0.4.jar",
+    "MONGODB": "/opt/spark-jars/mongo-spark-connector_2.12-3.0.1.jar",
+    "BSON": "/opt/spark-jars/bson-4.0.4.jar"
+}
+
+MONGODB_URI = "mongodb://bcom_spark_user:root@192.168.1.37:50001/bcom_spark"


 @flow()
 def run_commission(config: Dict[str, Any]) -> None:
+    logger = get_run_logger()
     start_time = time.time()
+    logger.info(f"Duración de ejecución del proceso de liquidación: {start_time - time.time()}")
+
+    commission_process = CommissionProcess(config)
+
+    # Conexion a Spark (LocalMode, StandAlone or Clúster)
+    start_init = time.time()
+    commission_process.init(SPARK_JARS, MONGODB_URI)
+    logger.info(f"Duración de creación de sesión Spark: {time.time() - start_init}")
+
+    # Primer task - Extraer la data - RECORDAR: SPARK ES LAZY!!!
+    start_reader = time.time()
+    commission_process.get_inputs(commission_process)
+    logger.info(f"Duración de extracción de datos desde la BD: {time.time() - start_reader}")
+
+    # Tercer task - Obtener metas
+    start_process = time.time()
+    goals = commission_process.get_goals_second_way(commission_process, "VENTAS", "GOALS")
+
+    # Quinto task - Obtener ejecutados - ¿Aplicar tmb filtro de FLAG_COMISIONABLE y ACTIVE_USER_TRAFFIC?
+    executes = commission_process.get_executed_second_way(commission_process, "VENTAS", "TEAMS")
+
+    # Sexo task - Obtener monto origen
+    base = commission_process.get_source_value(commission_process, "VENTAS", "COMERCIAL_BASE")
+    result = commission_process.get_commission_per_agent(commission_process, goals, executes, base)
+    logger.info(f"Duración de procesamiento en memoria: {time.time() - start_process}")
+
+    # Task de escritura
+    start_load = time.time()
+    _ = commission_process.write_result(commission_process, result, "REPORT_SUMMARY")
+    logger.info(f"Duración de carga del reporte a la BD: {time.time() - start_load}")
+
+    logger.info(f"Duración de ejecución del proceso de etl: {time.time() - start_time}")


 if __name__ == "__main__":
...
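
Note: the __main__ block of commission.py is elided ("...") in this view. Purely as a hypothetical illustration of how a flow like run_commission could be driven from config.json — not the commit's actual entry point — a driver might read:

# Hypothetical driver, not taken from this commit: load the JSON config and run the flow.
import json

from commission import run_commission

if __name__ == "__main__":
    with open("config.json") as fp:
        config = json.load(fp)
    run_commission(config)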
config.json (+2 -0)

 {
     "identifier": "BCOM-SPARK-TESTS",
+    "period": "202311",
     "inputs": {
         "type": "bucket",
         "params": {
...
@@ -19,6 +20,7 @@
             "CONSULTOR_NK": "TEXT",
             "CLIENTE_ID": "TEXT",
             "CLIENTE_NOMBRE": "TEXT",
+            "CLIENTE_NATURALEZA": "TEXT",
             "SERVICIO": "TEXT",
             "REVENUE": "DECIMAL",
             "PLAN_CODIGIO_NK": "TEXT",
...
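
Note: the config maps column names to type labels such as TEXT and DECIMAL. One plausible way such a map could be turned into a Spark schema — a sketch under assumed label semantics, not necessarily what ETLProcess.set_schema actually does — is:

# Sketch: converting a {"COLUMN": "TEXT"/"DECIMAL"} map into a Spark StructType.
# The mapping of labels to Spark types is an assumption made for illustration.
from pyspark.sql.types import StructType, StructField, StringType, DecimalType

TYPE_MAP = {"TEXT": StringType(), "DECIMAL": DecimalType(18, 2)}

def build_schema(columns: dict) -> StructType:
    # Unknown labels fall back to strings so the read does not fail outright.
    return StructType([StructField(name, TYPE_MAP.get(label, StringType()), True)
                       for name, label in columns.items()])

schema = build_schema({"CLIENTE_ID": "TEXT", "REVENUE": "DECIMAL", "CLIENTE_NATURALEZA": "TEXT"})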
etl.py (+20 -7)

 import time
 import json
 import logging
 from typing import Any, Dict

-from prefect import flow
+from prefect import flow, get_run_logger
 from Pipeline.ETLProcess import ETLProcess

 logger = logging.getLogger()

 SPARK_JARS = {
     "AWS_CORE": "/opt/spark-jars/hadoop-aws-3.3.4.jar",
     "BUNDLE": "/opt/spark-jars/aws-java-sdk-bundle-1.12.431.jar",
...
@@ -17,23 +14,36 @@ SPARK_JARS = {
     "MONGO_CORE": "/opt/spark-jars/mongodb-driver-core-4.0.4.jar",
     "MONGO_CLIENT": "/opt/spark-jars/mongodb-driver-sync-4.0.4.jar",
     "MONGODB": "/opt/spark-jars/mongo-spark-connector_2.12-3.0.1.jar",
-    "BSON": "/opt/spark-jars/bson-4.0.4.jar"
+    "BSON": "/opt/spark-jars/bson-4.0.4.jar",
+    "STARROK": "/opt/spark-jars/starrocks-spark-connector-3.4_2.12-1.1.2.jar",
+    "MYSQL": "/opt/spark-jars/mysql-connector-java-8.0.30.jar"
 }

 MONGODB_URI = "mongodb://bcom_spark_user:root@192.168.1.37:50001/bcom_spark"
+STARROK_URI = "jdbc:starroks://root:@ec2-3-237-32-62.compute-1.amazonaws.com:9030/bcom_spark"


 @flow
 def run_etl(config: Dict[str, Any]) -> None:
+    logger = get_run_logger()
     start_time = time.time()

     etl_process = ETLProcess(config)

     # Conexion a Spark (LocalMode, StandAlone or Clúster)
-    etl_process.init(SPARK_JARS, MONGODB_URI)
+    start_init = time.time()
+    etl_process.init(SPARK_JARS, starrok_uri=STARROK_URI)
+    logger.info(f"Duración de creación de sesión Spark: {time.time() - start_init}")

     # Primer task - (Reader) - Extraer los ficheros
+    start_reader = time.time()
     etl_process.reader(etl_process)
+    logger.info(f"Duración de extracción de ficheros desde S3: {time.time() - start_reader}")

     # Segundo task - Setear esquema a las tablas
+    start_transform = time.time()
     etl_process.set_schema(etl_process)

     # Process - Insumo Gross (Ventas)
...
@@ -41,8 +51,10 @@ def run_etl(config: Dict[str, Any]) -> None:
     # Process - Insumo Team (Equipos)
     teams_flag = etl_process.process_teams.submit(etl_process, "TEAMS")
+    logger.info(f"Duración de transformación y limpieza de datos: {time.time() - start_transform}")

     # Write - Insumo GROSS
+    start_load = time.time()
     etl_process.write.submit(etl_process, "VENTAS", ventas_flag)

     # Write - Insumo TEAMS
     etl_process.write.submit(etl_process, "TEAMS", teams_flag)
...
@@ -50,8 +62,9 @@ def run_etl(config: Dict[str, Any]) -> None:
     etl_process.write.submit(etl_process, "GOALS")

     # Write - Insumo PLANTA
     etl_process.write.submit(etl_process, "COMERCIAL_BASE")
+    logger.info(f"Duración de carga de datos a la BD: {time.time() - start_load}")

-    logger.info(f"Duración de ejecución del proceso ETL: {start_time - time.time()}")
+    logger.info(f"Duración de ejecución del proceso ETL General: {time.time() - start_time}")


 if __name__ == "__main__":
...
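
Note: run_etl chains Prefect tasks by handing each write.submit call the flag returned by the preceding processing task (ventas_flag, teams_flag), so writes wait on their transforms. A minimal, self-contained sketch of that pattern — invented task names, only the submit/flag wiring is the point — looks like this:

# Sketch of the Prefect task-chaining pattern used above: the downstream task
# receives the upstream result (a success flag), so it runs afterwards and can skip work.
from prefect import flow, task

@task
def process_table(name: str) -> bool:
    # Stand-in for a transformation step; returns a success flag.
    return True

@task
def write_table(name: str, prev_status: bool = True) -> None:
    if prev_status:
        print(f"writing {name}")

@flow
def mini_etl() -> None:
    ventas_flag = process_table.submit("VENTAS")
    # Passing the future makes Prefect wait for process_table before writing.
    write_table.submit("VENTAS", ventas_flag)

if __name__ == "__main__":
    mini_etl()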