Commit 9c72a0a6 authored by Cristian Aguirre

Add starroks.py

parent 15b44e0a
@@ -19,8 +19,8 @@ class ETLProcess:
self.inputs = {}
def init(self, spark_jars: Dict[str, str], mongodb_uri: str = "") -> None:
self.session = createSession(self.identifier, spark_jars, mongodb_uri)
def init(self, spark_jars: Dict[str, str], mongodb_uri: str = "", starrok_uri: str = "") -> None:
self.session = createSession(self.identifier, spark_jars, mongodb_uri, starrok_uri)
@task
def reader(self) -> None:
@@ -79,7 +79,7 @@ class ETLProcess:
self.inputs[identifier] = self.inputs[identifier].withColumn("TIPO_CANAL", lit("DIRECT"))
success = True
except Exception as e:
raise AssertionError(f"Error transformando archivo gross. {e}")
logger.error(f"Error transformando archivo gross. {e}")
finally:
return success
@@ -100,8 +100,15 @@ class ETLProcess:
@task
def write(self, identifier: str, prev_status: bool = True) -> None:
try:
self.inputs[identifier].printSchema()
self.inputs[identifier].write.format("com.mongodb.spark.sql.DefaultSource"). \
option("collection", identifier).mode("append").save()
# self.inputs[identifier].write.format("starrocks"). \
# option("dbtable", identifier).mode("overwrite").save()
self.inputs[identifier].write.format("starrocks") \
.option("starrocks.fe.http.url", "ec2-34-231-243-52.compute-1.amazonaws.com:8030") \
.option("starrocks.fe.jdbc.url", "jdbc:mysql://ec2-34-231-243-52.compute-1.amazonaws.com:9030/bcom_spark") \
.option("starrocks.table.identifier", "bcom_spark."+identifier) \
.option("starrocks.user", "root") \
.option("starrocks.password", "") \
.mode("append") \
.save()
except Exception as e:
logger.error(f"Erro guardando resultados. {e}")
import logging
from typing import Dict, Any
from pyspark.sql import SparkSession, DataFrame
from prefect import flow, task
from Input.Source import Input
logger = logging.getLogger()
class Process:
def __init__(self, config: Dict[str, Any]) -> None:
self.conf = config
self.identifier = self.conf["identifier"]
self.session = None
self.inputs = {}
def init(self) -> None:
self._createSession()
def get_inputs(self) -> None:
try:
pass
except Exception as e:
raise AssertionError(f"Error in function 'get_inputs'. {e}")
def run(self) -> None:
# Get inputs
self.get_inputs()
from typing import Dict
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession, DataFrame
import logging
logger = logging.getLogger()
def createSession(name: str, spark_jars: Dict[str, str], mongodb_uri: str = "") -> SparkSession:
def createSession(name: str, spark_jars: Dict[str, str], mongodb_uri: str, starrok_uri: str) -> SparkSession:
session = None
try:
jars = list(spark_jars.values())
jars = ",".join(jars)
print(jars)
session = SparkSession.builder \
.appName(name) \
.config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
@@ -20,10 +19,41 @@ def createSession(name: str, spark_jars: Dict[str, str], mongodb_uri: str = "")
.config("spark.jars", jars) \
.config("spark.executor.extraClassPath", jars) \
.config("spark.driver.extraClassPath", jars) \
.config("spark.mongodb.input.uri", mongodb_uri) \
.config("spark.mongodb.output.uri", mongodb_uri) \
.getOrCreate()
# .config("spark.starrocks.url", starrok_uri) \
# .config("spark.starrocks.driver", "com.starroks.jdbc.Driver") \
# .config("spark.sql.catalogImplementation", "in-memory") \
# .getOrCreate()
session._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
except Exception as e:
logger.error(f"Error creando sesion. {e}")
finally:
return session
def get_goal_by_kpi(df: DataFrame, agent: str, period: str, kpi: str) -> float:
result = 0.0
try:
df = df.filter((df["CEDULA"] == agent) & (df["PERIODO_PROCESO_CODIGO"] == period) & (df["KPI"] == kpi)). \
select("META_FINAL")
if df.count() != 0:
results = [row[0] for row in df.select("META_FINAL").collect()]
result = results[0]
except Exception as e:
logger.error(f"Error obteniendo meta por kpi. {e}")
finally:
return result
def get_execute_by_service(df: DataFrame, agent: str, period: str, segment: str) -> int:
result = 0
try:
df = df.filter((df["AGENTE_COMISIONA"] == agent) & (df["PERIODO_PROCESO_CODIGO"] == period) &
(df["SEGMENTO"] == segment))
result = df.count()
except Exception as e:
logger.error(f"Error obteniendo meta por segmento. {e}")
finally:
return result
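For a quick manual check of the two helpers above, they can be exercised against a tiny in-memory DataFrame. A minimal sketch, assuming it runs in the same module where the helpers are defined; the sample rows and values are made up for illustration:

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("helpers-demo").getOrCreate()
    # Column names mirror the filters used by the helpers; the data is hypothetical.
    goals = spark.createDataFrame(
        [("123", "202311", "GROSS", 10.0)],
        ["CEDULA", "PERIODO_PROCESO_CODIGO", "KPI", "META_FINAL"],
    )
    executed = spark.createDataFrame(
        [("123", "202311", "B2C"), ("123", "202311", "B2C")],
        ["AGENTE_COMISIONA", "PERIODO_PROCESO_CODIGO", "SEGMENTO"],
    )
    print(get_goal_by_kpi(goals, "123", "202311", "GROSS"))          # expected: 10.0
    print(get_execute_by_service(executed, "123", "202311", "B2C"))  # expected: 2
    spark.stop()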
import time
import json
import logging
from typing import Any, Dict
from prefect import flow
from prefect import flow, get_run_logger
from Pipeline.CommissionProcess import CommissionProcess
logger = logging.getLogger()
SPARK_JARS = {
"MONGO_CORE": "/opt/spark-jars/mongodb-driver-core-4.0.4.jar",
"MONGO_CLIENT": "/opt/spark-jars/mongodb-driver-sync-4.0.4.jar",
"MONGODB": "/opt/spark-jars/mongo-spark-connector_2.12-3.0.1.jar",
"BSON": "/opt/spark-jars/bson-4.0.4.jar"
}
MONGODB_URI = "mongodb://bcom_spark_user:root@192.168.1.37:50001/bcom_spark"
@flow()
def run_commission(config: Dict[str, Any]) -> None:
logger = get_run_logger()
start_time = time.time()
logger.info(f"Duración de ejecución del proceso de liquidación: {start_time - time.time()}")
commission_process = CommissionProcess(config)
# Spark connection (local mode, standalone, or cluster)
start_init = time.time()
commission_process.init(SPARK_JARS, MONGODB_URI)
logger.info(f"Duración de creación de sesión Spark: {time.time() - start_init}")
# Primer task - Extraer la data - RECORDAR: SPARK ES LAZY!!!
start_reader = time.time()
commission_process.get_inputs(commission_process)
logger.info(f"Duración de extracción de datos desde la BD: {time.time() - start_reader}")
# Tercer task - Obtener metas
start_process = time.time()
goals = commission_process.get_goals_second_way(commission_process, "VENTAS", "GOALS")
# Fifth task - Get executed records - Should the FLAG_COMISIONABLE and ACTIVE_USER_TRAFFIC filters also be applied?
executes = commission_process.get_executed_second_way(commission_process, "VENTAS", "TEAMS")
# Sixth task - Get the source amount
base = commission_process.get_source_value(commission_process, "VENTAS", "COMERCIAL_BASE")
result = commission_process.get_commission_per_agent(commission_process, goals, executes, base)
logger.info(f"Duración de procesamiento en memoria: {time.time() - start_process}")
# Task de escritura
start_load = time.time()
_ = commission_process.write_result(commission_process, result, "REPORT_SUMMARY")
logger.info(f"Duración de carga del reporte a la BD: {time.time() - start_load}")
logger.info(f"Duración de ejecución del proceso de etl: {time.time() - start_time}")
if __name__ == "__main__":
......
{
"identifier": "BCOM-SPARK-TESTS",
"period": "202311",
"inputs": {
"type": "bucket",
"params": {
@@ -19,6 +20,7 @@
"CONSULTOR_NK": "TEXT",
"CLIENTE_ID": "TEXT",
"CLIENTE_NOMBRE": "TEXT",
"CLIENTE_NATURALEZA": "TEXT",
"SERVICIO": "TEXT",
"REVENUE": "DECIMAL",
"PLAN_CODIGIO_NK": "TEXT",
......
import time
import json
import logging
from typing import Any, Dict
from prefect import flow
from prefect import flow, get_run_logger
from Pipeline.ETLProcess import ETLProcess
logger = logging.getLogger()
SPARK_JARS = {
"AWS_CORE": "/opt/spark-jars/hadoop-aws-3.3.4.jar",
"BUNDLE": "/opt/spark-jars/aws-java-sdk-bundle-1.12.431.jar",
@@ -17,23 +14,36 @@ SPARK_JARS = {
"MONGO_CORE": "/opt/spark-jars/mongodb-driver-core-4.0.4.jar",
"MONGO_CLIENT": "/opt/spark-jars/mongodb-driver-sync-4.0.4.jar",
"MONGODB": "/opt/spark-jars/mongo-spark-connector_2.12-3.0.1.jar",
"BSON": "/opt/spark-jars/bson-4.0.4.jar"
"BSON": "/opt/spark-jars/bson-4.0.4.jar",
"STARROK": "/opt/spark-jars/starrocks-spark-connector-3.4_2.12-1.1.2.jar",
"MYSQL": "/opt/spark-jars/mysql-connector-java-8.0.30.jar"
}
MONGODB_URI = "mongodb://bcom_spark_user:root@192.168.1.37:50001/bcom_spark"
STARROK_URI = "jdbc:starroks://root:@ec2-3-237-32-62.compute-1.amazonaws.com:9030/bcom_spark"
@flow
def run_etl(config: Dict[str, Any]) -> None:
logger = get_run_logger()
start_time = time.time()
etl_process = ETLProcess(config)
# Spark connection (local mode, standalone, or cluster)
etl_process.init(SPARK_JARS, MONGODB_URI)
start_init = time.time()
etl_process.init(SPARK_JARS, starrok_uri=STARROK_URI)
logger.info(f"Duración de creación de sesión Spark: {time.time() - start_init}")
# Primer task - (Reader) - Extraer los ficheros
start_reader = time.time()
etl_process.reader(etl_process)
logger.info(f"Duración de extracción de ficheros desde S3: {time.time() - start_reader}")
# Segundo task - Setear esquema a las tablas
start_transform = time.time()
etl_process.set_schema(etl_process)
# Process - Gross input (sales)
@@ -41,8 +51,10 @@ def run_etl(config: Dict[str, Any]) -> None:
# Process - Team input (teams)
teams_flag = etl_process.process_teams.submit(etl_process, "TEAMS")
logger.info(f"Duración de transformación y limpieza de datos: {time.time() - start_transform}")
# Write - Insumo GROSS
start_load = time.time()
etl_process.write.submit(etl_process, "VENTAS", ventas_flag)
# Write - TEAMS input
etl_process.write.submit(etl_process, "TEAMS", teams_flag)
@@ -50,8 +62,9 @@ def run_etl(config: Dict[str, Any]) -> None:
etl_process.write.submit(etl_process, "GOALS")
# Write - PLANTA input
etl_process.write.submit(etl_process, "COMERCIAL_BASE")
logger.info(f"Duración de carga de datos a la BD: {time.time() - start_load}")
logger.info(f"Duración de ejecución del proceso ETL: {start_time - time.time()}")
logger.info(f"Duración de ejecución del proceso ETL General: {time.time() - start_time}")
if __name__ == "__main__":
......
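The __main__ block of run_etl is collapsed in this diff. A minimal sketch of how the flow could be invoked, assuming the configuration lives in a JSON file like the one shown earlier; the file name here is hypothetical:

import json

if __name__ == "__main__":
    # Load the pipeline configuration (identifier, period, inputs, schemas, ...).
    with open("config.json", "r") as f:  # assumed file name
        config = json.load(f)
    run_etl(config)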