Commit 76fb4c10 authored by Erly Villaroel

Merge remote-tracking branch 'origin/developer_ca' into developer_ev

# Conflicts:
#	app/main/engine/enum/CodeResponseEnum.py
#	app/main/engine/service/Process.py
#	scripts/match-and-exclude-records-actions_v1.py
parents 016c0749 c1597525
@@ -44,6 +44,15 @@ class Database:
         except Exception as e:
             self.app.logger.error(f"Error cerrando básica conexión. {e}")

+    def get_dialect(self) -> str:
+        dialect = ""
+        try:
+            dialect = self.factory.get_dialect()
+        except Exception as e:
+            self.app.logger.error(f"Error obteniendo dialect. {e}")
+        finally:
+            return dialect
+
     def create_engine(self) -> None:
         try:
             if isinstance(self.engine, type(None)):
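For orientation, the new Database.get_dialect is a thin pass-through to the underlying driver factory, so callers get a ready-made connection string. A minimal self-contained sketch of that delegation; the factory class and credentials below are hypothetical stand-ins, not code from this commit:

class MysqlFactory:
    def get_dialect(self):
        return "mysql://app_user:secret@db-host:3306/recon?charset=utf8mb4"

class DatabaseLike:
    def __init__(self, factory):
        self.factory = factory
    def get_dialect(self) -> str:
        dialect = ""  # falls back to "" when the factory call fails, as above
        try:
            dialect = self.factory.get_dialect()
        finally:
            return dialect

print(DatabaseLike(MysqlFactory()).get_dialect())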
@@ -23,6 +23,7 @@ class Mysql:
         self.params = params

         self.engine = None
         self.connection = None
+        self.dialect = None

     def create_spark_connection(self):
         params = {}
@@ -46,10 +47,17 @@ class Mysql:
         finally:
             return self.connection

-    def create_engine(self) -> None:
+    def get_dialect(self) -> str:
         try:
             dialect = DatabaseDialectEnum.MYSQL.value
-            url = f"{dialect}://{self.user}:{self.password}@{self.host}:{str(self.port)}/{self.database}?charset=utf8mb4"
+            self.dialect = f"{dialect}://{self.user}:{self.password}@{self.host}:{str(self.port)}/{self.database}?charset=utf8mb4"
+        except Exception as e:
+            self.app.logger.error(f"Error obteniendo dialect de Mysql. {e}")
+        return self.dialect
+
+    def create_engine(self) -> None:
+        try:
+            url = self.get_dialect()
             self.engine = create_engine(url, pool_recycle=3600, pool_pre_ping=True, **self.params)
         except Exception as e:
             self.app.logger.error(f"Error creando engine de Mysql. {e}")
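The string assembled by get_dialect follows SQLAlchemy's URL convention, and the pool options passed to create_engine keep long-lived MySQL connections healthy. An illustrative sketch; the credentials and the exact scheme held in DatabaseDialectEnum.MYSQL.value are invented here:

from sqlalchemy import create_engine

url = "mysql://app_user:secret@db-host:3306/recon?charset=utf8mb4"  # shape of self.dialect
engine = create_engine(url, pool_recycle=3600, pool_pre_ping=True)
# pool_recycle=3600 retires connections older than an hour;
# pool_pre_ping tests each pooled connection with a lightweight ping
# before handing it out, so stale connections are replaced transparently.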
@@ -11,4 +11,4 @@ class CodeResponseEnum(Enum):
     OUTPUT_ERROR = 606
     EMPTY_DATASET = 607
     ERROR = 609
-    TIMEOUT_ERROR = 800
+    TIMEOUT = 610
@@ -4,3 +4,4 @@ from enum import Enum
 class StatusEnum(Enum):
     OK = 200
     ERROR = 609
+    TIMEOUT = 610
@@ -2,14 +2,15 @@ from typing import Dict, Any
 import time
 import traceback as traceback_lib
 import importlib
+from threading import Timer
 from config import Config as cfg
 from app.main.engine.util.Timezone import Timezone
 from app.main.engine.util.Utils import Utils
 from app.main.engine.enum.StatusEnum import StatusEnum
 from app.main.engine.enum.CodeResponseEnum import CodeResponseEnum
 from app.main.engine.database.Database import Database
-from wrapt_timeout_decorator import *

 class Process:
     def __init__(self, app, descriptor: Dict[str, Any]) -> None:
         self.app = app
@@ -21,7 +22,6 @@ class Process:
     def run(self) -> Dict[str, Any]:
         status, status_description = StatusEnum.OK, ""
         try:
-            # Getting the DB connection
             db_params = cfg.db_params
             source = Database(self.app, db_params)
@@ -44,39 +44,22 @@ class Process:
             obj_script = globals()[relation](self.app)
             obj_script.parser(self.descriptor)

-            tiempo_limite = obj_script.timeout
-            if tiempo_limite is not None:
-                @timeout(tiempo_limite)
-                def procesamiento():
-                    try:
-                        self.app.logger.info(f"Iniciando procesamiento de script")
-                        obj_script.process(source)
-
-                        # Saving the result
-                        self.app.logger.info(f"Generado y guardando resultado")
-                        response = obj_script.response()
-                        # response.show()
-                        result = self.utils.create_result(response, self.descriptor)
-                        save = self.utils.save_result(result, self.descriptor, db_session)
-                        if save["status"] == StatusEnum.ERROR.name:
-                            raise InterruptedError(save["message"])
+            # Starting the process
+            self.app.logger.info(f"Iniciando procesamiento de script")
+            obj_script.process(source)
-                    except Exception as e:
-                        raise TimeoutError(f"Tiempo límite de ejecución superado. {e}")
-                procesamiento()
-            else:
-                # Starting the process
-                self.app.logger.info(f"Iniciando procesamiento de script")
-                obj_script.process(source)
-
-                # Saving the result
-                self.app.logger.info(f"Generado y guardando resultado")
-                response = obj_script.response()
-                # response.show()
-                result = self.utils.create_result(response, self.descriptor)
-                save = self.utils.save_result(result, self.descriptor, db_session)
-                if save["status"] == StatusEnum.ERROR.name:
-                    raise InterruptedError(save["message"])
+            # Saving the result
+            self.app.logger.info(f"Generado y guardando resultado")
+            response = obj_script.response()
+            # response.show()
+            result = self.utils.create_result(response, self.descriptor)
+            save = self.utils.save_result(result, self.descriptor, db_session)
+            if save["status"] == StatusEnum.ERROR.name:
+                raise InterruptedError(save["message"])
+        except TimeoutError as e:
+            self.app.logger.error(f"Error de Timeout. Error: {e}")
+            status, status_description = CodeResponseEnum.TIMEOUT, str(e)
         except IndexError as e:
             self.app.logger.error(f"Error extrayendo insumos. Vacío. Error: {e}")
             status, status_description = CodeResponseEnum.EMPTY_DATASET, str(e)
@@ -95,9 +78,6 @@ class Process:
         except ReferenceError as e:
             self.app.logger.error(f"Error validando parámetros del descriptor. {e}")
             status, status_description = CodeResponseEnum.PARAMETERS_ERROR, str(e)
-        except TimeoutError as e:
-            self.app.logger.error(f"Error validando parámetros del descriptor. {e}")
-            status, status_description = CodeResponseEnum.TIMEOUT_ERROR, str(e)
         except Exception as e:
             traceback_lib.print_exc()
             self.app.logger.error(f"Error procesando engine. {e}")
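The timeout responsibility leaves Process.run (the removed @timeout(tiempo_limite) wrapper above) and moves into the action script, which now decorates its own inner function. The decorator pattern itself, in a self-contained sketch with an invented function:

import time
from wrapt_timeout_decorator import timeout

@timeout(2)  # seconds; raises TimeoutError when the call overruns
def slow_step():
    time.sleep(5)

try:
    slow_step()
except TimeoutError as e:
    # Process.run now maps this to CodeResponseEnum.TIMEOUT (610)
    print(f"caught: {e}")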
@@ -5,6 +5,8 @@ import shutil
 from enum import Enum
 # from pyspark.sql import SparkSession
 import json

+from app.main.engine.enum.CodeResponseEnum import CodeResponseEnum
+from app.main.engine.util.Timezone import Timezone
 # from config import Config as cfg
@@ -52,8 +54,11 @@ class Utils:
         if codeEnum.value == StatusEnum.OK.value:
             response.update({'status': StatusEnum.OK.name, 'detail': detail})
         else:
+            error = StatusEnum.ERROR.name
+            if codeEnum.value == CodeResponseEnum.TIMEOUT.value:
+                error = StatusEnum.TIMEOUT.name
             description = DescResponseEnum[codeEnum.name].value
-            response.update({'status': StatusEnum.ERROR.name, 'message': description,
+            response.update({'status': error, 'message': description,
                              'detail': detail})
         return response
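In effect the else-branch now re-labels only timeouts. A condensed, self-contained illustration using the enum values from the hunks above; status_label is a hypothetical helper, not part of Utils:

from enum import Enum

class StatusEnum(Enum):
    OK = 200
    ERROR = 609
    TIMEOUT = 610

class CodeResponseEnum(Enum):
    EMPTY_DATASET = 607
    ERROR = 609
    TIMEOUT = 610

def status_label(code: CodeResponseEnum) -> str:
    # mirror of the branch above: only TIMEOUT escapes the generic ERROR label
    error = StatusEnum.ERROR.name
    if code.value == CodeResponseEnum.TIMEOUT.value:
        error = StatusEnum.TIMEOUT.name
    return error

print(status_label(CodeResponseEnum.TIMEOUT))        # TIMEOUT
print(status_label(CodeResponseEnum.EMPTY_DATASET))  # ERROR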
@@ -65,6 +70,14 @@ class Utils:
         pivot_params = descriptor["params-input"]["pivot-config"]
         ctp_params = descriptor["params-input"]["counterpart-config"]

+        for key_p, key_c in zip(pivot_params.keys(), ctp_params.keys()):
+            if isinstance(pivot_params[key_p], str):
+                pivot_params[key_p] = "PIVOT_" + pivot_params[key_p]
+                ctp_params[key_c] = "COUNTERPART_" + ctp_params[key_c]
+            else:
+                pivot_params[key_p] = ["PIVOT_" + column for column in pivot_params[key_p]]
+                ctp_params[key_c] = ["COUNTERPART_" + column for column in ctp_params[key_c]]
+
         group_pivot_match = pivot_params["columns-group"]
         transaction_pivot_match = pivot_params["columns-transaction"]
@@ -73,7 +86,7 @@ class Utils:
         used_list = transaction_counterpart_match if exclude_pivot else transaction_pivot_match

-        if data.empty:
+        if data is None or data.empty:
             self.app.logger.info(f"El dataframe resultado esta vacio")
         else:
             for idx, i in data.iterrows():
@@ -23,16 +23,16 @@ app:
   timezone: 'GMT-5'
   time_pattern: '%Y-%m-%d %H:%M:%S'
   logging: 'INFO'
-  max_engine_threads: 2  # threads (maximum)
+  max_engine_threads: 50  # threads (maximum)

   # Make the service in a production state
   # Manage connections to the REST Service published. Allow workers to receive the connections.
   # https://docs.gunicorn.org/en/stable/
   gunicorn:
-    bind: '0.0.0.0:7500'
+    bind: '0.0.0.0:8000'
     worker_class: 'gthread'
-    threads: 8
-    worker_connections: 50
+    threads: 51
+    worker_connections: 51
     loglevel: 'debug'
     accesslog: '-'
     capture_output: True
\ No newline at end of file
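With the gthread worker class, effective concurrency is roughly workers multiplied by threads; per the gunicorn docs, worker_connections only affects the eventlet and gevent worker types, so the bump to 51 threads is the change that matters here. A quick sanity check, with the worker count assumed since no workers key appears in this hunk:

workers = 1               # assumption: not set in this diff
threads = 51
print(workers * threads)  # concurrent requests one gthread worker pool can hold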
from typing import Any, Dict, List
import importlib.util
from itertools import combinations
import multiprocessing as mp
import numpy as np
import pandas as pd
from numba import njit
from parallel_pandas import ParallelPandas
from concurrent.futures import ThreadPoolExecutor
import json
from dask import dataframe as dd
from numba import jit, types, typed
from wrapt_timeout_decorator import timeout
from app.main.engine.action.ActionInterface import ActionInterface
@@ -15,12 +14,6 @@ relation_classname_identifier = {
     "match-and-exclude-records-actions": "MatchAndExcludeRecordsAction"
 }

-# SPARK SESSION CONFIGURATION
-MASTER = "local[*]"
-DRIVER_MEMORY = "8g"
-EXECUTOR_MEMORY = "8g"
-MYSQL_JAR_PATH = "jars/mysql-connector-java-8.0.30.jar"
-
 # EXCLUDE VALIDATION FIELD
 EXCLUDE_ROWS_FIELD = "EXCLUDE_VALID"
 # DECIMAL ROUNDING
@@ -36,7 +29,6 @@ class MatchAndExcludeRecordsAction(ActionInterface):
         super().__init__(app)
         self.max_combinations = None
         self.timeout = None
-        self.comb_per_group = None
         self.exclude_pivot = None
         self.pivot_params = None
         self.ctp_params = None
@@ -77,267 +69,257 @@ class MatchAndExcludeRecordsAction(ActionInterface):
             raise ReferenceError(f"Parámetro *{param}* no encontrado en pivot o contraparte")

         self.max_combinations = configs["max-records-per-combinations"]
-        if "max-timeout-per-combinations" in configs:
-            self.timeout = configs["max-timeout-per-combinations"]
+        self.timeout = configs["max-timeout-per-combinations"]
         self.exclude_pivot = configs["exclude-entity-pivot"]
         self.pivot_params = pivot_params
         self.ctp_params = ctp_params
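Note the timeout key becomes effectively mandatory: the guard around it is removed, so a descriptor without it now raises KeyError. A sketch of the configuration block parser reads here, with key names taken from this hunk and values invented:

configs = {
    "max-records-per-combinations": 10,    # caps the combination size searched per group
    "max-timeout-per-combinations": 300,   # seconds, fed to @timeout in process()
    "exclude-entity-pivot": True,          # exclude rows from the pivot (else the counterpart)
}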
-    def process(self, source_obj):
-        # Initialize the Spark session
-        session = self.createSession()
-
-        # Pull the data from the DB for both pivot and counterpart
-        pivot_table, ctp_table = self.pivot_params["tablename"], self.ctp_params["tablename"]
-        jdbc_conn = source_obj.create_spark_connection()
-        jdbc_url = jdbc_conn["url"]
-        jdbc_properties = jdbc_conn["properties"]
-        pivot_df = session.read.jdbc(url=jdbc_url, table=pivot_table, properties=jdbc_properties)
-        ctp_df = session.read.jdbc(url=jdbc_url, table=ctp_table, properties=jdbc_properties)
-
-        # Add a prefix to every column, for pivot and counterpart alike, and update the input fields.
-        # pivot: 'PIVOT_', counterpart: 'COUNTERPART_'
-        for column in pivot_df.columns:
-            if column == EXCLUDE_ROWS_FIELD:
-                continue
-            pivot_df = pivot_df.withColumnRenamed(column, "PIVOT_" + column)
-        for column in ctp_df.columns:
-            if column == EXCLUDE_ROWS_FIELD:
-                continue
-            ctp_df = ctp_df.withColumnRenamed(column, "COUNTERPART_" + column)
-
-        for key_p, key_c in zip(self.pivot_params.keys(), self.ctp_params.keys()):
-            if isinstance(self.pivot_params[key_p], str):
-                self.pivot_params[key_p] = "PIVOT_" + self.pivot_params[key_p]
-                self.ctp_params[key_c] = "COUNTERPART_" + self.ctp_params[key_c]
-            else:
-                self.pivot_params[key_p] = ["PIVOT_" + column for column in self.pivot_params[key_p]]
-                self.ctp_params[key_c] = ["COUNTERPART_" + column for column in self.ctp_params[key_c]]
-
-        from pyspark.sql.functions import sum, collect_list, round, when, col, lit
-        pivot_cols = self.pivot_params["columns-transaction"].copy()
-        if self.pivot_params["amount-column"] in pivot_cols:
-            pivot_cols.remove(self.pivot_params["amount-column"])
-
-        ctp_cols = self.ctp_params["columns-transaction"].copy()
-        if self.ctp_params["amount-column"] in ctp_cols:
-            ctp_cols.remove(self.ctp_params["amount-column"])
-
-        comb_per_group = self.comb_per_group
-        max_combinations = self.max_combinations
-
-        # Run the record-exclusion logic
-        if len(self.pivot_params["columns-group"]) == 0 and len(self.ctp_params["columns-group"]) == 0:
-            raise RuntimeError(f"Debe haber al menos pivot o contraparte agrupado")
-
-        # Case: 1 - Many
-        elif len(self.pivot_params["columns-group"]) == 0 and len(self.ctp_params["columns-group"]) > 0:
-            ctp_df2 = ctp_df.groupby(self.ctp_params["columns-group"]). \
-                agg(round(sum(self.ctp_params["amount-column"]), ROUND_DECIMAL).alias(self.ctp_params["amount-column"]),
-                    collect_list(self.ctp_params["id-column"]).alias(self.ctp_params["id-column"]))
-            pivot_df2 = pivot_df
-
-        # Case: Many - 1
-        elif len(self.pivot_params["columns-group"]) > 0 and len(self.ctp_params["columns-group"]) == 0:
-            pivot_df2 = pivot_df.groupby(self.pivot_params["columns-group"]). \
-                agg(round(sum(self.pivot_params["amount-column"]), ROUND_DECIMAL).alias(self.pivot_params["amount-column"]),
-                    collect_list(self.pivot_params["id-column"]).alias(self.pivot_params["id-column"]))
-            ctp_df2 = ctp_df
-
-        # Case: Many - Many
-        elif len(self.pivot_params["columns-group"]) > 0 and len(self.ctp_params["columns-group"]) > 0:
-            pivot_df2 = pivot_df.groupby(self.pivot_params["columns-group"]). \
-                agg(round(sum(self.pivot_params["amount-column"]), ROUND_DECIMAL).alias(self.pivot_params["amount-column"]),
-                    collect_list(self.pivot_params["id-column"]).alias(self.pivot_params["id-column"]))
-            ctp_df2 = ctp_df.groupby(self.ctp_params["columns-group"]). \
-                agg(round(sum(self.ctp_params["amount-column"]), ROUND_DECIMAL).alias(self.ctp_params["amount-column"]),
-                    collect_list(self.ctp_params["id-column"]).alias(self.ctp_params["id-column"]))
-
-        condition = [pivot_df2[col1] == ctp_df2[col2] for col1, col2 in zip(self.pivot_params["columns-transaction"],
-                                                                            self.ctp_params["columns-transaction"])]
-        total_merged = pivot_df2.join(ctp_df2, condition, 'left')
-        total_merged = total_merged.withColumn("DIFF", when(col(self.ctp_params["columns-transaction"][0]).isNotNull(),
-                                                            lit(0)).otherwise(lit(None)))
-        total_merged = total_merged.select(*pivot_df2.columns, "DIFF")
-
-        condition = [total_merged[col1] == ctp_df2[col2] for col1, col2 in zip(pivot_cols, ctp_cols)]
-        merged = total_merged.join(ctp_df2, condition)
-        merged = merged.withColumn("DIFF", when(col("DIFF").isNull(),
-                                                total_merged[self.pivot_params["amount-column"]] - ctp_df2[self.ctp_params["amount-column"]]).otherwise(col("DIFF")))
-        merged_df = merged.withColumn("DIFF", round(merged["DIFF"], ROUND_DECIMAL))
-
-        if self.exclude_pivot:
-            df = pivot_df
-            group_cols = self.pivot_params["columns-group"]
-            amount_col = self.pivot_params["amount-column"]
-            id_col = self.pivot_params["id-column"]
-        else:
-            df = ctp_df
-            group_cols = self.ctp_params["columns-group"]
-            amount_col = self.ctp_params["amount-column"]
-            id_col = self.ctp_params["id-column"]
-
-        total_tmp_cols = group_cols + ["DIFF"]
-        df3 = df.join(merged_df.select(*total_tmp_cols), group_cols)
-        df3 = df3.toPandas()
-
-        total_cols = group_cols + [amount_col, id_col, EXCLUDE_ROWS_FIELD, "DIFF"]
-        # ParallelPandas.initialize(n_cpu=mp.cpu_count(), split_factor=8, disable_pr_bar=True)
-        df3 = df3.sort_values(group_cols + [amount_col])
-        resultado = df3[total_cols].groupby(group_cols).apply(lambda x: custom_func(x, amount_col, id_col, max_combinations))
-        resultado = resultado.reset_index()
-        if len(resultado.columns) == 1:
-            resultado = pd.DataFrame([], columns=group_cols + ["LISTA_DIFF"])
-        else:
-            resultado.columns = group_cols + ["LISTA_DIFF"]
-        # print(resultado["LISTA_DIFF"].apply(lambda x: x if pd.notna(x) and x[0]!=-1 else x))
-        meged2 = resultado.merge(merged_df.toPandas(), 'left', group_cols)
-        print(meged2)
-        meged2["LISTA_DIFF"] = meged2["LISTA_DIFF"].apply(self.handle_array)
-        meged2 = meged2[(meged2['DIFF'] == 0) | ((meged2['DIFF'] != 0) & (meged2['LISTA_DIFF'].apply(len) > 0))]
-
-        if meged2.empty:
-            pass
-        elif self.exclude_pivot:
-            meged2['INTER_PIVOT_ID'] = meged2.apply(lambda row: self.array_except(row[self.pivot_params["id-column"]], row['LISTA_DIFF']), axis=1)
-            meged2 = meged2.rename(columns={self.ctp_params["id-column"]: "INTER_CTP_ID"})
-            if meged2['INTER_CTP_ID'].dtype == 'int64':
-                merged_df['INTER_CTP_ID'] = merged_df['INTER_CTP_ID'].apply(lambda x: [x]).astype('object')
-        else:
-            meged2['INTER_CTP_ID'] = meged2.apply(lambda row: self.array_except(row[self.ctp_params["id-column"]], row['LISTA_DIFF']), axis=1)
-            meged2 = meged2.rename(columns={self.pivot_params["id-column"]: "INTER_PIVOT_ID"})
-            if meged2['INTER_PIVOT_ID'].dtype == 'int64':
-                merged_df['INTER_PIVOT_ID'] = merged_df['INTER_PIVOT_ID'].apply(lambda x: [x]).astype('object')
-        self.output = meged2
+    def process(self, source_obs):
+        try:
+            @timeout(self.timeout)
+            def __process(source_obj):
+                # Pull the data from the DB for both pivot and counterpart
+                pivot_table, ctp_table = self.pivot_params["tablename"], self.ctp_params["tablename"]
+                dialect = source_obj.get_dialect()
+                pivot_df = dd.read_sql_table(pivot_table, dialect, index_col=self.pivot_params["id-column"],
+                                             npartitions=4)
+                pivot_df = pivot_df.reset_index()
+                ctp_df = dd.read_sql_table(ctp_table, dialect, index_col=self.ctp_params["id-column"], npartitions=4)
+                ctp_df = ctp_df.reset_index()
+
+                # Add a prefix to every column, for pivot and counterpart alike, and update the input fields.
+                # pivot: 'PIVOT_', counterpart: 'COUNTERPART_'
+                # Iterate over the DataFrame columns
+                for column in pivot_df.columns:
+                    if column == EXCLUDE_ROWS_FIELD:
+                        continue
+                    new_column_name = "PIVOT_" + column
+                    pivot_df = pivot_df.rename(columns={column: new_column_name})
+                for column in ctp_df.columns:
+                    if column == EXCLUDE_ROWS_FIELD:
+                        continue
+                    new_column_name = "COUNTERPART_" + column
+                    ctp_df = ctp_df.rename(columns={column: new_column_name})
+
+                for key_p, key_c in zip(self.pivot_params.keys(), self.ctp_params.keys()):
+                    if isinstance(self.pivot_params[key_p], str):
+                        self.pivot_params[key_p] = "PIVOT_" + self.pivot_params[key_p]
+                        self.ctp_params[key_c] = "COUNTERPART_" + self.ctp_params[key_c]
+                    else:
+                        self.pivot_params[key_p] = ["PIVOT_" + column for column in self.pivot_params[key_p]]
+                        self.ctp_params[key_c] = ["COUNTERPART_" + column for column in self.ctp_params[key_c]]
+
+                from pyspark.sql.functions import sum, collect_list, round, when, col, lit
+                pivot_cols = self.pivot_params["columns-transaction"].copy()
+                if self.pivot_params["amount-column"] in pivot_cols:
+                    pivot_cols.remove(self.pivot_params["amount-column"])
+
+                ctp_cols = self.ctp_params["columns-transaction"].copy()
+                if self.ctp_params["amount-column"] in ctp_cols:
+                    ctp_cols.remove(self.ctp_params["amount-column"])
+
+                max_combinations = self.max_combinations
+
+                # Run the record-exclusion logic
+                if len(self.pivot_params["columns-group"]) == 0 and len(self.ctp_params["columns-group"]) == 0:
+                    raise RuntimeError(f"Debe haber al menos pivot o contraparte agrupado")
+
+                # Case: 1 - Many
+                elif len(self.pivot_params["columns-group"]) == 0 and len(self.ctp_params["columns-group"]) > 0:
+                    ctp_df2 = ctp_df.groupby(self.ctp_params["columns-group"]).agg({
+                        self.ctp_params["amount-column"]: 'sum',  # sum the amount column
+                        self.ctp_params["id-column"]: list
+                    })
+                    ctp_df2 = ctp_df2.reset_index()
+                    pivot_df2 = pivot_df
+
+                # Case: Many - 1
+                elif len(self.pivot_params["columns-group"]) > 0 and len(self.ctp_params["columns-group"]) == 0:
+                    pivot_df2 = pivot_df.groupby(self.pivot_params["columns-group"]).agg({
+                        self.pivot_params["amount-column"]: 'sum',
+                        self.pivot_params["id-column"]: list
+                    })
+                    pivot_df2 = pivot_df2.reset_index()
+                    ctp_df2 = ctp_df
+
+                # Case: Many - Many
+                elif len(self.pivot_params["columns-group"]) > 0 and len(self.ctp_params["columns-group"]) > 0:
+                    pivot_df2 = pivot_df.groupby(self.pivot_params["columns-group"]).agg({
+                        self.pivot_params["amount-column"]: 'sum',
+                        self.pivot_params["id-column"]: list
+                    })
+                    pivot_df2 = pivot_df2.reset_index()
+                    ctp_df2 = ctp_df.groupby(self.ctp_params["columns-group"]).agg({
+                        self.ctp_params["amount-column"]: 'sum',  # sum the amount column
+                        self.ctp_params["id-column"]: list
+                    })
+                    ctp_df2 = ctp_df2.reset_index()
+
+                pivot_df2[self.pivot_params["amount-column"]] = pivot_df2[self.pivot_params["amount-column"]].round(
+                    ROUND_DECIMAL)
+                ctp_df2[self.ctp_params["amount-column"]] = ctp_df2[self.ctp_params["amount-column"]].round(
+                    ROUND_DECIMAL)
+
+                total_merged = pivot_df2.merge(ctp_df2, 'left', left_on=self.pivot_params["columns-transaction"],
+                                               right_on=self.ctp_params["columns-transaction"])
+                total_merged = total_merged.map_partitions(self.add_diff_column)
+
+                selected_columns = list(pivot_df2.columns) + ['DIFF']
+                total_merged = total_merged[selected_columns]
+
+                merged = total_merged.merge(ctp_df2, 'inner', left_on=pivot_cols, right_on=ctp_cols)
+                merged['DIFF'] = merged['DIFF'].where(merged['DIFF'].notnull(),
+                                                      merged[self.pivot_params["amount-column"]] - merged[
+                                                          self.ctp_params["amount-column"]])
+                if len(self.pivot_params["columns-group"]) == 0 and len(self.ctp_params["columns-group"]) > 0:
+                    merged = merged.drop_duplicates(subset=pivot_cols)
+                elif len(self.pivot_params["columns-group"]) > 0 and len(self.ctp_params["columns-group"]) == 0:
+                    merged = merged.drop_duplicates(subset=ctp_cols)
+
+                merged_df = merged.assign(DIFF=lambda partition: partition["DIFF"].round(ROUND_DECIMAL))
+
+                if self.exclude_pivot:
+                    df = pivot_df
+                    group_cols = self.pivot_params["columns-group"]
+                    amount_col = self.pivot_params["amount-column"]
+                    id_col = self.pivot_params["id-column"]
+                else:
+                    df = ctp_df
+                    group_cols = self.ctp_params["columns-group"]
+                    amount_col = self.ctp_params["amount-column"]
+                    id_col = self.ctp_params["id-column"]
+
+                total_tmp_cols = group_cols + ["DIFF"]
+                df3 = df.merge(merged_df[total_tmp_cols], 'inner', on=group_cols)
+                df3 = df3.compute()
+
+                total_cols = group_cols + [amount_col, id_col, EXCLUDE_ROWS_FIELD, "DIFF"]
+                resultado = df3.groupby(group_cols)[total_cols].apply(lambda x: custom_func(x, amount_col, id_col, max_combinations))
+                resultado = resultado.reset_index()
+                if len(resultado.columns) == 1:
+                    resultado = pd.DataFrame([], columns=group_cols + ["LISTA_DIFF"])
+                else:
+                    resultado.columns = group_cols + ["LISTA_DIFF"]
+                resultado = dd.from_pandas(resultado, npartitions=4)
+
+                meged2 = resultado.merge(merged_df, 'left', group_cols)
+                meged2 = meged2.map_partitions(lambda partition: partition.assign(
+                    LISTA_DIFF=partition['LISTA_DIFF'].apply(lambda x: [] if pd.isna(x) else x)), meta=meged2.dtypes.to_dict())
+                meged2 = meged2[
+                    (meged2['DIFF'] == 0) |
+                    ((meged2['DIFF'] != 0) & meged2['LISTA_DIFF'].apply(
+                        lambda x: True if not pd.isna(x) and ((isinstance(x, List) and len(x) > 0) or (isinstance(x, str) and len(x) > 2)) else False))
+                ]
+                meged2 = meged2.compute()
+
+                if meged2.empty:
+                    pass
+                elif self.exclude_pivot:
+                    meged2['INTER_PIVOT_ID'] = meged2.apply(lambda row: self.array_except(row[self.pivot_params["id-column"]], row['LISTA_DIFF']), axis=1)
+                    meged2 = meged2.rename(columns={self.ctp_params["id-column"]: "INTER_CTP_ID"})
+                    if meged2['INTER_CTP_ID'].dtype == 'int64':
+                        meged2['INTER_CTP_ID'] = meged2['INTER_CTP_ID'].apply(lambda x: [x]).astype('object')
+                else:
+                    meged2['INTER_CTP_ID'] = meged2.apply(lambda row: self.array_except(row[self.ctp_params["id-column"]], row['LISTA_DIFF']), axis=1)
+                    meged2 = meged2.rename(columns={self.pivot_params["id-column"]: "INTER_PIVOT_ID"})
+                    if meged2['INTER_PIVOT_ID'].dtype == 'int64':
+                        meged2['INTER_PIVOT_ID'] = meged2['INTER_PIVOT_ID'].apply(lambda x: [x]).astype('object')
+                return meged2
+
+            self.output = __process(source_obs)
+        except TimeoutError as e:
+            raise TimeoutError(f"Tiempo límite superado. {e}")
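dd.read_sql_table, used at the top of __process, loads each table lazily through the SQLAlchemy URL that get_dialect returns. A standalone sketch, with the connection string and table/column names invented:

from dask import dataframe as dd

url = "mysql://app_user:secret@db-host:3306/recon"  # shape of source_obj.get_dialect()
pivot_df = dd.read_sql_table("pivot_table", url, index_col="ID", npartitions=4)
pivot_df = pivot_df.reset_index()  # ID becomes an ordinary column again
# Nothing is actually read until .compute() (or an equivalent) materializes the partitions.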
     def response(self):
         return self.output

+    def add_diff_column(self, partition):
+        partition['DIFF'] = np.where(partition[self.ctp_params["columns-transaction"][0]].notnull(), 0, np.nan)
+        return partition
+
-    def handle_array(self, x):
-        # print(type(x))
-        if isinstance(x, np.ndarray):
-            return x
-        else:
-            return []
-
     def array_except(self, arr1, arr2):
         # print(arr2)
         if arr2 is None:
             return arr1
-        else:
-            return [item for item in arr1 if item not in arr2]
-
-    def createSession(self, name: str = "app_engine_spark"):
-        try:
-            from pyspark.sql import SparkSession
-            session = SparkSession.builder.master(MASTER) \
-                .appName(name) \
-                .config("spark.jars", MYSQL_JAR_PATH) \
-                .config("spark.executor.extraClassPath", MYSQL_JAR_PATH) \
-                .config("spark.driver.extraClassPath", MYSQL_JAR_PATH) \
-                .config("spark.driver.memory", DRIVER_MEMORY) \
-                .config("spark.executor.memory", EXECUTOR_MEMORY) \
-                .getOrCreate()
-            self.app.logger.info(f"Sesión creada exitosamente")
-            return session
-        except Exception as e:
-            raise Exception(f"Error creando sesion Spark. {e}")
+        elif not isinstance(arr2, List):
+            cadena_sin_corchetes = arr2.strip('[]')
+            partes = cadena_sin_corchetes.split()
+            # print(partes)
+            arr2 = [int(numero) for numero in partes]
+        arr1 = json.loads(arr1.replace(" ", ""))
+        return [item for item in arr1 if item not in arr2]
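The reworked array_except now tolerates IDs that come back from pandas as serialized strings rather than lists. A free-function version of the same logic (self dropped) with an invented round-trip:

import json
from typing import List

def array_except(arr1, arr2):
    if arr2 is None:
        return arr1
    elif not isinstance(arr2, List):  # e.g. the string "[4 2]" from a stringified array
        arr2 = [int(n) for n in arr2.strip('[]').split()]
    arr1 = json.loads(arr1.replace(" ", ""))  # e.g. the string "[1, 2, 3, 4]"
    return [item for item in arr1 if item not in arr2]

print(array_except("[1, 2, 3, 4]", "[4 2]"))  # [1, 3]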
 def custom_func(group, amount_field, id_field, max_combinations):
-    diff = group["DIFF"].values[0]
+    diff_value = group["DIFF"].values[0]
+    if np.isnan(diff_value):
+        return None
+    diff = int(diff_value * (10 ** ROUND_DECIMAL))
-    if pd.isna(diff) or diff == 0:
-        return None
     group = group[group[EXCLUDE_ROWS_FIELD] == 'S']
     group[amount_field] = group[amount_field].astype(float)
     group = group.reset_index(drop=True)
     values = group[amount_field].values
+    values *= (10 ** ROUND_DECIMAL)
+    values = values.astype(np.int64)
     ids = group[id_field].values
+    tam = len(values)
+    tam = tam if tam <= max_combinations else max_combinations
-    n = len(values)
-
-    valores1 = encontrar_comb_1(values, diff)
-    if valores1[0] != -1:
-        indices = ids[valores1]
-        return indices
-
-    valores2 = encontrar_comb_2(values, diff, n)
-    if valores2[0] != -1:
-        indices = ids[valores2]
-        return indices
-
-    # Iterate over every possible index
-    # valores4 = encontrar_comb_4(values, diff, n)
-    # if valores4[0] != -1:
-    #     indices = ids[valores4]
-    #     return indices
-
-    valores5 = encontrar_comb_5(values, diff, n)
-    if valores5[0] != -1:
-        indices = ids[valores5]
-        return indices
-
-@njit
-def encontrar_comb_1(valores, target):
-    indice = [-1]
-    for idx, value in enumerate(valores):
-        suma = value
-        if round(suma, ROUND_DECIMAL) == target:
-            indice = [idx for idx, val in enumerate(valores) if val in [value]]
-            return indice
-    return indice
-
-@njit
-def encontrar_comb_2(valores, target, n):
-    indice = [-1]
-    for i in range(n):
-        array_except = np.delete(valores, i)
-        for idx, value in enumerate(array_except):
-            suma = value + valores[i]
-            if round(suma, ROUND_DECIMAL) == target:
-                indice = [idx for idx, val in enumerate(valores) if val in [value, valores[i]]]
-                return indice
-    return indice
-
-@njit
-def encontrar_comb_4(valores, target, n):
-    indice = [-1]
-    for i in range(n):
-        a1 = np.delete(valores, i)
-        for j in range(len(a1)):
-            a2 = np.delete(a1, j)
-            for k in range(len(a2)):
-                array_except = np.delete(a2, k)
-                for idx, value in enumerate(array_except):
-                    suma = value + valores[i] + a1[j] + a2[k]
-                    if round(suma, ROUND_DECIMAL) == target:
-                        indice = [idx for idx, val in enumerate(valores) if val in [value, valores[i], a1[j], a2[k]]]
-                        return indice
-    return indice
-
-@njit
-def encontrar_comb_5(valores, target, n):
-    indice = [-1]
-    for i in range(n):
-        a1 = np.delete(valores, i)
-        for j in range(len(a1)):
-            a2 = np.delete(a1, j)
-            for k in range(len(a2)):
-                a3 = np.delete(a2, k)
-                for l in range(len(a3)):
-                    array_except = np.delete(a3, l)
-                    for idx, value in enumerate(array_except):
-                        suma = value + valores[i] + a1[j] + a2[k] + a3[l]
-                        if round(suma, ROUND_DECIMAL) == target:
-                            indice = [idx for idx, val in enumerate(valores) if val in [value, valores[i], a1[j], a2[k], a3[l]]]
-                            return indice
-    return indice
+    result = subset_sum_iter(values, diff, tam)
+    indices = ids[np.isin(values, result)]
+    return indices
+
+@jit(nopython=False)
+def subset_sum_iter(numbers, target, num_elements):
+    # Initialize the solutions list
+    final = typed.List.empty_list(types.int64)
+    for step in range(1, num_elements + 1):
+        # Build the first index set from the first 'step' positions
+        indices = list(range(step))
+        while True:
+            for i in range(step):
+                if indices[i] != i + len(numbers) - step:
+                    break
+            else:
+                # No combinations left
+                break
+            # Increase the current index and all the ones after it
+            indices[i] += 1
+            for j in range(i + 1, step):
+                indices[j] = indices[j - 1] + 1
+            # Check the current solution
+            solution = typed.List.empty_list(types.int64)
+            for i in indices:
+                solution.append(numbers[i])
+            if round(sum(solution), ROUND_DECIMAL) == target:
+                final = solution
+                break
+        if len(final) > 0:
+            break
+    return final
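subset_sum_iter replaces the fixed-depth encontrar_comb_* family: it enumerates index combinations of growing size and stops at the first subset whose sum of scaled integer amounts hits DIFF. The intended search, in plain Python with numba stripped and values invented:

from itertools import combinations

def subset_sum(numbers, target, num_elements):
    for step in range(1, num_elements + 1):
        for combo in combinations(numbers, step):
            if sum(combo) == target:
                return list(combo)
    return []

# amounts pre-scaled by 10**ROUND_DECIMAL to integers, as custom_func now does
print(subset_sum([1050, 2500, 725, 1725], 2450, 3))  # [725, 1725]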