Update action-exclude-records-v1

b0fd6670 · Cristian Aguirre · 06da121b · b0fd6670 · b0fd6670 · b0fd6670
Commit b0fd6670 authored May 06, 2024 by Cristian Aguirre
5 changed files
--- a/app/main/engine/enum/CodeResponseEnum.py
+++ b/app/main/engine/enum/CodeResponseEnum.py
@@ -11,3 +11,4 @@ class CodeResponseEnum(Enum):
    OUTPUT_ERROR = 606
    EMPTY_DATASET = 607
    ERROR = 609
+    TIMEOUT = 610
--- a/app/main/engine/enum/StatusEnum.py
+++ b/app/main/engine/enum/StatusEnum.py
@@ -4,3 +4,4 @@ from enum import Enum
 class StatusEnum(Enum):
    OK = 200
    ERROR = 609
+    TIMEOUT = 610
--- a/app/main/engine/service/Process.py
+++ b/app/main/engine/service/Process.py
@@ -57,6 +57,9 @@ class Process:
            save = self.utils.save_result(result, self.descriptor, db_session)
            if save["status"] == StatusEnum.ERROR.name:
                raise InterruptedError(save["message"])
+        except TimeoutError as e:
+            self.app.logger.error(f"Error de Timeout. Error: {e}")
+            status, status_description = CodeResponseEnum.TIMEOUT, str(e)
        except IndexError as e:
            self.app.logger.error(f"Error extrayendo insumos. Vacío. Error: {e}")
            status, status_description = CodeResponseEnum.EMPTY_DATASET, str(e)

--- a/app/main/engine/util/Utils.py
+++ b/app/main/engine/util/Utils.py
@@ -5,6 +5,8 @@ import shutil
 from enum import Enum
 # from pyspark.sql import SparkSession
 import json
+from app.main.engine.enum.CodeResponseEnum import CodeResponseEnum
 from app.main.engine.util.Timezone import Timezone
 # from config import Config as cfg
@@ -52,8 +54,11 @@ class Utils:
        if codeEnum.value == StatusEnum.OK.value:
            response.update({'status': StatusEnum.OK.name, 'detail': detail})
        else:
+            error = StatusEnum.ERROR.name
+            if codeEnum.value == CodeResponseEnum.TIMEOUT.value:
+                error = StatusEnum.TIMEOUT.name
            description = DescResponseEnum[codeEnum.name].value
-            response.update({'status': StatusEnum.ERROR.name, 'message': description,
+            response.update({'status': error, 'message': description,
                             'detail': detail})
        return response
@@ -65,6 +70,14 @@ class Utils:
            pivot_params = descriptor["params-input"]["pivot-config"]
            ctp_params = descriptor["params-input"]["counterpart-config"]
+            for key_p, key_c in zip(pivot_params.keys(), ctp_params.keys()):
+                if isinstance(pivot_params[key_p], str):
+                    pivot_params[key_p] = "PIVOT_" + pivot_params[key_p]
+                    ctp_params[key_c] = "COUNTERPART_" + ctp_params[key_c]
+                else:
+                    pivot_params[key_p] = ["PIVOT_" + column for column in pivot_params[key_p]]
+                    ctp_params[key_c] = ["COUNTERPART_" + column for column in ctp_params[key_c]]
            group_pivot_match = pivot_params["columns-group"]
            transaction_pivot_match = pivot_params["columns-transaction"]
@@ -73,7 +86,7 @@ class Utils:
            used_list = transaction_counterpart_match if exclude_pivot else transaction_pivot_match
-            if data.empty:
+            if data is None or data.empty:
                self.app.logger.info(f"El dataframe resultado esta vacio")
            else:
                for idx, i in data.iterrows():

--- a/scripts/match-and-exclude-records-actions_v1.py
+++ b/scripts/match-and-exclude-records-actions_v1.py
-from typing import Any, Dict, List
+from typing import Any, Dict
 import importlib.util
-from itertools import combinations
-import multiprocessing as mp
 import numpy as np
 import pandas as pd
-from numba import njit
+import multiprocessing as mp
 from parallel_pandas import ParallelPandas
-from concurrent.futures import ThreadPoolExecutor
+from wrapt_timeout_decorator import timeout
 from app.main.engine.action.ActionInterface import ActionInterface
@@ -35,12 +33,12 @@ class MatchAndExcludeRecordsAction(ActionInterface):
    def __init__(self, app) -> None:
        super().__init__(app)
        self.max_combinations = None
-        self.comb_per_group = None
+        self.timeout = None
        self.exclude_pivot = None
        self.pivot_params = None
        self.ctp_params = None
        self.output = None
-        self.config_params = ["max-records-per-combinations", "max-combinations-per-group", "exclude-entity-pivot"]
+        self.config_params = ["max-records-per-combinations", "max-timeout-per-combinations", "exclude-entity-pivot"]
    def parser(self, descriptor: Dict[str, Any]):
        # Validar si pyspark y su versión está instalada
@@ -76,13 +74,15 @@ class MatchAndExcludeRecordsAction(ActionInterface):
                raise ReferenceError(f"Parámetro *{param}* no encontrado en pivot o contraparte")
        self.max_combinations = configs["max-records-per-combinations"]
-        self.comb_per_group = configs["max-combinations-per-group"]
+        self.timeout = configs["max-timeout-per-combinations"]
        self.exclude_pivot = configs["exclude-entity-pivot"]
        self.pivot_params = pivot_params
        self.ctp_params = ctp_params
-    def process(self, source_obj):
+    def process(self, source_obs):
+        try:
+            @timeout(self.timeout)
+            def __process(source_obj):
                # Inicializar la sesion de Spark
                session = self.createSession()
@@ -124,7 +124,6 @@ class MatchAndExcludeRecordsAction(ActionInterface):
                if self.ctp_params["amount-column"] in ctp_cols:
                    ctp_cols.remove(self.ctp_params["amount-column"])
-        comb_per_group = self.comb_per_group
                max_combinations = self.max_combinations
                # Ejecutamos lógica de excluir registros
@@ -188,20 +187,20 @@ class MatchAndExcludeRecordsAction(ActionInterface):
                df3 = df3.toPandas()
                total_cols = group_cols + [amount_col, id_col, EXCLUDE_ROWS_FIELD,  "DIFF"]
-        # ParallelPandas.initialize(n_cpu=mp.cpu_count(), split_factor=8, disable_pr_bar=True)
+                ParallelPandas.initialize(n_cpu=mp.cpu_count(), split_factor=8, disable_pr_bar=True)
                df3 = df3.sort_values(group_cols + [amount_col])
-        resultado = df3[total_cols].groupby(group_cols).apply(lambda x: custom_func(x, amount_col, id_col, max_combinations))
+                resultado = df3[total_cols].groupby(group_cols).p_apply(lambda x: custom_func(x, amount_col, id_col, max_combinations))
                resultado = resultado.reset_index()
                if len(resultado.columns) == 1:
                    resultado = pd.DataFrame([], columns=group_cols + ["LISTA_DIFF"])
                else:
                    resultado.columns = group_cols + ["LISTA_DIFF"]
-        # print(resultado["LISTA_DIFF"].apply(lambda x: x if pd.notna(x) and x[0]!=-1 else x))
                meged2 = resultado.merge(merged_df.toPandas(), 'left', group_cols)
-        print(meged2)
                meged2["LISTA_DIFF"] = meged2["LISTA_DIFF"].apply(self.handle_array)
                meged2 = meged2[(meged2['DIFF'] == 0) | ((meged2['DIFF'] != 0) & (meged2['LISTA_DIFF'].apply(len) > 0))]
                if meged2.empty:
                    pass
                elif self.exclude_pivot:
@@ -215,7 +214,12 @@ class MatchAndExcludeRecordsAction(ActionInterface):
                    if meged2['INTER_PIVOT_ID'].dtype == 'int64':
                        merged_df['INTER_PIVOT_ID'] = merged_df['INTER_PIVOT_ID'].apply(lambda x: [x]).astype('object')
-        self.output = meged2
+                return meged2
+            # Call the decorated worker inside the try block: the timeout decorator raises
+            # TimeoutError when __process runs, not when it is defined, so the call must be
+            # guarded for the except clause below to have any effect.
+            self.output = __process(source_obs)
+        except TimeoutError as e:
+            # Chain the original exception so the traceback keeps the real cause.
+            raise TimeoutError(f"Tiempo límite superado. {e}") from e
    def response(self):
        return self.output
@@ -251,91 +255,57 @@ class MatchAndExcludeRecordsAction(ActionInterface):
 def custom_func(group, amount_field, id_field, max_combinations):
+    """Pick the ids of the excludable rows of one group whose amounts add up to its DIFF.
+
+    DIFF and the amounts are scaled by 10**ROUND_DECIMAL and handled as integers so that
+    sums can be compared exactly. Returns None when DIFF is missing or zero; otherwise the
+    array of ids selected through subset_sum_iter (empty when no combination matches).
+    """
-    diff = group["DIFF"].values[0]
+    raw_diff = group["DIFF"].values[0]
+    # NaN has to be detected before the integer conversion (int(NaN) raises ValueError), and
+    # the scaled value is rounded, not truncated (0.29 * 100 == 28.999999999999996).
+    diff = 0 if pd.isna(raw_diff) else int(round(raw_diff * (10**ROUND_DECIMAL)))
    if pd.isna(diff) or diff == 0:
        return None
    group = group[group[EXCLUDE_ROWS_FIELD] == 'S']
    group[amount_field] = group[amount_field].astype(float)
    group = group.reset_index(drop=True)
    values = group[amount_field].values
+    # Round to the nearest integer unit; a plain astype(np.int64) would truncate float noise.
+    values = np.rint(values * (10**ROUND_DECIMAL)).astype(np.int64)
    ids = group[id_field].values
+    # Never ask for subsets larger than the number of candidate rows.
+    tam = min(len(values), max_combinations)
-    n = len(values)
+    result = subset_sum_iter(values, diff, tam)
-    valores1 = encontrar_comb_1(values, diff)
+    # NOTE(review): np.isin flags every row whose amount appears in result, so rows with a
+    # duplicated amount that are not part of the chosen subset are selected too - confirm.
+    indices = ids[np.isin(values, result)]
-    if valores1[0] != -1:
-        indices = ids[valores1]
-        return indices
-    valores2 = encontrar_comb_2(values, diff, n)
-    if valores2[0] != -1:
-        indices = ids[valores2]
-        return indices
-    # Iterar sobre todos los índices posibles
-    # valores4 = encontrar_comb_4(values, diff, n)
-    # if valores4[0] != -1:
-    #     indices = ids[valores4]
-    #     return indices
-    valores5 = encontrar_comb_5(values, diff, n)
-    if valores5[0] != -1:
-        indices = ids[valores5]
    return indices
-@njit
+def subset_sum_iter(numbers, target, num_elements):
+    """Return the first combination of *numbers* whose sum equals *target*.
+
+    Combinations are tried by increasing size (1 up to *num_elements*) and, within a size,
+    in the positional order produced by itertools.combinations, so a smallest matching
+    subset is returned. The result is a list with the selected values, or an empty list
+    when no combination matches.
+    """
+    # Local import: this same commit removes the module-level itertools import.
+    from itertools import combinations
+
+    # A subset cannot hold more elements than the input provides.
+    max_size = min(num_elements, len(numbers))
+    for size in range(1, max_size + 1):
+        for candidate in combinations(numbers, size):
+            # custom_func passes integer-scaled amounts, so exact equality is used.
+            if sum(candidate) == target:
+                return list(candidate)
+    return []
-def encontrar_comb_1(valores, target):
-    indice = [-1]
-    for idx, value in enumerate(valores):
-        suma = value
-        if round(suma, ROUND_DECIMAL) == target:
-            indice = [idx for idx, val in enumerate(valores) if val in [value]]
-            return indice
-    return indice
-@njit
-def encontrar_comb_2(valores, target, n):
-    indice = [-1]
-    for i in range(n):
-        array_except = np.delete(valores, i)
-        for idx, value in enumerate(array_except):
-            suma = value + valores[i]
-            if round(suma, ROUND_DECIMAL) == target:
-                indice = [idx for idx, val in enumerate(valores) if val in [value, valores[i]]]
-                return indice
-    return indice
-@njit
-def encontrar_comb_4(valores, target, n):
-    indice = [-1]
-    for i in range(n):
-        a1 = np.delete(valores, i)
-        for j in range(len(a1)):
-            a2 = np.delete(a1, j)
-            for k in range(len(a2)):
-                array_except = np.delete(a2, k)
-                for idx, value in enumerate(array_except):
-                    suma = value + valores[i] + a1[j] + a2[k]
-                    if round(suma, ROUND_DECIMAL) == target:
-                        indice = [idx for idx, val in enumerate(valores) if val in [value, valores[i], a1[j], a2[k]]]
-                        return indice
-    return indice
-@njit
-def encontrar_comb_5(valores, target, n):
-    indice = [-1]
-    for i in range(n):
-        a1 = np.delete(valores, i)
-        for j in range(len(a1)):
-            a2 = np.delete(a1, j)
-            for k in range(len(a2)):
-                a3 = np.delete(a2, k)
-                for l in range(len(a3)):
-                    array_except = np.delete(a2, l)
-                    for idx, value in enumerate(array_except):
-                        suma = value + valores[i] + a1[j] + a2[k] + a3[l]
-                        if round(suma, ROUND_DECIMAL) == target:
-                            indice = [idx for idx, val in enumerate(valores) if val in [value, valores[i], a1[j], a2[k], a3[l]]]
-                            return indice
-    return indice