Commit b9c35909 authored by Cristian Aguirre

Update 15-06-23. Delete period functionality. Add group_input_interval_days functionality where there is an allowed interval (days) between inputs
parent 0e6ab3de
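The change replaces period-pattern matching with a day-window rule: a candidate S3 object is accepted when its LastModified date lies within `group_input_interval_days` of the current base/candidate date, as the updated `get_data_from_s3` below shows. A minimal sketch of that predicate, assuming a standalone helper name (`within_interval`) and example dates that are not part of this commit:

```python
import datetime
import pytz

def within_interval(file_date: datetime.datetime,
                    base_date: datetime.datetime,
                    interval_days: str) -> bool:
    # Same comparison the updated get_data_from_s3 performs: the file qualifies
    # when the gap between the two dates, in whole days, does not exceed the
    # configured interval.
    return int(interval_days) - abs((file_date - base_date).days) >= 0

base = datetime.datetime(2023, 6, 1, tzinfo=pytz.UTC)
candidate = datetime.datetime(2023, 6, 5, tzinfo=pytz.UTC)
print(within_interval(candidate, base, "7"))  # True: 4 days apart, inside a 7-day window
print(within_interval(candidate, base, "3"))  # False: 4 days apart, outside a 3-day window
```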
@@ -6,8 +6,8 @@ general:
   dags:
     dag1:
-      schedule: "0 */2 * * *"
-      period_pattern: '[a-zA-Z0-9]+([0-9]{4})(\-[0-9]{2})?\.[a-zA-Z]*'
+      schedule: "@once"
+      group_input_interval_days: '7'
       csv_delimiter: ";"
       filters:
         fields_omited: ["BCOM_ROW_IDENTIFIER", "BCOM_PROCESS_ID", "BCOM_FIELD_KEY", "BCOM_LNUMBER", "BCOM_ERROR_CODE",
...
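The interval is written as a quoted string in the YAML and converted with `int()` where it is used. A hypothetical sketch of reading the new setting, assuming a file named `config.yaml` with the `general`/`dags`/`dag1` nesting shown in the hunk above; neither the file name nor this loading code is part of the commit:

```python
import yaml  # PyYAML

with open("config.yaml") as f:  # assumed file name
    config = yaml.safe_load(f)

dag_conf = config["general"]["dags"]["dag1"]
interval = dag_conf["group_input_interval_days"]  # stays a string: '7'
print(int(interval))  # the S3 helpers call int(interval) before comparing days
```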
 import fnmatch
 import datetime
-from typing import Any, Dict, Set
+from typing import Any, Dict
 import pytz
-import re
 from io import BytesIO, StringIO
 import pandas as pd
@@ -16,14 +15,15 @@ import logging
 logger = logging.getLogger()

-def get_df_from_s3(conn: str, bucket: str, key: str, period: str, delimiter: str) -> Dict[str, Any]:
+def get_df_from_s3(conn: str, bucket: str, key: str, delimiter: str, base_date: datetime.date,
+                   interval: str) -> Dict[str, Any]:
     response = {'filename': "", 'df': None}
     dataframe = None
     try:
-        s3_data = get_data_from_s3(conn, bucket, key, period)
+        s3_data = get_data_from_s3(conn, bucket, key, base_date, interval)
         logger.info(f"ARCHIVO EXTRAIDO: {s3_data}")
         if s3_data["filename"] == "":
-            raise Exception(f"No se encontró archivo para el key: {key} y periodo {period}")
+            raise Exception(f"No se encontró archivo para el key: {key} y fecha base {base_date} en intervalo {interval}")
         response.update({'filename': s3_data["filename"]})
         file_type = get_type_file(s3_data["filename"])
         if file_type == FileTypeEnum.EXCEL:
@@ -36,11 +36,11 @@ def get_df_from_s3(conn: str, bucket: str, key: str, period: str, delimiter: str
             dataframe = pd.read_csv(data, sep=delimiter, dtype='object')
         response.update({'df': dataframe})
     except Exception as e:
-        logger.error(f"Error trayendo y transformando a DataFrame desde S3 con periodo {period}. {e}")
+        logger.error(f"Error trayendo y transformando a DataFrame desde S3. {e}")
     return response
-def get_data_from_s3(conn: str, bucket: str, key: str, period: str) -> Dict[str, Any]:
+def get_data_from_s3(conn: str, bucket: str, key: str, base_date: datetime.date, interval: str) -> Dict[str, Any]:
     result = {'filename': '', 'data': BytesIO()}
     utc = pytz.UTC
     try:
@@ -50,25 +50,26 @@ def get_data_from_s3(conn: str, bucket: str, key: str, period: str) -> Dict[str,
             prefix = ""
         s3_hook = S3Hook(conn)
         files = s3_hook.list_keys(bucket, prefix)
+        last_key = ("", base_date)
         # Colocar una fecha muy atrás como base
-        last_key = ("", datetime.datetime(2000, 1, 1, 0, 0, 0).replace(tzinfo=utc))
         for file_key in files:
-            if fnmatch.fnmatch(file_key, key) and (file_key.find(period) != -1 or file_key.find(period.replace("-", "")) != -1):
+            if fnmatch.fnmatch(file_key, key):
                 file_date = s3_hook.get_key(file_key, bucket).meta.data
                 file_date = file_date["LastModified"]
-                if last_key[1] >= file_date:
-                    continue
-                last_key = (file_key, file_date)
+                if int(interval) - abs((file_date - last_key[1]).days) >= 0:
+                    last_key = (file_key, file_date)
         data = s3_hook.get_key(last_key[0], bucket)
         data.download_fileobj(result["data"])
         result["filename"] = last_key[0]
     except Exception as e:
-        logger.error(f"Error trayendo datos desde S3 para el key {key} y periodo {period}. {e}")
+        logger.error(f"Error trayendo datos desde S3 para el key {key}. {e}")
     return result
-def search_periods_from_key_s3(conn: str, bucket: str, key: str, pattern: str) -> Set[str]:
-    periods = set()
+def get_base_date(conn: str, bucket: str, key: str) -> datetime.date:
+    utc = pytz.UTC
+    # Colocar una fecha muy atrás como base
+    last_date = datetime.datetime(2000, 1, 1, 0, 0, 0).replace(tzinfo=utc)
     try:
         if key.rfind("/") != -1:
             prefix = key[:key.rfind("/") + 1]
@@ -76,19 +77,17 @@ def search_periods_from_key_s3(conn: str, bucket: str, key: str, pattern: str) -
             prefix = ""
         s3_hook = S3Hook(conn)
         files = s3_hook.list_keys(bucket, prefix)
-        for file in files:
-            if not re.search(pattern, file):
-                continue
-            if file[file.rfind(".")-6:file.rfind(".")].isdigit():
-                period = file[file.rfind(".")-6:file.rfind(".")]
-            else:
-                period = file[file.rfind(".")-7:file.rfind(".")]
-            if period.find("-") == -1:
-                period = period[1:5] + "-" + period[5:]
-            periods.add(period)
+        for file_key in files:
+            if fnmatch.fnmatch(file_key, key):
+                file_date = s3_hook.get_key(file_key, bucket).meta.data
+                file_date = file_date["LastModified"]
+                if last_date >= file_date:
+                    continue
+                last_date = file_date
+        logger.debug(f"Fecha base desde {key} : {last_date}")
     except Exception as e:
-        logger.error(f"Error buscando periodos disponibles en los archivos. key: {key}. {e}")
-        return set(periods)
+        logger.error(f"Error buscando archivo base para tener la fecha base. key: {key}. {e}")
+    return last_date

 def save_df_to_s3(df: pd.DataFrame, conn: str, bucket: str, key: str, delimiter: str = ","):
...
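Taken together, the call order changes: a base date is first derived from the newest matching object (`get_base_date`), then passed with the configured interval to `get_df_from_s3`, which in turn calls `get_data_from_s3`. A hypothetical end-to-end usage sketch; the connection ID, bucket and key glob are placeholders, not values from this repository:

```python
conn = "aws_conn"      # placeholder Airflow connection ID
bucket = "my-bucket"   # placeholder bucket name
key = "inputs/*.csv"   # placeholder key glob

# Newest LastModified among the keys matching the glob.
base_date = get_base_date(conn, bucket, key)

# A matching file whose LastModified falls within 7 days of that base date
# is selected and loaded into a DataFrame.
result = get_df_from_s3(conn, bucket, key, ";", base_date, "7")
if result["df"] is not None:
    print(result["filename"], result["df"].shape)
```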