Commit b9c35909 authored by Cristian Aguirre's avatar Cristian Aguirre

Update 15-06-23. Delete period functionality. Add group_input_interval_days...

Update 15-06-23. Delete period functionality. Add group_input_interval_days functionality where there is an allowed interval (days) between inputs
parent 0e6ab3de
......@@ -6,8 +6,8 @@ general:
dags:
dag1:
schedule: "0 */2 * * *"
period_pattern: '[a-zA-Z0-9]+([0-9]{4})(\-[0-9]{2})?\.[a-zA-Z]*'
schedule: "@once"
group_input_interval_days: '7'
csv_delimiter: ";"
filters:
fields_omited: ["BCOM_ROW_IDENTIFIER", "BCOM_PROCESS_ID", "BCOM_FIELD_KEY", "BCOM_LNUMBER", "BCOM_ERROR_CODE",
......
import fnmatch
import datetime
from typing import Any, Dict, Set
from typing import Any, Dict
import pytz
import re
from io import BytesIO, StringIO
import pandas as pd
......@@ -16,14 +15,15 @@ import logging
logger = logging.getLogger()
# NOTE(review): this is a rendered diff fragment from a commit page — the
# pre-commit and post-commit lines appear back to back with no +/- markers,
# and the "......@@" line below is a fold marker for hidden context, so this
# span is not valid Python as-is.
#
# Fetch one object from S3 and parse it into a pandas DataFrame.
# Pre-commit the file was selected by `period`; post-commit it is selected by
# `base_date` plus an allowed `interval` of days (see get_data_from_s3).
# Returns {'filename': str, 'df': DataFrame or None}; any failure is logged
# and leaves 'df' as None instead of propagating.
def get_df_from_s3(conn: str, bucket: str, key: str, period: str, delimiter: str) -> Dict[str, Any]:
def get_df_from_s3(conn: str, bucket: str, key: str, delimiter: str, base_date: datetime.date,
interval: str) -> Dict[str, Any]:
response = {'filename': "", 'df': None}
dataframe = None
try:
s3_data = get_data_from_s3(conn, bucket, key, period)
s3_data = get_data_from_s3(conn, bucket, key, base_date, interval)
logger.info(f"ARCHIVO EXTRAIDO: {s3_data}")
# An empty filename means no S3 object matched the selection criteria.
if s3_data["filename"] == "":
raise Exception(f"No se encontró archivo para el key: {key} y periodo {period}")
raise Exception(f"No se encontró archivo para el key: {key} y fecha base {base_date} en intervalo {interval}")
response.update({'filename': s3_data["filename"]})
file_type = get_type_file(s3_data["filename"])
if file_type == FileTypeEnum.EXCEL:
......@@ -36,11 +36,11 @@ def get_df_from_s3(conn: str, bucket: str, key: str, period: str, delimiter: str
# CSV branch (Excel branch hidden above): read all columns as strings
# to avoid pandas dtype inference.
dataframe = pd.read_csv(data, sep=delimiter, dtype='object')
response.update({'df': dataframe})
except Exception as e:
# Broad catch: the partial response dict is returned on any error.
logger.error(f"Error trayendo y transformando a DataFrame desde S3 con periodo {period}. {e}")
logger.error(f"Error trayendo y transformando a DataFrame desde S3. {e}")
return response
# NOTE(review): rendered diff fragment — pre-commit and post-commit lines are
# interleaved with no +/- markers; the "......@@" line is a fold marker.
#
# Download the selected S3 object into an in-memory buffer.
# Pre-commit: pick the newest object whose key matches `key` AND contains
# `period` (with or without the dash). Post-commit: pick among keys matching
# `key` those whose LastModified lies within `interval` days of the current
# baseline date.
# Returns {'filename': str, 'data': BytesIO} — filename stays "" on failure.
def get_data_from_s3(conn: str, bucket: str, key: str, period: str) -> Dict[str, Any]:
def get_data_from_s3(conn: str, bucket: str, key: str, base_date: datetime.date, interval: str) -> Dict[str, Any]:
result = {'filename': '', 'data': BytesIO()}
utc = pytz.UTC
try:
......@@ -50,25 +50,26 @@ def get_data_from_s3(conn: str, bucket: str, key: str, period: str) -> Dict[str,
prefix = ""
s3_hook = S3Hook(conn)
files = s3_hook.list_keys(bucket, prefix)
# Post-commit baseline: (key, date) seeded with the caller's base_date.
last_key = ("", base_date)
# Place a date far in the past as the baseline (pre-commit seed).
last_key = ("", datetime.datetime(2000, 1, 1, 0, 0, 0).replace(tzinfo=utc))
for file_key in files:
if fnmatch.fnmatch(file_key, key) and (file_key.find(period) != -1 or file_key.find(period.replace("-", "")) != -1):
if fnmatch.fnmatch(file_key, key):
file_date = s3_hook.get_key(file_key, bucket).meta.data
file_date = file_date["LastModified"]
if last_key[1] >= file_date:
continue
last_key = (file_key, file_date)
# NOTE(review): accept the file when its age gap to last_key[1] is
# within `interval` days — but after the first acceptance the baseline
# becomes the accepted file's date, not base_date; confirm intended.
if int(interval) - abs((file_date - last_key[1]).days) >= 0:
last_key = (file_key, file_date)
# NOTE(review): if no file matched, last_key[0] is "" and get_key will
# fail here — the error is absorbed by the except below.
data = s3_hook.get_key(last_key[0], bucket)
data.download_fileobj(result["data"])
result["filename"] = last_key[0]
except Exception as e:
logger.error(f"Error trayendo datos desde S3 para el key {key} y periodo {period}. {e}")
logger.error(f"Error trayendo datos desde S3 para el key {key}. {e}")
return result
# NOTE(review): rendered diff fragment — the removed function
# search_periods_from_key_s3 (collected YYYY-MM period strings found in file
# names) and its replacement get_base_date (returns the newest LastModified
# among keys matching `key`) are interleaved below without +/- markers.
def search_periods_from_key_s3(conn: str, bucket: str, key: str, pattern: str) -> Set[str]:
periods = set()
# NOTE(review): annotated as datetime.date, but the value built/returned is a
# tz-aware datetime.datetime — confirm which one callers expect.
def get_base_date(conn: str, bucket: str, key: str) -> datetime.date:
utc = pytz.UTC
# Place a date far in the past as the baseline, so any real object wins.
last_date = datetime.datetime(2000, 1, 1, 0, 0, 0).replace(tzinfo=utc)
try:
if key.rfind("/") != -1:
prefix = key[:key.rfind("/") + 1]
......@@ -76,19 +77,17 @@ def search_periods_from_key_s3(conn: str, bucket: str, key: str, pattern: str) -
prefix = ""
s3_hook = S3Hook(conn)
files = s3_hook.list_keys(bucket, prefix)
# --- removed (pre-commit) loop: slice a 6- or 7-char period out of each
# --- matching file name just before the extension dot, normalizing to
# --- the dashed "YYYY-MM" form.
for file in files:
if not re.search(pattern, file):
continue
if file[file.rfind(".")-6:file.rfind(".")].isdigit():
period = file[file.rfind(".")-6:file.rfind(".")]
else:
period = file[file.rfind(".")-7:file.rfind(".")]
if period.find("-") == -1:
period = period[1:5] + "-" + period[5:]
periods.add(period)
# --- added (post-commit) loop: keep the max LastModified among matches.
for file_key in files:
if fnmatch.fnmatch(file_key, key):
file_date = s3_hook.get_key(file_key, bucket).meta.data
file_date = file_date["LastModified"]
if last_date >= file_date:
continue
last_date = file_date
logger.debug(f"Fecha base desde {key} : {last_date}")
except Exception as e:
logger.error(f"Error buscando periodos disponibles en los archivos. key: {key}. {e}")
return set(periods)
logger.error(f"Error buscando archivo base para tener la fecha base. key: {key}. {e}")
return last_date
def save_df_to_s3(df: pd.DataFrame, conn: str, bucket: str, key: str, delimiter: str = ","):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment