Commit 0e214512 authored by Cristian Aguirre's avatar Cristian Aguirre

Merge branch 'developer-ca' into 'developer'

Developer ca

See merge request !1
parents e6dac30d d5c367db
env
general:
s3_parameters:
s3_conn_id: "bcom_tp_connection"
bucket: "prueba1234568"
dags:
dag1:
schedule: "@once"
period_pattern: '[a-zA-Z0-9]+([0-9]{4})(\-[0-9]{2})?\.[a-zA-Z]*'
csv_delimiter: ","
filters:
fields_omited: ["BCOM_ROW_IDENTIFIER", "BCOM_PROCESS_ID", "BCOM_FIELD_KEY", "BCOM_LNUMBER", "BCOM_ERROR_CODE",
"BCOM_ERROR_MESSAGE"]
tacom_drop_nulls_subset: [ "CD_FOLIO", "CD_CUENTA", "CD_PAQUETE", "TP_SERVICIO"]
promo_drop_nulls_subset: ["CUENTA", "NOMBRE_PRODUCTO", "POID_PRODUCT", "CD_PAQUETE"]
catalog_drop_nulls_subset: ["NOMBRE_PRODUCTO", "CD_PAQUETE"]
rela3pa2p_drop_nulls_subset: ["TRESP", "DOSP"]
relapoid_drop_nulls_subset: ["POID_PRODUCT", "CD_PAQUETE"]
relapaqs_drop_nulls_subset: ["COD_PAQ_INI", "COD_PAQ_FIN"]
not_promo_drop_nulls_subset: ["CD_PAQUETE"]
s3_parameters:
inputs:
prefix: "pruebas_qa"
tacom_pattern: "tacomventas_original*.txt"
promociones_pattern: "promociones_original*.txt"
outputs:
prefix: "prueba3/tacom_outputs"
tacom_output: "tacom_modified.csv"
tacom_delimiter: ","
promo_output: "promociones_modified.csv"
promo_delimiter: ","
procesed_prefix: "prueba3/procesed"
catalogo_promociones:
type: "INSUMO"
pattern: "catalogopromocion*.txt"
prefix: "pruebas_qa"
key_field: "NOMBRE_PRODUCTO"
value_field: "CD_PAQUETE"
delimiter: ","
relacion3pa2p:
type: "INSUMO"
pattern: "temporal_relacion3pa2p*.txt"
prefix: "pruebas_qa"
key_field: "TRESP"
value_field: "DOSP"
delimiter: ","
relacionpoidpaquete:
type: "INSUMO"
pattern: "temporal_relacion_Paquete*.txt"
prefix: "pruebas_qa"
key_field: "POID_PRODUCT"
value_field: "CD_PAQUETE"
delimiter: ","
relacion_paquetes:
type: "INSUMO"
pattern: "PAQUINIFIN*.txt"
prefix: ""
key_field: "COD_PAQ_INI"
value_field: "COD_PAQ_FIN"
delimiter: ","
no_promocion:
type: "INSUMO"
pattern: "PAQUETE*.txt"
prefix: ""
key_field: "CD_PAQUETE"
value_field: ""
delimiter: ","
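The DAG that consumes this file is collapsed in this diff, so the sketch below only illustrates one way the configuration could be loaded and flattened with components.Utils.update_dict_with_catalogs; the filename dag_conf.yml, the use of PyYAML, and the assumption that the catalog blocks sit directly under dag1 are all hypothetical.

import yaml
from components.Utils import get_modified_prefix, update_dict_with_catalogs

with open("dag_conf.yml") as f:                   # hypothetical filename
    conf = yaml.safe_load(f)

dag_conf = conf["dags"]["dag1"]
default_prefix = get_modified_prefix(dag_conf["s3_parameters"]["inputs"]["prefix"])

# Collapse each catalog block into flat keys such as s3_catalogo_promociones,
# catalogo_promociones_key, catalogo_promociones_value and catalogo_promociones_delimiter.
data = {}
for name in ("catalogo_promociones", "relacion3pa2p", "relacionpoidpaquete",
             "relacion_paquetes", "no_promocion"):
    data = update_dict_with_catalogs(data, dag_conf, name, default_prefix)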
import fnmatch
import datetime
from typing import Any, Dict, Set
import pytz
import re
from io import BytesIO, StringIO
import pandas as pd
from components.Utils import get_type_file
from enums.FileTypeEnum import FileTypeEnum
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
import logging
logger = logging.getLogger()
def get_df_from_s3(conn: str, bucket: str, key: str, period: str, delimiter: str) -> Dict[str, Any]:
response = {'filename': "", 'df': None}
dataframe = None
try:
s3_data = get_data_from_s3(conn, bucket, key, period)
logger.info(f"ARCHIVO EXTRAIDO: {s3_data}")
if s3_data["filename"] == "":
raise Exception(f"No se encontró archivo para el key: {key} y periodo {period}")
response.update({'filename': s3_data["filename"]})
file_type = get_type_file(s3_data["filename"])
if file_type == FileTypeEnum.EXCEL:
dataframe = pd.read_excel(s3_data["data"], engine="openpyxl")
elif file_type == FileTypeEnum.OLD_EXCEL:
dataframe = pd.read_excel(s3_data["data"], engine="xlrd")
elif file_type == FileTypeEnum.TEXT or file_type == FileTypeEnum.CSV:
str_data = str(s3_data["data"].getvalue(), encoding='UTF-8', errors='ignore')
data = StringIO(str_data)
dataframe = pd.read_csv(data, sep=delimiter)
response.update({'df': dataframe})
except Exception as e:
logger.error(f"Error trayendo y transformando a DataFrame desde S3 con periodo {period}. {e}")
return response
def get_data_from_s3(conn: str, bucket: str, key: str, period: str) -> Dict[str, Any]:
result = {'filename': '', 'data': BytesIO()}
utc = pytz.UTC
try:
if key.rfind("/") != -1:
prefix = key[:key.rfind("/")+1]
else:
prefix = ""
s3_hook = S3Hook(conn)
files = s3_hook.list_keys(bucket, prefix)
        # Use a date far in the past as the baseline
last_key = ("", datetime.datetime(2000, 1, 1, 0, 0, 0).replace(tzinfo=utc))
for file_key in files:
if fnmatch.fnmatch(file_key, key) and (file_key.find(period) != -1 or file_key.find(period.replace("-", "")) != -1):
file_date = s3_hook.get_key(file_key, bucket).meta.data
file_date = file_date["LastModified"]
if last_key[1] >= file_date:
continue
last_key = (file_key, file_date)
data = s3_hook.get_key(last_key[0], bucket)
data.download_fileobj(result["data"])
result["filename"] = last_key[0]
except Exception as e:
logger.error(f"Error trayendo datos desde S3 para el key {key} y periodo {period}. {e}")
return result
def search_periods_from_key_s3(conn: str, bucket: str, key: str, pattern: str) -> Set[str]:
periods = set()
try:
if key.rfind("/") != -1:
prefix = key[:key.rfind("/") + 1]
else:
prefix = ""
s3_hook = S3Hook(conn)
files = s3_hook.list_keys(bucket, prefix)
for file in files:
if not re.search(pattern, file):
continue
period = file[file.rfind(".")-7:file.rfind(".")]
if period.find("-") == -1:
period = period[1:5] + "-" + period[5:]
periods.add(period)
except Exception as e:
logger.error(f"Error buscando periodos disponibles en los archivos. key: {key}. {e}")
    return periods
def save_df_to_s3(df: pd.DataFrame, conn: str, bucket: str, key: str, delimiter: str = ","):
try:
logger.info(f"SUBIENDO A NUBE KEY {key}")
file_type = get_type_file(key)
s3_hook = S3Hook(conn)
if file_type == FileTypeEnum.EXCEL or file_type == FileTypeEnum.OLD_EXCEL:
with BytesIO() as buffer:
with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
df.to_excel(writer, index=None)
s3_hook.load_bytes(buffer.getvalue(), key, bucket, True)
elif file_type == FileTypeEnum.CSV or file_type == FileTypeEnum.TEXT:
csv_buffer = BytesIO()
df.to_csv(csv_buffer, header=True, index=False, sep=delimiter, na_rep='None')
csv_buffer.seek(0)
s3_hook.load_bytes(csv_buffer.getvalue(), key, bucket, True)
except Exception as e:
logger.error(f"Error guardando archivos a S3. key: {key}. {e}")
def move_object_s3(conn: str, bucket: str, source_key: str, output_key: str):
try:
filename = source_key[source_key.rfind("/")+1:]
output_key += filename
s3_hook = S3Hook(conn)
s3_hook.copy_object(source_key, output_key, bucket, bucket)
s3_hook.delete_objects(bucket, source_key)
except Exception as e:
logger.error(f"Error moviendo archivo desde {source_key} hacia {output_key} en bucket {bucket}. {e}")
from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor
import logging
logger = logging.getLogger()
POKE_INTERVAL = 5
TIMEOUT = 60*1
def create_s3_sensor(task_id: str, connection: str, bucket: str, key: str) -> S3KeySensor:
s3_sensor = None
try:
s3_sensor = S3KeySensor(
task_id=task_id,
bucket_key=key,
bucket_name=bucket,
wildcard_match=True,
aws_conn_id=connection,
verify=True,
poke_interval=POKE_INTERVAL,
timeout=TIMEOUT
)
except Exception as e:
logger.error(f"Error creando Sensor S3. {e}")
return s3_sensor
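A sketch of how this factory might be attached to a DAG; the module path components.Sensor, the DAG id, start date and the downstream task are assumptions.

from datetime import datetime
from airflow import DAG
from airflow.operators.empty import EmptyOperator
from components.Sensor import create_s3_sensor      # module path assumed

with DAG("dag1", schedule="@once", start_date=datetime(2023, 1, 1), catchup=False) as dag:
    # Wait until a file matching the wildcard key exists in the bucket.
    wait_for_tacom = create_s3_sensor(
        task_id="wait_for_tacom",
        connection="bcom_tp_connection",
        bucket="prueba1234568",
        key="pruebas_qa/tacomventas_original*.txt",
    )
    done = EmptyOperator(task_id="done")
    wait_for_tacom >> done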
from typing import List, Any, Dict
import pandas as pd
from enums.CatalogConfigurationEnum import CatalogConfigurationEnum
from enums.FileTypeEnum import FileTypeEnum
import logging
logger = logging.getLogger()
def get_type_file(key: str) -> FileTypeEnum:
result = FileTypeEnum.EXCEL
try:
file_type_sufix = key.rfind(".")
file_type = key[file_type_sufix+1:]
result = FileTypeEnum(file_type)
except Exception as e:
logger.error(f"Error obteniedo el tipo de archivo de {key}. {e}")
return result
def get_modified_prefix(prefix: str) -> str:
try:
if prefix == "/":
prefix = ""
elif not prefix.endswith("/") and prefix != "":
prefix += "/"
except Exception as e:
logger.error(f"Error modificando prefijo de {prefix}. {e}")
finally:
return prefix
def add_period_to_sufix(name: str, period: str) -> str:
result = name
try:
position = name.rfind(".")
result = name[:position] + period + name[position:]
except Exception as e:
logger.error(f"Error añadiendo periodo al nombre del archivo {name}. {e}")
return result
def remove_invalid_rows(df: pd.DataFrame, valid_cols: List[str]) -> pd.DataFrame:
try:
df = df.dropna(how='all', subset=valid_cols)
except Exception as e:
logger.error(f"Error removiendo filas inválidas. {e}")
finally:
return df
def remove_fields(df: pd.DataFrame, fields_omitted: List[str]) -> pd.DataFrame:
try:
if len(fields_omitted) > 0:
df = df.loc[:, ~df.columns.isin(fields_omitted)]
except Exception as e:
logger.error(f"Error removiendo columnas. {e}")
finally:
return df
def update_dict_with_catalogs(data_dict: Dict[str, Any], data: Dict[str, Any], catalog_name: str,
default_prefix: str) -> Dict[str, Any]:
try:
catalog = data[catalog_name]
catalog_type = catalog["type"]
catalog_prefix = catalog["prefix"]
if catalog_type == CatalogConfigurationEnum.CATALOGO.value:
catalog_prefix = get_modified_prefix(catalog_prefix)
else:
catalog_prefix = default_prefix
s3_catalog = catalog_prefix + catalog["pattern"]
data_dict.update({'s3_'+catalog_name: s3_catalog, catalog_name+'_key': catalog["key_field"],
catalog_name+'_value': catalog["value_field"]})
if "delimiter" in catalog.keys():
data_dict.update({catalog_name+'_delimiter': catalog["delimiter"]})
except Exception as e:
logger.error(f"Error actualizando dict de catalogos. {e}")
finally:
return data_dict
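A few illustrative calls against the helpers above; the input values are made up, and the expected results follow directly from the code as written.

from components.Utils import add_period_to_sufix, get_modified_prefix, get_type_file

add_period_to_sufix("tacom_modified.csv", "2023-05")    # -> "tacom_modified2023-05.csv"
get_modified_prefix("pruebas_qa")                        # -> "pruebas_qa/"
get_modified_prefix("/")                                 # -> ""
get_type_file("promociones_original202305.txt")          # -> FileTypeEnum.TEXT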
from enum import Enum
class CatalogConfigurationEnum(Enum):
CATALOGO = "CATALOGO"
INSUMO = "INSUMO"
from enum import Enum
class FileTypeEnum(Enum):
TEXT = "txt"
CSV = "csv"
EXCEL = "xlsx"
OLD_EXCEL = "xls"
FROM apache/airflow:2.5.3
COPY requirements.txt /
RUN pip install --no-cache-dir "apache-airflow==2.5.3" "apache-airflow[kubernetes]==2.5.3" -r /requirements.txt
apiVersion: v1
kind: ConfigMap
metadata:
name: airflow-envvars-configmap
data:
# The conf below is necessary because of a typo in the config on docker-airflow image:
# https://github.com/puckel/docker-airflow/blob/bed777970caa3e555ef618d84be07404438c27e3/config/airflow.cfg#L934
AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL: '30'
AIRFLOW__LOGGING__LOGGING_LEVEL: INFO
AIRFLOW__WEBSERVER__DEFAULT_UI_TIMEZONE: America/Lima
AIRFLOW__CORE__DEFAULT_TIMEZONE: America/Lima
AIRFLOW__KUBERNETES__KUBE_CLIENT_REQUEST_ARGS: '{"_request_timeout": [60,60]}'
AIRFLOW__KUBERNETES__WORKER_CONTAINER_REPOSITORY: cristianfernando/airflow_custom
AIRFLOW__KUBERNETES__WORKER_CONTAINER_TAG: "0.0.1"
AIRFLOW__KUBERNETES__DAGS_VOLUME_HOST: /mnt/airflow/dags
AIRFLOW__KUBERNETES__LOGS_VOLUME_CLAIM: airflow-logs-pvc
AIRFLOW__KUBERNETES__ENV_FROM_CONFIGMAP_REF: airflow-envvars-configmap
AIRFLOW__KUBERNETES__POD_TEMPLATE_FILE: /opt/airflow/templates/pod_template.yaml
AIRFLOW__CORE__EXECUTOR: KubernetesExecutor
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
_AIRFLOW_DB_UPGRADE: 'true'
_AIRFLOW_WWW_USER_CREATE: 'true'
_AIRFLOW_WWW_USER_USERNAME: admin
_AIRFLOW_WWW_USER_PASSWORD: admin
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pods-permissions
rules:
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch", "create", "delete"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pods-permissions
subjects:
- kind: ServiceAccount
name: default
namespace: default
roleRef:
kind: ClusterRole
name: pods-permissions
apiGroup: rbac.authorization.k8s.io
apiVersion: apps/v1
kind: Deployment
metadata:
name: airflow-scheduler
labels:
app: airflow-k8s
spec:
selector:
matchLabels:
app: airflow-scheduler
replicas: 1
template:
metadata:
labels:
app: airflow-scheduler
spec:
containers:
- name: airflow-scheduler
image: cristianfernando/airflow_custom:0.0.1
args: ["scheduler"]
envFrom:
- configMapRef:
name: airflow-envvars-configmap
resources:
limits:
memory: "512Mi"
# cpu: "100"
volumeMounts:
- name: dags-host-volume
mountPath: /opt/airflow/dags
- name: logs-persistent-storage
mountPath: /opt/airflow/logs
- name: pods-templates
mountPath: /opt/airflow/templates
volumes:
- name: dags-host-volume
hostPath:
path: /opt/airflow/dags/dags/
type: Directory
- name: pods-templates
hostPath:
path: /opt/airflow/templates/
type: Directory
- name: logs-persistent-storage
persistentVolumeClaim:
claimName: airflow-logs-pvc
apiVersion: apps/v1
kind: Deployment
metadata:
name: airflow-webserver
labels:
app: airflow-k8s
spec:
selector:
matchLabels:
app: airflow-webserver
replicas: 1
template:
metadata:
labels:
app: airflow-webserver
spec:
containers:
- name: airflow-webserver
image: cristianfernando/airflow_custom:0.0.1
args: ["webserver"]
envFrom:
- configMapRef:
name: airflow-envvars-configmap
resources:
limits:
memory: "512Mi"
# cpu: "100"
ports:
- containerPort: 8080
volumeMounts:
- name: dags-host-volume
mountPath: /opt/airflow/dags/
- name: logs-persistent-storage
mountPath: /opt/airflow/logs
volumes:
- name: dags-host-volume
hostPath:
path: /opt/airflow/dags/dags/
type: Directory
- name: logs-persistent-storage
persistentVolumeClaim:
claimName: airflow-logs-pvc
apiVersion: v1
kind: Service
metadata:
name: airflow-webserver
labels:
app: airflow-k8s
spec:
type: NodePort
selector:
app: airflow-webserver
ports:
- name: web
protocol: TCP
port: 8081
targetPort: 8080
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: airflow-logs-pvc
labels:
app: airflow-k8s
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: 2Gi
storageClassName: standard
apiVersion: v1
kind: Pod
metadata:
name: dummy-name
spec:
containers:
- args: [ ]
command: [ ]
env:
- name: AIRFLOW__CORE__EXECUTOR
value: LocalExecutor
- name: DB_HOST
value: postgres
- name: DB_DATABASE
value: airflow
- name: DB_USER
value: airflow
- name: DB_PASSWORD
value: airflow
- name: AIRFLOW__DATABASE__SQL_ALCHEMY_CONN
value: postgresql+psycopg2://airflow:airflow@postgres/airflow
- name: AIRFLOW__LOGGING__LOGGING_LEVEL
value: INFO
image: dumy-image
imagePullPolicy: IfNotPresent
name: base
volumeMounts:
- name: dags-host-volume
mountPath: /opt/airflow/dags
- name: logs-persistent-storage
mountPath: /opt/airflow/logs
hostNetwork: false
restartPolicy: Never
securityContext:
runAsUser: 50000
nodeSelector: { }
affinity: { }
tolerations: [ ]
volumes:
- name: dags-host-volume
hostPath:
path: /opt/airflow/dags/dags/
type: Directory
- name: logs-persistent-storage
persistentVolumeClaim:
claimName: airflow-logs-pvc
apiVersion: apps/v1
kind: Deployment
metadata:
name: postgres
spec:
selector:
matchLabels:
app: postgres
replicas: 1
template:
metadata:
labels:
app: postgres
spec:
containers:
- name: postgres
image: postgres:12
resources:
limits:
memory: 128Mi
cpu: 500m
ports:
- containerPort: 5432
env:
- name: POSTGRES_PASSWORD
value: airflow
- name: POSTGRES_USER
value: airflow
- name: POSTGRES_DB
value: airflow
apiVersion: v1
kind: Service
metadata:
name: postgres
spec:
selector:
app: postgres
ports:
- port: 5432
targetPort: 5432
apache-airflow[kubernetes]==2.5.3
openpyxl==3.1.2
XlsxWriter==3.1.2
# xlrd is not pulled in by apache-airflow or openpyxl; it is needed for the
# pd.read_excel(..., engine="xlrd") branch that reads legacy .xls files.
xlrd==2.0.1
kubectl apply -f logs-persistenvolumeclaim.yaml
kubectl apply -f airflow-rbac.yaml
kubectl apply -f postgres-deployment.yaml
kubectl apply -f postgres-service.yaml
kubectl apply -f airflow-envvars-configmap.yaml
kubectl apply -f airflow-webserver-deployment.yaml
kubectl apply -f airflow-webserver-service.yaml
kubectl apply -f airflow-scheduler-deployment.yaml
kubectl apply -f sync-dags-deployment.yaml
kubectl delete -f airflow-rbac.yaml
kubectl delete -f postgres-service.yaml
kubectl delete -f postgres-deployment.yaml
kubectl delete -f airflow-envvars-configmap.yaml
kubectl delete -f airflow-webserver-service.yaml
kubectl delete -f airflow-webserver-deployment.yaml
kubectl delete -f airflow-scheduler-deployment.yaml
kubectl delete -f logs-persistenvolumeclaim.yaml
kubectl delete -f sync-dags-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: airflow-sync-dags
spec:
selector:
matchLabels:
app: airflow-sync-dags
template:
metadata:
labels:
app: airflow-sync-dags
spec:
containers:
- args:
- while true; aws s3 sync --exact-timestamps --delete 's3://prueba1234568/dags' '/dags'; do sleep 30; done;
command:
- /bin/bash
- -c
- --
name: sync-dags
image: amazon/aws-cli:2.1.34
env:
- name: AWS_ACCESS_KEY_ID
value: AKIAQAAMXO3Z4BHNKEIE
- name: AWS_SECRET_ACCESS_KEY
value: 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
volumeMounts:
- name: dags-host-volume
mountPath: /dags
volumes:
- name: dags-host-volume
hostPath:
path: /opt/airflow/dags/dags/
type: Directory