Commit 7086b4fa authored by Cristian Aguirre

Update action-exclude-records-v1-emr

parent 9213ca48
@@ -14,7 +14,7 @@ class ActionInterface(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def process(self, source_obj):
+    def process(self, source_obj, script_name, timezone, pattern):
         """Method that executes the script's logic"""
         raise NotImplementedError
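For context, every concrete action now has to accept the three extra parameters. A minimal sketch of a conforming subclass; the class name, the `result` attribute, and the parameter semantics are assumptions for illustration, not part of this diff:

from abc import ABC, abstractmethod

class ActionInterface(ABC):
    @abstractmethod
    def process(self, source_obj, script_name, timezone, pattern):
        """Method that executes the script's logic"""
        raise NotImplementedError

# Hypothetical subclass, for illustration only.
class ExcludeRecordsAction(ActionInterface):
    def process(self, source_obj, script_name, timezone, pattern):
        # 'source_obj' is assumed to expose the database/Spark handles;
        # 'timezone' and 'pattern' presumably drive timestamp formatting.
        self.result = {"script": script_name, "tz": timezone, "pattern": pattern}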
@@ -28,7 +28,7 @@ class Mysql:
     def create_spark_connection(self):
         params = {}
         try:
-            url = "jdbc:mysql://"+self.host+":"+str(self.port)+"/"+self.database
+            url = "jdbc:mysql://"+self.user+":"+self.password+"@"+self.host+":"+str(self.port)+"/"+self.database
             properties = {"user": self.user, "password": self.password, "driver": "com.mysql.cj.jdbc.Driver"}
             params["url"] = url
             params["properties"] = properties
@@ -1,4 +1,3 @@
 from typing import Dict, Any
-import time
 import traceback as traceback_lib
 import importlib
@@ -26,7 +25,7 @@ class Process:
         db_params = cfg.db_params
         source = Database(self.app, db_params)
         db_session = source.get_session()
-        print("1")
+        # Getting the script name
         script_name = source.get_action_by_identifier(self.descriptor["idScript"], db_session)
         if isinstance(script_name, type(None)):
@@ -47,16 +46,16 @@
             # Starting the process
             self.app.logger.info("Starting script processing")
-            obj_script.process(source)
-            print("1")
+            obj_script.process(source, script_name, cfg.timezone, cfg.time_pattern)
             # Saving the result
             self.app.logger.info("Generating and saving the result")
-            response = obj_script.response()
-            result = self.utils.create_result(response, self.descriptor)
-            save = self.utils.save_result(result, self.descriptor, db_session)
-            if save["status"] == StatusEnum.ERROR.name:
-                raise InterruptedError(save["message"])
+            # _ = obj_script.response()
+            # response.show()
+            # result = self.utils.create_result(response, self.descriptor)
+            # save = self.utils.save_result(result, self.descriptor, db_session)
+            # if save["status"] == StatusEnum.ERROR.name:
+            #     raise InterruptedError(save["message"])
         except TimeoutError as e:
             self.app.logger.error(f"Timeout error. Error: {e}")
             status, status_description = CodeResponseEnum.TIMEOUT, str(e)
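The new `timezone` and `time_pattern` values suggest the action scripts now format timestamps themselves. A hedged sketch of what that could look like inside an action, using pytz (which the Dockerfile below installs); everything beyond the two config names is an assumption:

from datetime import datetime
import pytz

def current_timestamp(timezone: str, pattern: str) -> str:
    # e.g. timezone="America/Lima", pattern="%Y-%m-%d %H:%M:%S"
    tz = pytz.timezone(timezone)
    return datetime.now(tz).strftime(pattern)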
import gzip
from typing import Dict, Any, Optional

import boto3
class EMRServerless:
    """
    An example implementation of running a PySpark job on EMR Serverless.
    This class provides support for creating an EMR Serverless Spark application, running a job,
    fetching driver logs, and shutting the application back down.
    By default, all calls are synchronous in that they wait for the Application to reach the desired state.
    - `create_application` waits for the application to reach the `CREATED` state.
    - `start_application` waits for the `STARTED` state.
    - `stop_application` waits for the `STOPPED` state.
    - `run_spark_job` waits until the job is in a terminal state.
    """
    def __init__(self, application_id: Optional[str] = None, search_app: bool = False) -> None:
        self.application_id = application_id
        self.s3_log_prefix = "emr-serverless-logs"
        self.app_type = "SPARK"  # EMR Serverless also supports jobs of type 'HIVE'
        self.client = boto3.client("emr-serverless")
        self.search_app = search_app

    def __str__(self):
        return f"EMR Serverless {self.app_type} Application: {self.application_id}"
    def valid_application(self) -> Dict[str, Any]:
        """
        Check whether an application has already been created or started and, if so, return it.
        :return: dict with an "exists" flag and the matching application id under "app"
        """
        response = {"exists": False}
        if self.search_app:
            applications = self.client.list_applications()["applications"]
            if len(applications) > 0:
                response["exists"] = True
                response["app"] = applications[0]["id"]
        return response
    def create_application(self, name: str, release_label: str, args: dict, wait: bool = True):
        """
        Create a new application with the provided name and release_label - the application needs to be started after.
        """
        if self.application_id is not None:
            raise Exception(
                f"Application already created (application_id: `{self.application_id}`)"
            )
        initial_capacity = args["initial_capacity"]
        maximum_capacity = args["maximun_capacity"]  # key name kept exactly as the callers spell it
        networkConfiguration = args["networkConfiguration"]
        imageConfiguration = args["imageConfiguration"]
        response = self.client.create_application(
            name=name, releaseLabel=release_label, type=self.app_type,
            initialCapacity=initial_capacity, maximumCapacity=maximum_capacity,
            networkConfiguration=networkConfiguration, imageConfiguration=imageConfiguration
        )
        self.application_id = response.get("applicationId")
        # Poll until the application reaches the CREATED state
        app_ready = False
        while wait and not app_ready:
            response = self.client.get_application(applicationId=self.application_id)
            app_ready = response.get("application").get("state") == "CREATED"
    def start_application(self, wait: bool = True) -> None:
        """
        Start the application - by default, wait until the application is started.
        """
        if self.application_id is None:
            raise Exception(
                "No application_id - please use create_application first."
            )
        self.client.start_application(applicationId=self.application_id)
        app_started = False
        while wait and not app_started:
            response = self.client.get_application(applicationId=self.application_id)
            app_started = response.get("application").get("state") == "STARTED"
    def stop_application(self, wait: bool = True) -> None:
        """
        Stop the application - by default, wait until the application is stopped.
        """
        self.client.stop_application(applicationId=self.application_id)
        app_stopped = False
        while wait and not app_stopped:
            response = self.client.get_application(applicationId=self.application_id)
            app_stopped = response.get("application").get("state") == "STOPPED"

    def delete_application(self) -> None:
        """
        Delete the application - it must be stopped first.
        """
        self.client.delete_application(applicationId=self.application_id)
    def run_spark_job(
        self,
        script_location: str,
        job_role_arn: str,
        arguments: list,
        sparkArguments: dict,
        s3_bucket_name: str,
        wait: bool = True,
    ) -> str:
        """
        Runs the Spark job identified by `script_location`. Arguments can also be provided via the `arguments` parameter.
        By default, spark-submit parameters are hard-coded and logs are sent to the provided s3_bucket_name.
        This method is blocking by default until the job is complete.
        """
        # Build the spark-submit parameter string from the provided resource settings
        spark_args = "--conf spark.driver.cores=" + str(sparkArguments["driver-cores"])
        spark_args += " --conf spark.driver.memory=" + str(sparkArguments["driver-memory"])
        spark_args += " --conf spark.executor.cores=" + str(sparkArguments["executor-cores"])
        spark_args += " --conf spark.executor.memory=" + str(sparkArguments["executor-memory"])
        spark_args += " --conf spark.executor.instances=" + str(sparkArguments["executor-instances"])
        spark_args += " " + sparkArguments["others"]
        response = self.client.start_job_run(
            applicationId=self.application_id,
            executionRoleArn=job_role_arn,
            jobDriver={
                "sparkSubmit": {
                    "entryPoint": script_location,
                    "entryPointArguments": arguments,
                    "sparkSubmitParameters": spark_args,
                }
            },
            configurationOverrides={
                "monitoringConfiguration": {
                    "s3MonitoringConfiguration": {
                        "logUri": f"s3://{s3_bucket_name}/{self.s3_log_prefix}"
                    }
                }
            },
        )
        job_run_id = response.get("jobRunId")
        # Poll until the job run reaches a terminal state
        job_done = False
        while wait and not job_done:
            jr_response = self.get_job_run(job_run_id)
            job_done = jr_response.get("state") in [
                "SUCCESS",
                "FAILED",
                "CANCELLING",
                "CANCELLED",
            ]
        return job_run_id

    def get_job_run(self, job_run_id: str) -> dict:
        response = self.client.get_job_run(
            applicationId=self.application_id, jobRunId=job_run_id
        )
        return response.get("jobRun")
    def fetch_driver_log(
        self, s3_bucket_name: str, job_run_id: str, log_type: str = "stdout"
    ) -> str:
        """
        Access the specified `log_type` Driver log on S3 and return the full log string.
        """
        s3_client = boto3.client("s3")
        file_location = f"{self.s3_log_prefix}/applications/{self.application_id}/jobs/{job_run_id}/SPARK_DRIVER/{log_type}.gz"
        try:
            response = s3_client.get_object(Bucket=s3_bucket_name, Key=file_location)
            file_content = gzip.decompress(response["Body"].read()).decode("utf-8")
        except s3_client.exceptions.NoSuchKey:
            file_content = ""
        return str(file_content)
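For orientation, a hedged end-to-end sketch of how this helper might be driven. The capacity and spark-argument dict keys come straight from the code above (including the "maximun_capacity" spelling it reads); the application name, role ARN, bucket, script path, and image URI are placeholders, not values from this repository:

# Hypothetical driver code; ARNs, bucket, and URIs below are placeholders.
emr = EMRServerless(search_app=True)

app_args = {
    "initial_capacity": {
        "DRIVER": {"workerCount": 1,
                   "workerConfiguration": {"cpu": "2vCPU", "memory": "4GB"}},
    },
    "maximun_capacity": {"cpu": "16vCPU", "memory": "64GB"},  # key spelling as read by create_application
    "networkConfiguration": {"subnetIds": ["subnet-0example"], "securityGroupIds": ["sg-0example"]},
    "imageConfiguration": {"imageUri": "123456789012.dkr.ecr.us-east-1.amazonaws.com/emr-custom:latest"},
}

found = emr.valid_application()
if found["exists"]:
    emr.application_id = found["app"]
else:
    emr.create_application("action-exclude-records", "emr-7.0.0", app_args)

emr.start_application()
job_run_id = emr.run_spark_job(
    script_location="s3://my-bucket/scripts/job.py",        # placeholder
    job_role_arn="arn:aws:iam::123456789012:role/emr-job",  # placeholder
    arguments=["--idScript", "123"],                        # placeholder args
    sparkArguments={
        "driver-cores": 2, "driver-memory": "4g",
        "executor-cores": 2, "executor-memory": "4g",
        "executor-instances": 2, "others": "--conf spark.sql.shuffle.partitions=8",
    },
    s3_bucket_name="my-bucket",
)
print(emr.fetch_driver_log("my-bucket", job_run_id))
emr.stop_application()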
FROM public.ecr.aws/emr-serverless/spark/emr-7.0.0:latest
USER root
# install python 3
RUN yum install -y gcc openssl-devel bzip2-devel libffi-devel tar gzip wget make zlib-devel
RUN wget https://www.python.org/ftp/python/3.10.0/Python-3.10.0.tgz && \
    tar xzf Python-3.10.0.tgz && cd Python-3.10.0 && \
    ./configure --enable-optimizations && \
    make altinstall
COPY subset_sum_linux /tmp/
COPY requirements.txt /
RUN python3 -m pip install numpy pandas py4j python-dateutil pytz six tzdata
# EMRS will run the image as hadoop
USER hadoop:hadoop
\ No newline at end of file
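Since `make altinstall` leaves the interpreter at /usr/local/bin/python3.10 (note the image copies requirements.txt but pip-installs a fixed package list), the job presumably has to point Spark at that binary. A hedged sketch of doing so through the `others` key that run_spark_job appends verbatim; the property names follow the EMR Serverless custom-Python pattern, so verify them against current AWS docs before relying on them:

# Hedged sketch: pointing Spark at the Python 3.10 built in this image.
# Property names are assumptions taken from the EMR Serverless
# custom-Python examples, not from this repository.
others = (
    "--conf spark.emr-serverless.driverEnv.PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.10 "
    "--conf spark.emr-serverless.driverEnv.PYSPARK_PYTHON=/usr/local/bin/python3.10 "
    "--conf spark.executorEnv.PYSPARK_PYTHON=/usr/local/bin/python3.10"
)
sparkArguments = {"driver-cores": 2, "driver-memory": "4g",
                  "executor-cores": 2, "executor-memory": "4g",
                  "executor-instances": 2, "others": others}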
@@ -6,4 +6,4 @@ base = MainApplication()
 app = base.create_app()
 
 if __name__ == "__main__":
-    base.run(port=8000)
+    base.run(port=7500)