Proyectos-Innovacion-2024 / CSS-Engine-Python-Cusca · Commits
Commit 7086b4fa, authored May 08, 2024 by Cristian Aguirre

Update action-exclude-records-v1-emr

parent 9213ca48

Showing 8 changed files with 676 additions and 254 deletions (+676 -254)
app/main/engine/action/ActionInterface.py            +1   -1
app/main/engine/database/Mysql.py                    +1   -1
app/main/engine/service/Process.py                   +8   -9
app/main/engine/util/EMRServerless.py                +175 -0
deploy/Dockerfile                                    +18  -0
run.py                                               +1   -1
scripts/emr_match-and-exclude-records-actions_v1.py  +366 -0
scripts/match-and-exclude-records-actions_v1.py      +106 -242
app/main/engine/action/ActionInterface.py

...
@@ -14,7 +14,7 @@ class ActionInterface(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def process(self, source_obj):
+    def process(self, source_obj, script_name, timezone, pattern):
         """Method that executes the script's logic"""
         raise NotImplementedError
...
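The widened signature means every concrete action now receives the resolved script name plus the configured timezone and timestamp pattern. A minimal sketch of a conforming implementation follows; the class name, field names, and example values are illustrative assumptions, not code from this repository:

from datetime import datetime

import pytz

from app.main.engine.action.ActionInterface import ActionInterface


class SampleAction(ActionInterface):
    """Hypothetical action illustrating the new process() contract."""

    def process(self, source_obj, script_name, timezone, pattern):
        # Assumed meanings: timezone is an IANA name (e.g. "America/Lima"),
        # pattern a strftime format (e.g. "%Y-%m-%d %H:%M:%S"), and
        # source_obj the engine's database wrapper.
        stamp = datetime.now(pytz.timezone(timezone)).strftime(pattern)
        self.result = f"{script_name} executed at {stamp}"

    def response(self):
        return self.result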
app/main/engine/database/Mysql.py

...
@@ -28,7 +28,7 @@ class Mysql:
     def create_spark_connection(self):
         params = {}
         try:
-            url = "jdbc:mysql://" + self.host + ":" + str(self.port) + "/" + self.database
+            url = "jdbc:mysql://" + self.user + ":" + self.password + "@" + self.host + ":" + str(self.port) + "/" + self.database
             properties = {"user": self.user, "password": self.password, "driver": "com.mysql.cj.jdbc.Driver"}
             params["url"] = url
             params["properties"] = properties
...
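Note that the new URL embeds the credentials even though they are also passed separately in properties. For reference, the returned params dict is presumably consumed by a Spark JDBC read along these lines (a sketch with placeholder values; this usage is assumed, not shown in the commit):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("jdbc-read-sketch").getOrCreate()

# Shape returned by Mysql.create_spark_connection(); all values are placeholders.
params = {
    "url": "jdbc:mysql://user:secret@localhost:3306/mydb",
    "properties": {"user": "user", "password": "secret", "driver": "com.mysql.cj.jdbc.Driver"},
}

# "some_table" is a placeholder; a "(subquery) AS t" expression also works here.
df = spark.read.jdbc(url=params["url"], table="some_table", properties=params["properties"])
df.show()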
app/main/engine/service/Process.py

from typing import Dict, Any
import time
import traceback as traceback_lib
import importlib
...
@@ -26,7 +25,7 @@ class Process:
             db_params = cfg.db_params
             source = Database(self.app, db_params)
             db_session = source.get_session()
             print("1")
             # Getting the script name
             script_name = source.get_action_by_identifier(self.descriptor["idScript"], db_session)
             if isinstance(script_name, type(None)):
...
@@ -47,16 +46,16 @@ class Process:
             # Starting process
             self.app.logger.info(f"Iniciando procesamiento de script")
-            obj_script.process(source)
+            obj_script.process(source, script_name, cfg.timezone, cfg.time_pattern)
             print("1")
             # Saving result
             self.app.logger.info(f"Generado y guardando resultado")
             response = obj_script.response()
             # _ = obj_script.response()
             # response.show()
-            result = self.utils.create_result(response, self.descriptor)
-            save = self.utils.save_result(result, self.descriptor, db_session)
-            if save["status"] == StatusEnum.ERROR.name:
-                raise InterruptedError(save["message"])
+            # result = self.utils.create_result(response, self.descriptor)
+            # save = self.utils.save_result(result, self.descriptor, db_session)
+            # if save["status"] == StatusEnum.ERROR.name:
+            #     raise InterruptedError(save["message"])
         except TimeoutError as e:
             self.app.logger.error(f"Error de Timeout. Error: {e}")
             status, status_description = CodeResponseEnum.TIMEOUT, str(e)
...
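Pieced together, the flow around these hunks is: resolve the script by id, load it dynamically, run it with the new arguments, then persist the response (a step this commit comments out). A linearized sketch under assumed names; only the calls visible in the hunks above are verbatim:

import importlib.util

# Loader mechanics and class name are assumptions based on the importlib import.
spec = importlib.util.spec_from_file_location("action", f"scripts/{script_name}")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
obj_script = module.CustomAction(app)  # hypothetical class and constructor

obj_script.process(source, script_name, cfg.timezone, cfg.time_pattern)
response = obj_script.response()

# Persistence path, commented out by this commit:
# result = utils.create_result(response, descriptor)
# save = utils.save_result(result, descriptor, db_session)
# if save["status"] == StatusEnum.ERROR.name:
#     raise InterruptedError(save["message"])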
app/main/engine/util/EMRServerless.py  (new file, 0 → 100644)

import gzip
from typing import Dict, Any, Optional

import boto3


class EMRServerless:
    """
    An example implementation of running a PySpark job on EMR Serverless.
    This class provides support for creating an EMR Serverless Spark application, running a job,
    fetching driver logs, and shutting the application back down.

    By default, all calls are synchronous in that they wait for the Application to reach the desired state.
    - `create_application` waits for the application to reach the `CREATED` state.
    - `start_application` waits for the `STARTED` state.
    - `stop_application` waits for the `STOPPED` state.
    - `run_spark_job` waits until the job is in a terminal state.
    """

    def __init__(self, application_id: Optional[str] = None, search_app: bool = False) -> None:
        self.application_id = application_id
        self.s3_log_prefix = "emr-serverless-logs"
        self.app_type = "SPARK"  # EMR Serverless also supports jobs of type 'HIVE'
        self.client = boto3.client("emr-serverless")
        self.search_app = search_app

    def __str__(self):
        return f"EMR Serverless {self.app_type} Application: {self.application_id}"

    def valid_application(self) -> Dict[str, Any]:
        """
        Check whether an application has already been created or started and, if so, return it.
        :return: dict with an "exists" flag and, when found, the application id under "app".
        """
        response = {"exists": False}
        if self.search_app:
            applications = self.client.list_applications()["applications"]
            print(applications)
            if len(applications) > 0:
                # Take the first application registered in the account/region.
                response["exists"] = True
                application = applications[0]
                application = application["id"]
                response["app"] = application
        return response

    def create_application(self, name: str, release_label: str, args: dict, wait: bool = True):
        """
        Create a new application with the provided name and release_label - the application needs to be started after.
        """
        if self.application_id is not None:
            raise Exception(f"Application already created (application_id: `{self.application_id}`)")

        initial_capacity = args["initial_capacity"]
        maximum_capacity = args["maximun_capacity"]  # (sic) key spelling expected from callers
        networkConfiguration = args["networkConfiguration"]
        imageConfiguration = args["imageConfiguration"]

        response = self.client.create_application(
            name=name,
            releaseLabel=release_label,
            type=self.app_type,
            initialCapacity=initial_capacity,
            maximumCapacity=maximum_capacity,
            networkConfiguration=networkConfiguration,
            imageConfiguration=imageConfiguration,
        )
        self.application_id = response.get("applicationId")

        # Poll until the application reports CREATED.
        app_ready = False
        while wait and not app_ready:
            response = self.client.get_application(applicationId=self.application_id)
            app_ready = response.get("application").get("state") == "CREATED"

    def start_application(self, wait: bool = True) -> None:
        """
        Start the application - by default, wait until the application is started.
        """
        if self.application_id is None:
            raise Exception("No application_id - please use create_application first.")

        self.client.start_application(applicationId=self.application_id)

        app_started = False
        while wait and not app_started:
            response = self.client.get_application(applicationId=self.application_id)
            app_started = response.get("application").get("state") == "STARTED"

    def stop_application(self, wait: bool = True) -> None:
        """
        Stop the application - by default, wait until the application is stopped.
        """
        self.client.stop_application(applicationId=self.application_id)

        app_stopped = False
        while wait and not app_stopped:
            response = self.client.get_application(applicationId=self.application_id)
            app_stopped = response.get("application").get("state") == "STOPPED"

    def delete_application(self) -> None:
        """
        Delete the application - it must be stopped first.
        """
        self.client.delete_application(applicationId=self.application_id)

    def run_spark_job(
        self,
        script_location: str,
        job_role_arn: str,
        arguments: list,
        sparkArguments: dict,
        s3_bucket_name: str,
        wait: bool = True,
    ) -> str:
        """
        Runs the Spark job identified by `script_location`. Arguments can also be provided via the `arguments` parameter.
        By default, spark-submit parameters are hard-coded and logs are sent to the provided s3_bucket_name.
        This method is blocking by default until the job is complete.
        """
        # Assemble spark-submit parameters from the provided sparkArguments dict.
        spark_args = "--conf spark.driver.cores=" + str(sparkArguments["driver-cores"])
        spark_args += " --conf spark.driver.memory=" + str(sparkArguments["driver-memory"])
        spark_args += " --conf spark.executor.cores=" + str(sparkArguments["executor-cores"])
        spark_args += " --conf spark.executor.memory=" + str(sparkArguments["executor-memory"])
        spark_args += " --conf spark.executor.instances=" + str(sparkArguments["executor-instances"])
        spark_args += " " + sparkArguments["others"]
        response = self.client.start_job_run(
            applicationId=self.application_id,
            executionRoleArn=job_role_arn,
            jobDriver={
                "sparkSubmit": {
                    "entryPoint": script_location,
                    "entryPointArguments": arguments,
                    "sparkSubmitParameters": spark_args,
                }
            },
            configurationOverrides={
                "monitoringConfiguration": {
                    "s3MonitoringConfiguration": {"logUri": f"s3://{s3_bucket_name}/{self.s3_log_prefix}"}
                }
            },
        )
        job_run_id = response.get("jobRunId")

        # Poll until the job reaches a terminal state.
        job_done = False
        while wait and not job_done:
            jr_response = self.get_job_run(job_run_id)
            job_done = jr_response.get("state") in ["SUCCESS", "FAILED", "CANCELLING", "CANCELLED"]
        return job_run_id

    def get_job_run(self, job_run_id: str) -> dict:
        response = self.client.get_job_run(applicationId=self.application_id, jobRunId=job_run_id)
        return response.get("jobRun")

    def fetch_driver_log(self, s3_bucket_name: str, job_run_id: str, log_type: str = "stdout") -> str:
        """
        Access the specified `log_type` Driver log on S3 and return the full log string.
        """
        s3_client = boto3.client("s3")
        file_location = f"{self.s3_log_prefix}/applications/{self.application_id}/jobs/{job_run_id}/SPARK_DRIVER/{log_type}.gz"
        try:
            response = s3_client.get_object(Bucket=s3_bucket_name, Key=file_location)
            file_content = gzip.decompress(response["Body"].read()).decode("utf-8")
        except s3_client.exceptions.NoSuchKey:
            file_content = ""
        return str(file_content)
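For orientation, a full lifecycle with this helper would look roughly like the sketch below. All names, ARNs, capacities, and bucket values are placeholders; the args dict follows the boto3 create_application schema, including this class's "maximun_capacity" key spelling:

emr = EMRServerless(search_app=True)

# Reuse a previously created application when one exists.
found = emr.valid_application()
if found["exists"]:
    emr.application_id = found["app"]
else:
    worker = {"workerCount": 2, "workerConfiguration": {"cpu": "2vCPU", "memory": "4GB"}}
    emr.create_application(
        name="css-engine-demo",  # placeholder
        release_label="emr-7.0.0",
        args={
            "initial_capacity": {"DRIVER": worker, "EXECUTOR": worker},
            "maximun_capacity": {"cpu": "16vCPU", "memory": "64GB"},  # (sic) key used by the class
            "networkConfiguration": {"subnetIds": ["subnet-0abc"], "securityGroupIds": ["sg-0abc"]},
            "imageConfiguration": {"imageUri": "123456789012.dkr.ecr.us-east-1.amazonaws.com/custom:latest"},
        },
    )
emr.start_application()

job_run_id = emr.run_spark_job(
    script_location="s3://my-bucket/scripts/emr_match-and-exclude-records-actions_v1.py",
    job_role_arn="arn:aws:iam::123456789012:role/emr-serverless-job-role",  # placeholder
    arguments=[],
    sparkArguments={
        "driver-cores": 2,
        "driver-memory": "4g",
        "executor-cores": 2,
        "executor-memory": "4g",
        "executor-instances": 2,
        "others": "",
    },
    s3_bucket_name="my-bucket",
)
print(emr.fetch_driver_log("my-bucket", job_run_id))
emr.stop_application()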
deploy/Dockerfile  (new file, 0 → 100644)

FROM public.ecr.aws/emr-serverless/spark/emr-7.0.0:latest

USER root

# install python 3
RUN yum install -y gcc openssl-devel bzip2-devel libffi-devel tar gzip wget make zlib-devel
RUN wget https://www.python.org/ftp/python/3.10.0/Python-3.10.0.tgz && \
    tar xzf Python-3.10.0.tgz && cd Python-3.10.0 && \
    ./configure --enable-optimizations && \
    make altinstall

COPY subset_sum_linux /tmp/
COPY requirements.txt /

RUN python3 -m pip install numpy pandas py4j python-dateutil pytz six tzdata

# EMRS will run the image as hadoop
USER hadoop:hadoop
\ No newline at end of file
run.py

...
@@ -6,4 +6,4 @@ base = MainApplication()
 app = base.create_app()
 
 if __name__ == "__main__":
-    base.run(port=8000)
+    base.run(port=7500)
scripts/emr_match-and-exclude-records-actions_v1.py  (new file, 0 → 100644)

This diff is collapsed.

scripts/match-and-exclude-records-actions_v1.py

This diff is collapsed.