Proyectos-Innovacion-2024 / CSS-Engine-Python-Cusca · Commits
Commit 7086b4fa, authored May 08, 2024 by Cristian Aguirre

Update action-exclude-records-v1-emr

parent 9213ca48

Showing 8 changed files with 676 additions and 254 deletions (+676 -254)
app/main/engine/action/ActionInterface.py            +1   -1
app/main/engine/database/Mysql.py                    +1   -1
app/main/engine/service/Process.py                   +8   -9
app/main/engine/util/EMRServerless.py                +175 -0
deploy/Dockerfile                                    +18  -0
run.py                                               +1   -1
scripts/emr_match-and-exclude-records-actions_v1.py  +366 -0
scripts/match-and-exclude-records-actions_v1.py      +106 -242
app/main/engine/action/ActionInterface.py

...
@@ -14,7 +14,7 @@ class ActionInterface(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def process(self, source_obj):
+    def process(self, source_obj, script_name, timezone, pattern):
         """Method that executes the script's logic"""
         raise NotImplementedError
...
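The widened signature means every concrete action now receives the resolved script name plus the configured timezone and timestamp pattern. A minimal sketch of a conforming implementation follows; the class name, field names, and example values are illustrative assumptions, not code from this repository:

from datetime import datetime

import pytz

from app.main.engine.action.ActionInterface import ActionInterface


class SampleAction(ActionInterface):
    """Hypothetical action illustrating the new process() contract."""

    def process(self, source_obj, script_name, timezone, pattern):
        # Assumed meanings: timezone is an IANA name (e.g. "America/Lima"),
        # pattern a strftime format (e.g. "%Y-%m-%d %H:%M:%S"), and
        # source_obj the engine's database wrapper.
        stamp = datetime.now(pytz.timezone(timezone)).strftime(pattern)
        self.result = f"{script_name} executed at {stamp}"

    def response(self):
        return self.result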
app/main/engine/database/Mysql.py

...
@@ -28,7 +28,7 @@ class Mysql:
     def create_spark_connection(self):
         params = {}
         try:
-            url = "jdbc:mysql://" + self.host + ":" + str(self.port) + "/" + self.database
+            url = "jdbc:mysql://" + self.user + ":" + self.password + "@" + self.host + ":" + str(self.port) + "/" + self.database
             properties = {"user": self.user, "password": self.password, "driver": "com.mysql.cj.jdbc.Driver"}
             params["url"] = url
             params["properties"] = properties
...
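Note that the new URL embeds the credentials even though they are also passed separately in properties. For reference, the returned params dict is presumably consumed by a Spark JDBC read along these lines (a sketch with placeholder values; this usage is assumed, not shown in the commit):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("jdbc-read-sketch").getOrCreate()

# Shape returned by Mysql.create_spark_connection(); all values are placeholders.
params = {
    "url": "jdbc:mysql://user:secret@localhost:3306/mydb",
    "properties": {"user": "user", "password": "secret", "driver": "com.mysql.cj.jdbc.Driver"},
}

# "some_table" is a placeholder; a "(subquery) AS t" expression also works here.
df = spark.read.jdbc(url=params["url"], table="some_table", properties=params["properties"])
df.show()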
app/main/engine/service/Process.py

from typing import Dict, Any
import time
import traceback as traceback_lib
import importlib
...
@@ -26,7 +25,7 @@ class Process:
             db_params = cfg.db_params
             source = Database(self.app, db_params)
             db_session = source.get_session()
             print("1")
             # Getting the script name
             script_name = source.get_action_by_identifier(self.descriptor["idScript"], db_session)
             if isinstance(script_name, type(None)):
...
@@ -47,16 +46,16 @@ class Process:
             # Starting process
             self.app.logger.info(f"Iniciando procesamiento de script")
-            obj_script.process(source)
+            obj_script.process(source, script_name, cfg.timezone, cfg.time_pattern)
             print("1")
             # Saving result
             self.app.logger.info(f"Generado y guardando resultado")
             response = obj_script.response()
             # _ = obj_script.response()
             # response.show()
-            result = self.utils.create_result(response, self.descriptor)
-            save = self.utils.save_result(result, self.descriptor, db_session)
-            if save["status"] == StatusEnum.ERROR.name:
-                raise InterruptedError(save["message"])
+            # result = self.utils.create_result(response, self.descriptor)
+            # save = self.utils.save_result(result, self.descriptor, db_session)
+            # if save["status"] == StatusEnum.ERROR.name:
+            #     raise InterruptedError(save["message"])
         except TimeoutError as e:
             self.app.logger.error(f"Error de Timeout. Error: {e}")
             status, status_description = CodeResponseEnum.TIMEOUT, str(e)
...
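Pieced together, the flow around these hunks is: resolve the script by id, load it dynamically, run it with the new arguments, then persist the response (a step this commit comments out). A linearized sketch under assumed names; only the calls visible in the hunks above are verbatim:

import importlib.util

# Loader mechanics and class name are assumptions based on the importlib import.
spec = importlib.util.spec_from_file_location("action", f"scripts/{script_name}")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
obj_script = module.CustomAction(app)  # hypothetical class and constructor

obj_script.process(source, script_name, cfg.timezone, cfg.time_pattern)
response = obj_script.response()

# Persistence path, commented out by this commit:
# result = utils.create_result(response, descriptor)
# save = utils.save_result(result, descriptor, db_session)
# if save["status"] == StatusEnum.ERROR.name:
#     raise InterruptedError(save["message"])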
app/main/engine/util/EMRServerless.py  (new file, 0 → 100644)

import gzip
from typing import Dict, Any, Optional

import boto3


class EMRServerless:
    """
    An example implementation of running a PySpark job on EMR Serverless.
    This class provides support for creating an EMR Serverless Spark application, running a job,
    fetching driver logs, and shutting the application back down.

    By default, all calls are synchronous in that they wait for the Application to reach the desired state.
    - `create_application` waits for the application to reach the `CREATED` state.
    - `start_application` waits for the `STARTED` state.
    - `stop_application` waits for the `STOPPED` state.
    - `run_spark_job` waits until the job is in a terminal state.
    """

    def __init__(self, application_id: Optional[str] = None, search_app: bool = False) -> None:
        self.application_id = application_id
        self.s3_log_prefix = "emr-serverless-logs"
        self.app_type = "SPARK"  # EMR Serverless also supports jobs of type 'HIVE'
        self.client = boto3.client("emr-serverless")
        self.search_app = search_app

    def __str__(self):
        return f"EMR Serverless {self.app_type} Application: {self.application_id}"

    def valid_application(self) -> Dict[str, Any]:
        """
        Check whether an application has already been created or started and, if so, return it.
        :return: dict with an "exists" flag and, when found, the application id under "app".
        """
        response = {"exists": False}
        if self.search_app:
            applications = self.client.list_applications()["applications"]
            print(applications)
            if len(applications) > 0:
                # Take the first application registered in the account/region.
                response["exists"] = True
                application = applications[0]
                application = application["id"]
                response["app"] = application
        return response

    def create_application(self, name: str, release_label: str, args: dict, wait: bool = True):
        """
        Create a new application with the provided name and release_label - the application needs to be started after.
        """
        if self.application_id is not None:
            raise Exception(f"Application already created (application_id: `{self.application_id}`)")

        initial_capacity = args["initial_capacity"]
        maximum_capacity = args["maximun_capacity"]  # (sic) key spelling expected from callers
        networkConfiguration = args["networkConfiguration"]
        imageConfiguration = args["imageConfiguration"]

        response = self.client.create_application(
            name=name,
            releaseLabel=release_label,
            type=self.app_type,
            initialCapacity=initial_capacity,
            maximumCapacity=maximum_capacity,
            networkConfiguration=networkConfiguration,
            imageConfiguration=imageConfiguration,
        )
        self.application_id = response.get("applicationId")

        # Poll until the application reports CREATED.
        app_ready = False
        while wait and not app_ready:
            response = self.client.get_application(applicationId=self.application_id)
            app_ready = response.get("application").get("state") == "CREATED"

    def start_application(self, wait: bool = True) -> None:
        """
        Start the application - by default, wait until the application is started.
        """
        if self.application_id is None:
            raise Exception("No application_id - please use create_application first.")

        self.client.start_application(applicationId=self.application_id)

        app_started = False
        while wait and not app_started:
            response = self.client.get_application(applicationId=self.application_id)
            app_started = response.get("application").get("state") == "STARTED"

    def stop_application(self, wait: bool = True) -> None:
        """
        Stop the application - by default, wait until the application is stopped.
        """
        self.client.stop_application(applicationId=self.application_id)

        app_stopped = False
        while wait and not app_stopped:
            response = self.client.get_application(applicationId=self.application_id)
            app_stopped = response.get("application").get("state") == "STOPPED"

    def delete_application(self) -> None:
        """
        Delete the application - it must be stopped first.
        """
        self.client.delete_application(applicationId=self.application_id)

    def run_spark_job(
        self,
        script_location: str,
        job_role_arn: str,
        arguments: list,
        sparkArguments: dict,
        s3_bucket_name: str,
        wait: bool = True,
    ) -> str:
        """
        Runs the Spark job identified by `script_location`. Arguments can also be provided via the `arguments` parameter.
        By default, spark-submit parameters are hard-coded and logs are sent to the provided s3_bucket_name.
        This method is blocking by default until the job is complete.
        """
        # Assemble spark-submit parameters from the provided sparkArguments dict.
        spark_args = "--conf spark.driver.cores=" + str(sparkArguments["driver-cores"])
        spark_args += " --conf spark.driver.memory=" + str(sparkArguments["driver-memory"])
        spark_args += " --conf spark.executor.cores=" + str(sparkArguments["executor-cores"])
        spark_args += " --conf spark.executor.memory=" + str(sparkArguments["executor-memory"])
        spark_args += " --conf spark.executor.instances=" + str(sparkArguments["executor-instances"])
        spark_args += " " + sparkArguments["others"]
        response = self.client.start_job_run(
            applicationId=self.application_id,
            executionRoleArn=job_role_arn,
            jobDriver={
                "sparkSubmit": {
                    "entryPoint": script_location,
                    "entryPointArguments": arguments,
                    "sparkSubmitParameters": spark_args,
                }
            },
            configurationOverrides={
                "monitoringConfiguration": {
                    "s3MonitoringConfiguration": {"logUri": f"s3://{s3_bucket_name}/{self.s3_log_prefix}"}
                }
            },
        )
        job_run_id = response.get("jobRunId")

        # Poll until the job reaches a terminal state.
        job_done = False
        while wait and not job_done:
            jr_response = self.get_job_run(job_run_id)
            job_done = jr_response.get("state") in ["SUCCESS", "FAILED", "CANCELLING", "CANCELLED"]
        return job_run_id

    def get_job_run(self, job_run_id: str) -> dict:
        response = self.client.get_job_run(applicationId=self.application_id, jobRunId=job_run_id)
        return response.get("jobRun")

    def fetch_driver_log(self, s3_bucket_name: str, job_run_id: str, log_type: str = "stdout") -> str:
        """
        Access the specified `log_type` Driver log on S3 and return the full log string.
        """
        s3_client = boto3.client("s3")
        file_location = f"{self.s3_log_prefix}/applications/{self.application_id}/jobs/{job_run_id}/SPARK_DRIVER/{log_type}.gz"
        try:
            response = s3_client.get_object(Bucket=s3_bucket_name, Key=file_location)
            file_content = gzip.decompress(response["Body"].read()).decode("utf-8")
        except s3_client.exceptions.NoSuchKey:
            file_content = ""
        return str(file_content)
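For orientation, a full lifecycle with this helper would look roughly like the sketch below. All names, ARNs, capacities, and bucket values are placeholders; the args dict follows the boto3 create_application schema, including this class's "maximun_capacity" key spelling:

emr = EMRServerless(search_app=True)

# Reuse a previously created application when one exists.
found = emr.valid_application()
if found["exists"]:
    emr.application_id = found["app"]
else:
    worker = {"workerCount": 2, "workerConfiguration": {"cpu": "2vCPU", "memory": "4GB"}}
    emr.create_application(
        name="css-engine-demo",  # placeholder
        release_label="emr-7.0.0",
        args={
            "initial_capacity": {"DRIVER": worker, "EXECUTOR": worker},
            "maximun_capacity": {"cpu": "16vCPU", "memory": "64GB"},  # (sic) key used by the class
            "networkConfiguration": {"subnetIds": ["subnet-0abc"], "securityGroupIds": ["sg-0abc"]},
            "imageConfiguration": {"imageUri": "123456789012.dkr.ecr.us-east-1.amazonaws.com/custom:latest"},
        },
    )
emr.start_application()

job_run_id = emr.run_spark_job(
    script_location="s3://my-bucket/scripts/emr_match-and-exclude-records-actions_v1.py",
    job_role_arn="arn:aws:iam::123456789012:role/emr-serverless-job-role",  # placeholder
    arguments=[],
    sparkArguments={
        "driver-cores": 2,
        "driver-memory": "4g",
        "executor-cores": 2,
        "executor-memory": "4g",
        "executor-instances": 2,
        "others": "",
    },
    s3_bucket_name="my-bucket",
)
print(emr.fetch_driver_log("my-bucket", job_run_id))
emr.stop_application()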
deploy/Dockerfile  (new file, 0 → 100644)

FROM public.ecr.aws/emr-serverless/spark/emr-7.0.0:latest

USER root

# install python 3
RUN yum install -y gcc openssl-devel bzip2-devel libffi-devel tar gzip wget make zlib-devel
RUN wget https://www.python.org/ftp/python/3.10.0/Python-3.10.0.tgz && \
    tar xzf Python-3.10.0.tgz && cd Python-3.10.0 && \
    ./configure --enable-optimizations && \
    make altinstall

COPY subset_sum_linux /tmp/
COPY requirements.txt /

RUN python3 -m pip install numpy pandas py4j python-dateutil pytz six tzdata

# EMRS will run the image as hadoop
USER hadoop:hadoop
\ No newline at end of file
run.py

...
@@ -6,4 +6,4 @@ base = MainApplication()
 app = base.create_app()
 
 if __name__ == "__main__":
-    base.run(port=8000)
+    base.run(port=7500)
scripts/emr_match-and-exclude-records-actions_v1.py  (new file, 0 → 100644)

This diff is collapsed.

scripts/match-and-exclude-records-actions_v1.py

This diff is collapsed.