Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add hyperparams in stack report. #114

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions f8a_report/report_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,16 @@ def __init__(self):
'PYPI_TRAINING_REPO', 'https://github.com/fabric8-analytics/f8a-pypi-insights')

self.emr_api = os.getenv('EMR_API', 'http://f8a-emr-deployment:6006')
self.maven_hyperparams = {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the use of same default values as in for maven and pypi?

"lower_limit": 13,
"upper_limit": 15,
"latent_factor": 300
}
self.pypi_hyperparams = {
"lower_limit": 1,
"upper_limit": 415,
"latent_factor": 40
}

def validate_and_process_date(self, some_date):
"""Validate the date format and apply the format YYYY-MM-DDTHH:MI:SSZ."""
Expand Down Expand Up @@ -174,19 +184,20 @@ def collate_raw_data(self, unique_stacks_with_recurrence_count, frequency):
result[eco] = {"bigquery_data": collated_big_query_data.get(eco)}
return result

def invoke_emr_api(self, bucket_name, ecosystem, data_version, github_repo):
def invoke_emr_api(self, bucket_name, ecosystem, data_version, github_repo, hyperparams):
"""Invoke EMR Retraining API to start the retraining process."""
payload = {
'bucket_name': bucket_name,
'github_repo': github_repo,
'ecosystem': ecosystem,
'data_version': data_version
'data_version': data_version,
'hyperparams': hyperparams
}

logger.info('bucket_name for {eco}: {buck}'.format(eco=ecosystem, buck=bucket_name))
logger.info('github_repo for {eco}: {git}'.format(eco=ecosystem, git=github_repo))
logger.info('data_version for {eco}: {data}'.format(eco=ecosystem, data=data_version))

logger.info('Hyperparametrs for {eco}: {data}'.format(eco=ecosystem, data=hyperparams))
try:
# Invoke EMR API to run the retraining
resp = requests.post(url=self.emr_api + '/api/v1/runjob', json=payload)
Expand Down Expand Up @@ -237,10 +248,12 @@ def store_training_data(self, result):
if eco == 'maven':
bucket_name = self.maven_model_bucket
github_repo = self.maven_training_repo
hyperparams = self.maven_hyperparams
logger.info('maven bucket name is: {bucket}'.format(bucket=bucket_name))
elif eco == 'pypi':
bucket_name = self.pypi_model_bucket
github_repo = self.pypi_training_repo
hyperparams = self.pypi_hyperparams
logger.info('pypi bucket name is: {bucket}'.format(bucket=bucket_name))
elif eco == 'go':
bucket_name = self.golang_model_bucket
Expand All @@ -263,7 +276,7 @@ def store_training_data(self, result):
# Invoke the EMR API to kickstart retraining process
# This EMR invocation happens for all ecosystems almost at the same time.
# TODO - find an alternative if there is a need
self.invoke_emr_api(bucket_name, eco, model_version, github_repo)
self.invoke_emr_api(bucket_name, eco, model_version, github_repo, hyperparams)
except Exception as e:
logger.error('Unable to invoke EMR API. Reason: %r' % e)

Expand Down
12 changes: 10 additions & 2 deletions tests/test_stack_report_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,15 +332,23 @@ def json(self):
@mock.patch('requests.post', side_effect=mock_emr_api)
def test_invoke_emr_api_success(_mock):
"""Test invoke emr api function with status 200."""
result = r.invoke_emr_api('test-bucket', 'maven', '2019-01-03', 'http://github.com/test/test')
result = r.invoke_emr_api('test-bucket', 'maven', '2019-01-03', 'http://github.com/test/test', {
"lower_limit": 13,
"upper_limit": 15,
"latent_factor": 300
})

assert result is None


@mock.patch('requests.post', side_effect=mock_emr_api_fail)
def test_invoke_emr_api_failure(_mock):
"""Test invoke emr api with status 400."""
result = r.invoke_emr_api('test-bucket', 'maven', '2019-01-03', 'http://github.com/test/test')
result = r.invoke_emr_api('test-bucket', 'maven', '2019-01-03', 'http://github.com/test/test', {
"lower_limit": 13,
"upper_limit": 15,
"latent_factor": 300
})

assert result is None

Expand Down