Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UID2-4719 change azure cc starting process #1260

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/publish-azure-cc-enclave-docker.yaml
Original file line number Diff line number Diff line change
@@ -92,6 +92,7 @@ jobs:
echo "jar_version=$(mvn help:evaluate -Dexpression=project.version | grep -e '^[1-9][^\[]')" >> $GITHUB_OUTPUT
echo "git_commit=$(git show --format="%h" --no-patch)" >> $GITHUB_OUTPUT
cp -r target ${{ env.DOCKER_CONTEXT_PATH }}/
cp scripts/confidential_compute.py ${{ env.DOCKER_CONTEXT_PATH }}/

- name: Log in to the Docker container registry
uses: docker/login-action@v3
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@

<groupId>com.uid2</groupId>
<artifactId>uid2-operator</artifactId>
<version>5.44.6</version>
<version>5.45.7-alpha-166-SNAPSHOT</version>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
31 changes: 15 additions & 16 deletions scripts/aws/ec2.py
Original file line number Diff line number Diff line change
@@ -51,7 +51,8 @@ def get_meta_url(cls) -> str:
class EC2(ConfidentialCompute):

def __init__(self):
super().__init__()
self.configs: AWSConfidentialComputeConfig = {}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this here? self.configs {} is defined in base class



def __get_aws_token(self) -> str:
"""Fetches a temporary AWS EC2 metadata token."""
@@ -74,34 +75,32 @@ def __get_current_region(self) -> str:
except requests.RequestException as e:
raise RuntimeError(f"Failed to fetch region: {e}")

def __validate_aws_specific_config(self, secret):
if "enclave_memory_mb" in secret or "enclave_cpu_count" in secret:
def __validate_aws_specific_config(self):
if "enclave_memory_mb" in self.configs or "enclave_cpu_count" in self.configs:
max_capacity = self.__get_max_capacity()
min_capacity = {"enclave_memory_mb": 11000, "enclave_cpu_count" : 2 }
for key in ["enclave_memory_mb", "enclave_cpu_count"]:
if int(secret.get(key, 0)) > max_capacity.get(key):
raise ValueError(f"{key} value ({secret.get(key, 0)}) exceeds the maximum allowed ({max_capacity.get(key)}).")
if min_capacity.get(key) > int(secret.get(key, 10**9)):
raise ValueError(f"{key} value ({secret.get(key, 0)}) needs to be higher than the minimum required ({min_capacity.get(key)}).")
if int(self.configs.get(key, 0)) > max_capacity.get(key):
raise ValueError(f"{key} value ({self.configs.get(key, 0)}) exceeds the maximum allowed ({max_capacity.get(key)}).")
if min_capacity.get(key) > int(self.configs.get(key, 10**9)):
raise ValueError(f"{key} value ({self.configs.get(key, 0)}) needs to be higher than the minimum required ({min_capacity.get(key)}).")

def _get_secret(self, secret_identifier: str) -> AWSConfidentialComputeConfig:
def _set_secret(self, secret_identifier: str) -> None:
"""Fetches a secret value from AWS Secrets Manager and adds defaults"""

def add_defaults(configs: Dict[str, any]) -> AWSConfidentialComputeConfig:
def add_defaults(configs: Dict[str, any]) -> None:
"""Adds default values to configuration if missing."""
default_capacity = self.__get_max_capacity()
configs.setdefault("enclave_memory_mb", default_capacity["enclave_memory_mb"])
configs.setdefault("enclave_cpu_count", default_capacity["enclave_cpu_count"])
configs.setdefault("debug_mode", False)
return configs

region = self.__get_current_region()
print(f"Running in {region}")
client = boto3.client("secretsmanager", region_name=region)
try:
secret = add_defaults(json.loads(client.get_secret_value(SecretId=secret_identifier)["SecretString"]))
self.__validate_aws_specific_config(secret)
return secret
add_defaults(json.loads(client.get_secret_value(SecretId=secret_identifier)["SecretString"]))
self.__validate_aws_specific_config()
except NoCredentialsError as _:
raise MissingInstanceProfile(self.__class__.__name__)
except ClientError as _:
@@ -137,7 +136,7 @@ def __run_config_server(self) -> None:
json.dump(self.configs, config_file)
os.chdir("/opt/uid2operator/config-server")
command = ["./bin/flask", "run", "--host", AuxiliaryConfig.LOCALHOST, "--port", AuxiliaryConfig.FLASK_PORT]
self.run_command(command, seperate_process=True)
self.run_command(command, separate_process=True)

def __run_socks_proxy(self) -> None:
"""
@@ -205,12 +204,12 @@ def __run_nitro_enclave(self):
if self.configs.get('debug_mode', False):
print("Running in debug_mode")
command += ["--debug-mode", "--attach-console"]
self.run_command(command, seperate_process=True)
self.run_command(command, separate_process=True)

def run_compute(self) -> None:
"""Main execution flow for confidential compute."""
secret_manager_key = self.__get_secret_name_from_userdata()
self.configs = self._get_secret(secret_manager_key)
self._set_secret(secret_manager_key)
print(f"Fetched configs from {secret_manager_key}")
if not self.configs.get("skip_validations"):
self.validate_configuration()
37 changes: 29 additions & 8 deletions scripts/azure-cc/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
# sha from https://hub.docker.com/layers/amd64/eclipse-temurin/21.0.4_7-jre-alpine/images/sha256-8179ddc8a6c5ac9af935020628763b9a5a671e0914976715d2b61b21881cefca
# Use Alpine-based JRE image
FROM eclipse-temurin@sha256:8179ddc8a6c5ac9af935020628763b9a5a671e0914976715d2b61b21881cefca

# Install Packages
RUN apk update && apk add jq
# Install necessary packages and set up virtual environment
RUN apk update && apk add --no-cache jq python3 py3-pip && \
python3 -m venv /venv && \
. /venv/bin/activate && \
pip install --no-cache-dir requests azure-identity azure-keyvault-secrets && \
rm -rf /var/cache/apk/*

# Set virtual environment path
ENV PATH="/venv/bin:$PATH"

# Working directory
WORKDIR /app

# Expose necessary ports
EXPOSE 8080
EXPOSE 9080

# ARG and ENV variables
ARG JAR_NAME=uid2-operator
ARG JAR_VERSION=1.0.0-SNAPSHOT
ARG IMAGE_VERSION=1.0.0.unknownhash
@@ -17,18 +28,28 @@ ENV IMAGE_VERSION=${IMAGE_VERSION}
ENV REGION=default
ENV LOKI_HOSTNAME=loki

# Copy application files
COPY ./target/${JAR_NAME}-${JAR_VERSION}-jar-with-dependencies.jar /app/${JAR_NAME}-${JAR_VERSION}.jar
COPY ./target/${JAR_NAME}-${JAR_VERSION}-sources.jar /app
COPY ./target/${JAR_NAME}-${JAR_VERSION}-static.tar.gz /app/static.tar.gz
COPY ./conf/*.json /app/conf/
COPY ./conf/*.xml /app/conf/

RUN tar xzvf /app/static.tar.gz --no-same-owner --no-same-permissions && rm -f /app/static.tar.gz
# Extract and clean up tar.gz
RUN tar xzvf /app/static.tar.gz --no-same-owner --no-same-permissions && \
rm -f /app/static.tar.gz

COPY ./azureEntryPoint.py /app
COPY ./confidential_compute.py /app
RUN chmod a+x /app/*.py

COPY ./entrypoint.sh /app/
RUN chmod a+x /app/entrypoint.sh
# Create and configure non-root user
RUN adduser -D uid2-operator && \
mkdir -p /opt/uid2 && chmod 777 -R /opt/uid2 && \
chmod 705 -R /app && mkdir -p /app/file-uploads && chmod 777 -R /app/file-uploads

RUN adduser -D uid2-operator && mkdir -p /opt/uid2 && chmod 777 -R /opt/uid2 && mkdir -p /app && chmod 705 -R /app && mkdir -p /app/file-uploads && chmod 777 -R /app/file-uploads
# Switch to non-root user
USER uid2-operator

CMD ["/app/entrypoint.sh"]
# Run the Python entry point
CMD python3 /app/azureEntryPoint.py
176 changes: 176 additions & 0 deletions scripts/azure-cc/azureEntryPoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
#!/usr/bin/env python3

import json
import os
import time
from typing import Dict
import sys
import shutil
import requests
import logging

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from confidential_compute import ConfidentialCompute, ConfidentialComputeConfig, MissingConfig, ConfidentialComputeStartupException
from azure.identity import DefaultAzureCredential, CredentialUnavailableError
from azure.keyvault.secrets import SecretClient
from azure.core.exceptions import ResourceNotFoundError, HttpResponseError

class AzureEntryPoint(ConfidentialCompute):

Copy link
Contributor

@abuabraham-ttd abuabraham-ttd Jan 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Change this to Azure? This class name is used to create Errors/point to docs in ConfidentialComputeStartupException

# Deployment inputs, captured from the process environment once, when the
# class body is evaluated. The first three have no fallback and are None
# when the variable is not set.
kv_name = os.environ.get("VAULT_NAME")
secret_name = os.environ.get("OPERATOR_KEY_SECRET_NAME")
env_name = os.environ.get("DEPLOYMENT_ENVIRONMENT")
# JAR coordinates fall back to placeholder values when not provided.
jar_name = os.environ.get("JAR_NAME", "default-jar-name")
jar_version = os.environ.get("JAR_VERSION", "default-jar-version")

# Location of the config file that is assembled at startup.
FINAL_CONFIG = "/tmp/final-config.json"

def __init__(self):
# No Azure-specific state is initialised here; construction is delegated
# entirely to the ConfidentialCompute base-class initialiser.
super().__init__()

def __check_env_variables(self):
    """Verify that every mandatory environment variable has a value.

    The variables checked are VAULT_NAME, OPERATOR_KEY_SECRET_NAME and
    DEPLOYMENT_ENVIRONMENT (captured in the class attributes ``kv_name``,
    ``secret_name`` and ``env_name``).

    All absent variables are reported together so the deployment can be
    fixed in one pass instead of one variable per restart. A variable that
    is set to an empty string is treated as missing, since an empty vault,
    secret or environment name cannot be used later in the startup flow.

    Raises:
        MissingConfig: if one or more of the variables is unset or empty;
            the exception receives this class's name and the list of
            missing variable names.
    """
    required = {
        "VAULT_NAME": AzureEntryPoint.kv_name,
        "OPERATOR_KEY_SECRET_NAME": AzureEntryPoint.secret_name,
        "DEPLOYMENT_ENVIRONMENT": AzureEntryPoint.env_name,
    }
    missing = [name for name, value in required.items() if not value]
    if missing:
        raise MissingConfig(self.__class__.__name__, missing)
    logging.info("Env variables validation success")

def __set_environment(self):
    """Store the deployment environment name under ``configs["environment"]``."""
    deployment_env = AzureEntryPoint.env_name
    self.configs["environment"] = deployment_env

def _set_secret(self, secret_identifier: str = None):
try:
credential = DefaultAzureCredential()
kv_URL = f"https://{AzureEntryPoint.kv_name}.vault.azure.net"
secret_client = SecretClient(vault_url=kv_URL, credential=credential)
secret = secret_client.get_secret(AzureEntryPoint.secret_name)
# print(f"Secret Value: {secret.value}")
self.configs["api_token"] = secret.value

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this only set api_token ? Don't we need ConfidentialComputeConfig to validate all user specific config

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO;

_set_secret should be setting ConfidentialComputeConfig (which contains all the possible customer inputs to create Confidential Compute env for running operators) . That's why I used _get_secret() -> ConfidentialComputeConfig

If we use _set_secret -> None, then there is no way to ensure/understand what values it sets.

except CredentialUnavailableError as auth_error:
logging.error(f"Read operator key, authentication error: {auth_error}")
raise
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

raise custom exception here? That points to public doc? This is the same as Missing Instance Profile in AWS


except ResourceNotFoundError as not_found_error:
logging.error(f"Read operator key, secret not found: {AzureEntryPoint.secret_name}. Error: {not_found_error}")
raise
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

raise custom exception here? That points to public doc? ConfigNotFound


except HttpResponseError as http_error:
logging.error(f"Read operator key, HTTP error occurred: {http_error}")
raise
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When would we have this? Is it tied to vnet creation? If Container Group does not have n/w configured? If so, another custom exception for Azure that points to doc, and explaining what is wrong

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Basically the goal is;

Every issue we know, should be caught as Custom Exception that points to public documentation on how to fix it.

Things we don't know will be a regular exception that mentions how to contact us.


except Exception as e:
logging.error(f"Read operator key, an unexpected error occurred: {e}")
raise

def __create_final_config(self):
# Produces the config file at FINAL_CONFIG from the per-environment template
# /app/conf/<env_name>-uid2-config.json, optionally overriding the core and
# opt-out URLs from the CORE_BASE_URL / OPTOUT_BASE_URL environment variables.
# Terminates the process with status 1 if the template does not exist or
# cannot be copied.
# NOTE(review): indentation is not visible in this view, so the nesting
# described in the comments below is presumed — confirm against the real file.
TARGET_CONFIG = f"/app/conf/{AzureEntryPoint.env_name}-uid2-config.json"
# An environment name without a matching template is fatal.
if not os.path.isfile(TARGET_CONFIG):
logging.error(f"Unrecognized config {TARGET_CONFIG}")
sys.exit(1)

logging.info(f"-- copying {TARGET_CONFIG} to {AzureEntryPoint.FINAL_CONFIG}")
try:
shutil.copy(TARGET_CONFIG, AzureEntryPoint.FINAL_CONFIG)
except IOError as e:
logging.error(f"Failed to create {AzureEntryPoint.FINAL_CONFIG} with error: {e}")
sys.exit(1)

# URL overrides apply only when both variables are set and the environment
# is not 'prod'.
CORE_BASE_URL = os.getenv("CORE_BASE_URL")
OPTOUT_BASE_URL = os.getenv("OPTOUT_BASE_URL")
if CORE_BASE_URL and OPTOUT_BASE_URL and AzureEntryPoint.env_name != 'prod':
logging.info(f"-- replacing URLs by {CORE_BASE_URL} and {OPTOUT_BASE_URL}")
with open(AzureEntryPoint.FINAL_CONFIG, "r") as file:
config = file.read()

# Plain text substitution of the two integ URLs; the file is not parsed as
# JSON at this point.
config = config.replace("https://core-integ.uidapi.com", CORE_BASE_URL)
config = config.replace("https://optout-integ.uidapi.com", OPTOUT_BASE_URL)

with open(AzureEntryPoint.FINAL_CONFIG, "w") as file:
file.write(config)

# Writes the full resulting config to the log.
# NOTE(review): presumably this runs unconditionally (outside the override
# branch) — confirm.
with open(AzureEntryPoint.FINAL_CONFIG, "r") as file:
logging.info(file.read())

def __set_baseurls(self):
    """Copy the core and opt-out endpoints from the final config into ``configs``.

    Parses the JSON file at ``FINAL_CONFIG`` and stores its
    ``core_attest_url`` value as ``configs["core_base_url"]`` and its
    ``optout_api_uri`` value as ``configs["optout_base_url"]``.
    """
    # NOTE(review): the source keys are named as an attest URL / API URI while
    # the target keys are named as base URLs — confirm this mapping is intended.
    with open(AzureEntryPoint.FINAL_CONFIG, "r") as config_file:
        final_config = json.load(config_file)

    core_url = final_config["core_attest_url"]
    self.configs["core_base_url"] = core_url
    optout_url = final_config["optout_api_uri"]
    self.configs["optout_base_url"] = optout_url

def __run_operator(self):
    """Start the Java operator via ``self.run_command``.

    Exports the vault and secret names as the ``azure_vault_name`` and
    ``azure_secret_name`` environment variables, assembles the ``java``
    command line pointing at ``FINAL_CONFIG`` and the operator JAR, and hands
    it to ``run_command`` with ``separate_process=False``.
    """
    # Presumably read by the Java operator from its environment — confirm.
    os.environ.update({
        "azure_vault_name": AzureEntryPoint.kv_name,
        "azure_secret_name": AzureEntryPoint.secret_name,
    })

    jvm_options = [
        "-XX:MaxRAMPercentage=95", "-XX:-UseCompressedOops", "-XX:+PrintFlagsFinal",
        "-Djava.security.egd=file:/dev/./urandom",
        "-Dvertx.logger-delegate-factory-class-name=io.vertx.core.logging.SLF4JLogDelegateFactory",
        "-Dlogback.configurationFile=/app/conf/logback.xml",
        f"-Dvertx-config-path={AzureEntryPoint.FINAL_CONFIG}",
    ]
    # Relative path: resolved against the process's current working directory.
    jar_file = f"{AzureEntryPoint.jar_name}-{AzureEntryPoint.jar_version}.jar"
    launch_command = ["java", *jvm_options, "-jar", jar_file]

    logging.info("-- starting java operator application")
    self.run_command(launch_command, separate_process=False)

def __wait_for_sidecar(self):
    """Block until the sidecar answers its ping endpoint, or give up.

    Polls ``http://169.254.169.254/ping`` (5 second request timeout). A 200
    or 204 response means the sidecar is up. Any other status, or any error
    raised while making the request, counts as a failed attempt.

    After a failed attempt the method sleeps and tries again, with the sleep
    growing by one second each time (1s, 2s, ... up to ``max_retries``
    seconds). Once ``max_retries`` retries have failed, the error is logged
    with its traceback and the process exits with status 1.
    """
    logging.info("Waiting for sidecar ...")

    url = "http://169.254.169.254/ping"
    max_retries = 15
    # Counted separately from the sleep length so the final log message
    # reports the real number of retries.
    retries = 0

    while True:
        try:
            response = requests.get(url, timeout=5)
            if response.status_code in (200, 204):
                logging.info("Sidecar started")
                return
            # Route an unexpected status through the same retry path as a
            # connection failure.
            raise RuntimeError(
                f"Unexpected status code: {response.status_code}, response: {response.text}"
            )
        except Exception as e:
            # Deliberately broad: any failure to reach the sidecar is retried.
            if retries >= max_retries:
                logging.error(
                    f"Sidecar failed to start after {retries} retries with error {e}",
                    exc_info=True,
                )
                sys.exit(1)
            retries += 1
            delay = retries  # linear back-off: the n-th retry waits n seconds
            logging.info(f"Sidecar not started. Retrying in {delay} seconds... {e}")
            time.sleep(delay)

def run_compute(self) -> None:
    """Main execution flow for confidential compute.

    Runs the startup steps strictly in order: environment-variable check,
    secret retrieval, environment name, final config creation and base-URL
    extraction. Configuration validation follows unless
    ``configs["skip_validations"]`` is truthy. Finally the method waits for
    the sidecar and starts the operator.
    """
    preparation_steps = (
        self.__check_env_variables,
        self._set_secret,
        self.__set_environment,
        self.__create_final_config,
        self.__set_baseurls,
    )
    for step in preparation_steps:
        step()

    validations_skipped = self.configs.get("skip_validations")
    if not validations_skipped:
        self.validate_configuration()

    self.__wait_for_sidecar()
    self.__run_operator()

def _setup_auxiliaries(self) -> None:
""" Sets up auxiliary processes required for confidential computing.

Intentionally a no-op in this class: nothing is started here.
"""
pass

def _validate_auxiliaries(self) -> None:
""" Validates auxiliary services are running.

Intentionally a no-op in this class: the sidecar readiness check is
performed by __wait_for_sidecar, which run_compute calls directly.
"""
pass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't __wait_for_sidecar be here?


if __name__ == "__main__":
    # Script entry point: configure logging, run the Azure startup flow, and
    # make sure a failed startup is visible to the container runtime.
    logging.basicConfig(level=logging.INFO)
    logging.info("Start AzureEntryPoint")
    try:
        operator = AzureEntryPoint()
        operator.run_compute()
    except ConfidentialComputeStartupException as e:
        logging.error(f"Failed starting up Azure Confidential Compute. Please checks the logs for errors and retry {e}", exc_info=True)
        # Exit non-zero: without this the process would end with status 0
        # after a failed startup and look like a clean shutdown.
        sys.exit(1)
    except Exception as e:
        logging.error(f"Unexpected failure while starting up Azure Confidential Compute. Please contact UID support team with this log {e}", exc_info=True)
        sys.exit(1)
Loading