diff --git a/README.md b/README.md
index a7598c2..95cff4c 100644
--- a/README.md
+++ b/README.md
@@ -43,12 +43,13 @@ The simplest approach to running the application after having built it is to use
 $ docker compose up
 ```
 
+> [!NOTE]
+> If you change the code, you need to rebuild the local image with `docker compose build` to make the changes visible
+> to Docker Compose; `docker compose up` will then restart the containers that have a new image available.
+
 The complete Docker Compose setup runs the following services:
 1. Setup a Postgres database in a docker container and exposes it to `localhost` on port `5432` (service name `db`)
-2. Initialize the database schema and creates and configures access for the users required by the API (service name `bootstrap-db`)
-3. Performs a one time sync of the data available in the mainnet and testnet registries from Cardano Foundation and IOG (service name `sync-db-once`)
-4. Starts a service that syncs the repository in a fixed interval (service name `sync-db-cron`)
-5. Starts the actual Spring application that exposes the CIP-26 REST API and exposes it to `localhost` on port `8081` (service name `api`)
+2. Starts the actual Spring application that serves the CIP-26 REST API and exposes it to `localhost` on port `8081` (service name `api`)
 
 To test if the API is running query its health endpoint by executing:
 ```console
@@ -57,12 +58,8 @@ $ curl http://localhost:8081/actuator/health
 ```
 
 Have a look at the [.env file](./.env) for the various configuration options.
 
-If you ony need the database without starting the REST API or the data synced from the registries simply use the following command:
-```console
-$ docker compose up db bootstrap-db
-```
-
-At the moment the application needs a PostgreSQL database as a storage layer which might change in the future. You can use the [liquibase](https://www.liquibase.org/) database migration scripts provided in our [database folder](./database) to initialize this database.
+At the moment the application needs a PostgreSQL database as a storage layer, which might change in the future. Database schema migrations
+are managed by the `api` project using [Flyway](https://flywaydb.org/).
 
 ## Features
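
In practice, the rebuild-and-restart flow described in the note above comes down to two commands run from the repository root (a minimal sketch, assuming the default compose file shipped with the repository):

```console
$ docker compose build
$ docker compose up
```

Alternatively, `docker compose up --build` rebuilds the images and starts the containers in a single step.
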
diff --git a/api/src/main/resources/application.properties b/api/src/main/resources/application.properties
index d6acc9b..741dd27 100644
--- a/api/src/main/resources/application.properties
+++ b/api/src/main/resources/application.properties
@@ -25,5 +25,12 @@ spring.flyway.baselineOnMigrate=true
 spring.flyway.enabled=true
 spring.flyway.validateMigrationNaming=true
 
+# Git Configuration
+git.organization=${GITHUB_ORGANIZATION:cardano-foundation}
+git.projectName=${GITHUB_PROJECT_NAME:cardano-token-registry}
+git.mappingsFolder=${GITHUB_MAPPINGS_FOLDER:mappings}
+git.tmp.folder=${GITHUB_TMP_FOLDER:/tmp}
+git.forceClone=${GITHUB_FORCE_CLONE:false}
+
 # Github Token Metadata Sync
 token.metadata.job.enabled=${TOKEN_METADATA_SYNC_JOB:false}
diff --git a/common/src/main/java/org/cardanofoundation/tokenmetadata/registry/service/GitService.java b/common/src/main/java/org/cardanofoundation/tokenmetadata/registry/service/GitService.java
index 28e7782..53db132 100644
--- a/common/src/main/java/org/cardanofoundation/tokenmetadata/registry/service/GitService.java
+++ b/common/src/main/java/org/cardanofoundation/tokenmetadata/registry/service/GitService.java
@@ -28,34 +28,62 @@ public class GitService {
     private String mappingsFolderName;
     @Value("${git.tmp.folder:/tmp}")
     private String gitTempFolder;
+    @Value("${git.forceClone:false}")
+    private boolean forceClone;
 
     public Optional<File> cloneCardanoTokenRegistryGitRepository() {
         var gitFolder = getGitFolder();
-        if (gitFolder.exists()) {
+
+        boolean repoReady;
+        if (gitFolder.exists() && (forceClone || !isGitRepo())) {
+            log.info("Git folder exists and either forceClone is set or it is not a git repository; cloning from scratch");
             FileSystemUtils.deleteRecursively(gitFolder);
+            repoReady = cloneRepo();
+        } else if (gitFolder.exists() && isGitRepo()) {
+            log.info("Git folder exists and is a git repository; pulling the latest changes");
+            repoReady = pullRebaseRepo();
+        } else {
+            repoReady = cloneRepo();
         }
 
-        try {
+        if (repoReady) {
+            return Optional.of(getMappingsFolder());
+        } else {
+            return Optional.empty();
+        }
+    }
+
+    private boolean cloneRepo() {
+        try {
             var process = new ProcessBuilder()
-                    .directory(gitFolder.getParentFile())
+                    .directory(getGitFolder().getParentFile())
                     .command("sh", "-c", String.format("git clone https://github.com/%s/%s.git", organization, projectName))
                     .start();
-
             var exitCode = process.waitFor();
-
-            if (exitCode == 0) {
-                return Optional.of(getMappingsFolder());
-            } else {
-                return Optional.empty();
-            }
-
-
+            return exitCode == 0;
         } catch (Exception e) {
             log.warn(String.format("It was not possible to clone the %s project", projectName), e);
-            return Optional.empty();
+            return false;
         }
+    }
+
+    private boolean pullRebaseRepo() {
+        try {
+            var process = new ProcessBuilder()
+                    .directory(getGitFolder())
+                    .command("sh", "-c", "git pull --rebase")
+                    .start();
+            var exitCode = process.waitFor();
+            return exitCode == 0;
+        } catch (Exception e) {
+            log.warn("It was not possible to update the repo with git pull --rebase", e);
+            return false;
+        }
+    }
+
+    private boolean isGitRepo() {
+        return getGitFolder().toPath().resolve(".git").toFile().exists();
     }
 
     private File getGitFolder() {
diff --git a/gitsync/.dockerignore b/gitsync/.dockerignore
deleted file mode 100644
index f2d4b96..0000000
--- a/gitsync/.dockerignore
+++ /dev/null
@@ -1 +0,0 @@
-__pycache__*
\ No newline at end of file
diff --git a/gitsync/Dockerfile.sync-cron b/gitsync/Dockerfile.sync-cron
deleted file mode 100644
index 19ac36e..0000000
--- a/gitsync/Dockerfile.sync-cron
+++ /dev/null
@@ -1,38 +0,0 @@
-FROM python:3.9-slim-bullseye
-
-ARG DB_URL
-ARG DB_USER_SECRET
-ARG DB_USER_NAME
-ARG TOKEN_REGISTRY_REPOSITORY_URL
-ARG TOKEN_REGISTRY_BRANCH_NAME
-ARG MAPPINGS_FOLDER
-ARG TESTNET_TOKEN_REGISTRY_REPOSITORY_URL
-ARG TESTNET_TOKEN_REGISTRY_BRANCH_NAME
-ARG TESTNET_MAPPINGS_FOLDER
-
-RUN apt update && apt upgrade
-RUN apt install -y git cron build-essential libpq-dev
-RUN git config --global pull.rebase false
-WORKDIR /cf-gitsync-job/
-COPY . /cf-gitsync-job/
-
-RUN echo "#!/bin/bash" >> /cf-gitsync-job/.env
-RUN echo ". /etc/profile" >> /cf-gitsync-job/.env
-RUN echo ". ~/.bashrc" >> /cf-gitsync-job/.env
-RUN echo "DB_URL=${DB_URL}" >> /cf-gitsync-job/.env
-RUN echo "DB_USER_SECRET=${DB_USER_SECRET}" >> /cf-gitsync-job/.env
-RUN echo "DB_USER_NAME=${DB_USER_NAME}" >> /cf-gitsync-job/.env
-RUN echo "TOKEN_REGISTRY_REPOSITORY_URL=${TOKEN_REGISTRY_REPOSITORY_URL}" >> /cf-gitsync-job/.env
-RUN echo "TOKEN_REGISTRY_BRANCH_NAME=${TOKEN_REGISTRY_BRANCH_NAME}" >> /cf-gitsync-job/.env
-RUN echo "MAPPINGS_FOLDER=${MAPPINGS_FOLDER}" >> /cf-gitsync-job/.env
-RUN echo "TESTNET_TOKEN_REGISTRY_REPOSITORY_URL=${TESTNET_TOKEN_REGISTRY_REPOSITORY_URL}" >> /cf-gitsync-job/.env
-RUN echo "TESTNET_TOKEN_REGISTRY_BRANCH_NAME=${TESTNET_TOKEN_REGISTRY_BRANCH_NAME}" >> /cf-gitsync-job/.env
-RUN echo "TESTNET_MAPPINGS_FOLDER=${TESTNET_MAPPINGS_FOLDER}" >> /cf-gitsync-job/.env
-RUN echo "GITSYNC_WORKING_DIR=/cf-gitsync-job" >> /cf-gitsync-job/.env
-
-RUN pip install -r "requirements.txt"
-
-# Add the cron job that executes the sync every 2 hours
-RUN crontab -l | { cat; echo "* */2 * * * BASH_ENV=/cf-gitsync-job/.env /cf-gitsync-job/sync_from_github.sh >> /var/log/gitsync.log 2>&1"; } | crontab -
-
-CMD ["cron", "-f"]
diff --git a/gitsync/Dockerfile.sync-once b/gitsync/Dockerfile.sync-once
deleted file mode 100644
index 0ef5ee5..0000000
--- a/gitsync/Dockerfile.sync-once
+++ /dev/null
@@ -1,11 +0,0 @@
-FROM python:3.9-slim-bullseye
-
-RUN apt update && apt upgrade
-RUN apt install -y git build-essential libpq-dev
-RUN git config --global pull.rebase false
-WORKDIR /cf-gitsync-job/
-COPY . 
/cf-gitsync-job/ - -RUN pip install -r "requirements.txt" - -ENTRYPOINT ["/bin/bash", "/cf-gitsync-job/sync_from_github.sh"] diff --git a/gitsync/daos.py b/gitsync/daos.py deleted file mode 100644 index b25855f..0000000 --- a/gitsync/daos.py +++ /dev/null @@ -1,44 +0,0 @@ -from sqlalchemy import Column, Integer, String, DateTime, Text -from sqlalchemy.dialects.postgresql import JSONB -from sqlalchemy.ext.declarative import declarative_base - -Base = declarative_base() - - -class AbstractMetadataDAO(Base): - __abstract__ = True - - subject = Column(String(255), primary_key=True) - source = Column(String(255), primary_key=True) - policy = Column(Text) - name = Column(String(255)) - ticker = Column(String(32)) - url = Column(String(255)) - description = Column(Text) - decimals = Column(Integer) - updated = Column(DateTime) - updated_by = Column(String(255)) - properties = Column(JSONB) - - -class AbstractLogoDAO(Base): - __abstract__ = True - - subject = Column(String(255), primary_key=True) - source = Column(String(255), primary_key=True) - logo = Column(Text) - - -class SyncControlDAO(Base): - __tablename__ = 'sync_control' - - lock = Column(String(1), primary_key=True) - registry_hash = Column(String(64)) - updated = Column(DateTime) - - -def get_metadata_table_model_instance(tablename: str): - return type(f'MetadataDAO_{tablename}', (AbstractMetadataDAO,), {'__tablename__': tablename}) - -def get_logo_table_model_instance(tablename: str): - return type(f'LogoDAO_{tablename}', (AbstractLogoDAO,), {'__tablename__': tablename}) diff --git a/gitsync/populate_data.py b/gitsync/populate_data.py deleted file mode 100755 index 00a3f56..0000000 --- a/gitsync/populate_data.py +++ /dev/null @@ -1,302 +0,0 @@ -import argparse -from datetime import datetime -import json -import logging -import os -import subprocess -import timeit -import time -from dateutil import parser -from sqlalchemy import create_engine -from sqlalchemy.engine import Engine -from sqlalchemy.orm import Session -from sqlalchemy.sql import text -from daos import get_metadata_table_model_instance, get_logo_table_model_instance, SyncControlDAO - -# process files ins batches of defined -MAX_PROCESSED_FILE_SIZE_OF_SINGLE_BATCH_BYTES = 1024 * \ - 1024 * int(os.getenv('BATCH_SIZE_MB', '64')) - -argparser = argparse.ArgumentParser(description='Process some integers.') -argparser.add_argument('--verbose', type=str, const='true', default='false', - nargs='?', help='verbose output enabled. default false') -argparser.add_argument('--source', type=str, required=True, - nargs='?', help='the name of the source for this sync') -argparser.add_argument('--dburl', type=str, required=False, - nargs='?', help='the database url containing host, port and the database name') -argparser.add_argument('--dbuser', type=str, required=False, - help='the username used to access the database') -argparser.add_argument('--dbsecret', type=str, required=False, - help='the password used to access the database') -argparser.add_argument('--mappings', type=str, required=True, - help='the folder containing the mapping files') - -args = argparser.parse_args() - -if args.verbose.lower() == 'true': - logging.basicConfig(level=logging.DEBUG) -else: - logging.basicConfig(level=logging.INFO) - - -def create_db_engine(user: str, password: str, url: str) -> Engine: - """ Create a SQLAlchemy Engine object based on the given connection parameters. 
- The connection string will be postgresql+psycopg2://user:password@host:port/dbname - - Args: - user (str): Username for database connection. - password (str): Password for database connection. - url (str): The database url. - - Returns: - Engine: A SQLAlchemy Engine object holding the connection to the database. - """ - logging.info( - 'Trying to connect to postgresql+psycopg2://%s:*****@%s', user, url) - return create_engine(f'postgresql+psycopg2://{user}:{password}@{url}') - - -def validate_mappings_file_contents(mapping: dict) -> bool: - """ Check if an entry is valid. - Check the minimum validity requirements of an entry. This is looser than defined withint CIP-26. Further verification can be done in the downstream processing. - - Args: - mapping (dict): A dict representing the contents of the metadata json file. - - Returns: - True if the entry is considered valid, False otherwise. - """ - if not 'subject' in mapping: - logging.warning('No subject given.') - return False - if 'subject' in mapping: - try: - int(mapping['subject'], 16) - except Exception: - logging.warning('Subject is not hex') - return False - if len(mapping['subject']) < 56 or len(mapping['subject']) % 2 != 0: - logging.warning( - 'Subject too short (less than 28 bytes/56 chars) or has an odd number of characters. %d', len(mapping['subject'])) - return False - if 'policy' in mapping and not mapping['policy'] is None: - try: - int(mapping['policy'], 16) - except Exception: - logging.warning('Policy is not hex') - return False - if len(mapping['policy']) % 2 != 0: - logging.warning( - 'Policy has an odd number of characters. %d', len(mapping['policy'])) - return False - if 'name' in mapping and 'value' in mapping['name'] and len(mapping['name']['value']) > 255: - logging.warning('name property too long') - return False - if 'url' in mapping and 'value' in mapping['url'] and len(mapping['url']['value']) > 255: - logging.warning('url property too long') - return False - if 'ticker' in mapping and 'value' in mapping['ticker'] and len(mapping['ticker']['value']) > 32: - logging.warning('ticker property too long') - return False - - return True - - -def parse_mappings_file_to_dicts(mapping: dict, source: str, author: str, updated: datetime) -> tuple: - """Parses a json file containing metadata mappings - - Args: - mapping (dict): the dict representing the loaded json file - source (str): the name of the data source for the sync - author (str): the author of the latest github commit related to that entry - updated (datetime): the date and time when this entry was modified the last time - - Raises: - ValueError: raised on invalid input - - Returns: - tuple: a dict containing the json data reformatted for insertion into the target database. first element is the subjects metadata, second one the according logo if any. 
- """ - metadata = [] - logo = [] - if validate_mappings_file_contents(mapping): - subject = mapping['subject'] - metadata.append({ - 'subject': subject, - 'source': source, - 'policy': mapping['policy'] if 'policy' in mapping else None, - 'name': mapping['name']['value'] if 'name' in mapping and 'value' in mapping['name'] else None, - 'ticker': mapping['ticker']['value'] if 'ticker' in mapping and 'value' in mapping['ticker'] else None, - 'url': mapping['url']['value'] if 'url' in mapping and 'value' in mapping['url'] else None, - 'description': mapping['description']['value'] if 'description' in mapping and 'value' in mapping['description'] else None, - 'decimals': mapping['decimals']['value'] if 'decimals' in mapping and 'value' in mapping['decimals'] else None, - 'updated': updated, - 'updated_by': author, - 'properties': mapping - }) - if 'logo' in mapping: - if 'value' in mapping['logo']: - logo.append({ - 'subject': subject, - 'source': source, - 'logo': mapping['logo']['value'] - }) - else: - logging.warning( - 'Logo with no value field within subject %s and source %s.', subject, source) - else: - raise ValueError('Invalid metadata properties.') - - return metadata, logo - - -def populate_data(db_engine: Engine, mappings_folder_path: str, source: str): - """ Iterate over the given mapping_folder_path and process each json file that might contain metadata information. - - Args: - db_engine (Engine): The SQLAlchemy Engine object used to connect to the database. - mappings_folder_path (str): The path of the folder containing the metadata json files. - """ - try: - db_session = Session(db_engine) - - sync_control_data = db_session.get(SyncControlDAO, 'X') - git_commit_hash = subprocess.Popen(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE, - stderr=subprocess.PIPE, cwd=mappings_folder_path).stdout.read().decode('utf-8').strip() - if sync_control_data and sync_control_data.registry_hash == git_commit_hash: - logging.info( - 'No updates since last sync. Latest commit hash is %s. 
Last update was %s', git_commit_hash, str(sync_control_data.updated)) - return - - truncated_current_timestamp = str(int(time.time()))[-6:] - temp_logo_table_name = f'tmp_{truncated_current_timestamp}_logo' - temp_metadata_table_name = f'tmp_{truncated_current_timestamp}_metadata' - tempLogoDaoModelInstance = get_logo_table_model_instance( - temp_logo_table_name) - tempMetadataDaoModelInstance = get_metadata_table_model_instance( - temp_metadata_table_name) - logging.info('Temp table names are %s and %s', - temp_logo_table_name, temp_metadata_table_name) - start_time = timeit.default_timer() - logging.info('Preparing upload %s ...', mappings_folder_path) - db_session.execute( - text(f'CREATE TEMPORARY TABLE "{temp_logo_table_name}" (LIKE "logo");')) - db_session.execute( - text(f'CREATE TEMPORARY TABLE "{temp_metadata_table_name}" (LIKE "metadata");')) - - # iterate over each file - metadata = [] - logos = [] - total_processed_file_size = 0 - files_processed = 0 - storage_processed = 0 - skipped_files = [] - subjects_seen = set() - - for file in os.listdir(mappings_folder_path): - try: - filename = os.fsdecode(file) - if filename.endswith('.json'): - file_path = os.path.join(mappings_folder_path, file) - - # get git metadata - git_metadata = subprocess.Popen(['git', 'log', '-n 1', '--date-order', '--no-merges', '--pretty=format:%aE#-#%aI', filename], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=mappings_folder_path).stdout.read().decode('utf-8').split('#-#') - - with open(file_path, 'r') as mappings_file: - files_processed += 1 - file_size_bytes = os.stat(file_path).st_size - total_processed_file_size += file_size_bytes - storage_processed += file_size_bytes - mappings_content = json.load(mappings_file) - if 'subject' in mappings_content: - if not mappings_content['subject'] in subjects_seen: - subjects_seen.add(mappings_content['subject']) - meta, logo = parse_mappings_file_to_dicts( - mappings_content, source, git_metadata[0], parser.isoparse(git_metadata[1])) - metadata.extend(meta) - logos.extend(logo) - - if total_processed_file_size >= MAX_PROCESSED_FILE_SIZE_OF_SINGLE_BATCH_BYTES: - if len(metadata) > 0: - db_session.bulk_insert_mappings( - tempMetadataDaoModelInstance, metadata) - if len(logos) > 0: - db_session.bulk_insert_mappings( - tempLogoDaoModelInstance, logos) - total_processed_file_size = 0 - metadata = [] - logos = [] - else: - logging.warning('Duplicate subject (%s) detected within file (%s)', mappings_content['subject'], file) - skipped_files.append(file) - else: - logging.warning('Skipping file because there is no subject property. %s', file) - skipped_files.append(file) - except ValueError as exc: - logging.warning( - 'Invalid metadata mappings file %s. Will be skipped.', file) - logging.exception(exc) - if mappings_content: - logging.debug(mappings_content) - skipped_files.append(file) - except Exception as exc: - logging.warning( - 'Error during parsing of file %s. 
Will be skipped.', file) - logging.exception(exc) - if len(metadata) > 0: - db_session.bulk_insert_mappings( - tempMetadataDaoModelInstance, metadata) - if len(logos) > 0: - db_session.bulk_insert_mappings(tempLogoDaoModelInstance, logos) - - db_session.commit() - logging.info( - 'Duration of data preparation was %s seconds', "{:.2f}".format(timeit.default_timer() - start_time)) - - start_time = timeit.default_timer() - logging.info('Truncating existing data ...') - db_session.execute(text(f'DELETE from "logo" where "source" = \'{source}\';')) - db_session.execute(text(f'DELETE from "metadata" where "source" = \'{source}\';')) - - logging.info( - 'Inserting new data from folder %s ...', mappings_folder_path) - db_session.execute( - text(f'INSERT INTO "metadata"("subject", "source", "policy", "name", "ticker", "url", "description", "decimals", "updated", "updated_by", "properties") SELECT "subject", "source", "policy", "name", "ticker", "url", "description", "decimals", "updated", "updated_by", "properties" FROM "{temp_metadata_table_name}";')) - db_session.execute( - text(f'INSERT INTO "logo" SELECT * FROM "{temp_logo_table_name}";')) - - db_session.commit() - - if sync_control_data: - sync_control_data.registry_hash = git_commit_hash - sync_control_data.updated = datetime.utcnow() - db_session.commit() - else: - sync_control_data = SyncControlDAO() - sync_control_data.lock = 'X' - sync_control_data.registry_hash = git_commit_hash - sync_control_data.updated = datetime.utcnow() - db_session.add(sync_control_data) - db_session.commit() - - db_session.close() - logging.info('Duration of data recreation was %s seconds', - "{:.2f}".format(timeit.default_timer() - start_time)) - logging.info('Done processing %s files containing %s MB of data. Skipped %d files.', - files_processed, "{:.2f}".format(storage_processed/1024/1024), len(skipped_files)) - logging.debug('Skipped files: %s', str(skipped_files)) - except Exception as exc: - logging.error('Could not insert metadata into database.') - logging.exception(exc) - if db_session: - db_session.close() - - -if __name__ == '__main__': - db_engine = create_db_engine(args.dbuser, args.dbsecret, args.dburl) - - logging.info('Source specified is %s', args.source) - populate_data(db_engine, args.mappings, args.source) - - db_engine.dispose() diff --git a/gitsync/requirements.txt b/gitsync/requirements.txt deleted file mode 100644 index 9ed444c..0000000 --- a/gitsync/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -SQLAlchemy==2.0.7 -psycopg2==2.9.5 -python-dateutil==2.8.2 \ No newline at end of file diff --git a/gitsync/sync_from_github.sh b/gitsync/sync_from_github.sh deleted file mode 100755 index 1462fea..0000000 --- a/gitsync/sync_from_github.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash - -# change to working directory -if [[ -z "${GITSYNC_WORKING_DIR}" ]]; then - echo "Remaining in current directory ..." -else - echo "Changing to working directory ..." - cd "${GITSYNC_WORKING_DIR}" -fi - -# populate mainnet registry data -if [[ -z "${TOKEN_REGISTRY_REPOSITORY_URL}" ]]; then - echo "No mainnet registry source url specified." -else - # checkout the repo and target branch - CLONE_FOLDER="registry-data" - echo "Populating data from Github into the database ..." - git clone ${TOKEN_REGISTRY_REPOSITORY_URL} "${CLONE_FOLDER}" - cd "${CLONE_FOLDER}" - git checkout ${TOKEN_REGISTRY_BRANCH_NAME} - git pull - pwd - cd .. 
- - # call the database sync job - python populate_data.py --source="mainnet" --dburl ${DB_URL} --dbuser ${DB_USER_NAME} --dbsecret ${DB_USER_SECRET} --mappings "/cf-gitsync-job/${CLONE_FOLDER}/${MAPPINGS_FOLDER}" -fi - -# populate testnet registry data (whereas testnet means "not mainnet") -if [[ -z "${TESTNET_TOKEN_REGISTRY_REPOSITORY_URL}" ]]; then - echo "No testnet registry source url specified." -else - #checkout the testnet repo and target branch - CLONE_FOLDER="registry-data-testnet" - echo "Populating data from Github into the database ..." - git clone ${TESTNET_TOKEN_REGISTRY_REPOSITORY_URL} "${CLONE_FOLDER}" - cd "${CLONE_FOLDER}" - git checkout ${TESTNET_TOKEN_REGISTRY_BRANCH_NAME} - git pull - pwd - cd .. - - # call the database sync job for the testnet repository data - python populate_data.py --source="testnet" --dburl ${DB_URL} --dbuser ${DB_USER_NAME} --dbsecret ${DB_USER_SECRET} --mappings "/cf-gitsync-job/${CLONE_FOLDER}/${TESTNET_MAPPINGS_FOLDER}" -fi
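
With the Python `gitsync` job and its Dockerfiles removed, the periodic registry sync now runs inside the `api` service itself: `GitService` clones `https://github.com/<git.organization>/<git.projectName>.git` into `git.tmp.folder`, or runs `git pull --rebase` on an existing checkout unless `git.forceClone` is set, and returns the mappings folder for the sync job to process. The sketch below lists the environment variables introduced in `application.properties` above, with their default values; whether and how they are passed into the container depends on your `.env` file and compose service definition, which this patch does not show.

```properties
# Enable the in-process token metadata sync job (token.metadata.job.enabled)
TOKEN_METADATA_SYNC_JOB=true
# Repository to clone or pull: https://github.com/${GITHUB_ORGANIZATION}/${GITHUB_PROJECT_NAME}.git
GITHUB_ORGANIZATION=cardano-foundation
GITHUB_PROJECT_NAME=cardano-token-registry
# Folder inside the repository that contains the mapping JSON files
GITHUB_MAPPINGS_FOLDER=mappings
# Local working directory the repository is cloned into
GITHUB_TMP_FOLDER=/tmp
# Delete any existing checkout and clone from scratch instead of pulling
GITHUB_FORCE_CLONE=false
```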