diff --git a/README.md b/README.md index 7f49a31..34e52cf 100644 --- a/README.md +++ b/README.md @@ -19,16 +19,13 @@ reporting easier. Operational overview: -1. Python script `load_data.py`: +1. Django management command `load_data`: * downloads a zip clinical trials registry data from ClinicalTrials.gov - * converts the XML to JSON - * uploads it to BigQuery - * runs SQL to transform it to tabular format including fields to - indentify ACTs and their lateness - * downloads SQL as a CSV file + * transforms XML into a CSV file + * all of #2, `process_data` 2. Django management command `process_data`: - * imports CSV file into Django models + * imports existing CSV file into Django models * precomputes aggregate statistics and turns these into rankings * handles other metadata (in particular, hiding trials that are no longer ACTs) @@ -42,12 +39,9 @@ loaded into a staging database / website. A separate command copies new data from staging to production (following moderation). -Much complex logic has been expressed in SQL, which makes it hard to read -and test. This is a legacy of splitting the development between -academics with the domain expertise (and who could use SQL to -prototype) and software engineers. Now the project has been running -for a while and new development interations are less frequent, a useful -project would be as much of this logic to Python. +In the past, the import process computed and filtered data in SQL via +the BigQuery service and some JSON processing, but that is largely gone. +Traces of this may still be visible in the code. Similarly, the only reason step (1) exists is to create a CSV which can be imported to the database. That CSV is useful in its own right @@ -56,12 +50,6 @@ intermediate formats that could legitimately be dropped in a refactored solution (and the CSV could be generated directly from the database). 
-The historic reason for the XML -> JSON route is because BigQuery -includes a number of useful JSON functions which can be manipulated by -people competent in SQL. At the time of writing, there -is [an open issue](https://github.com/ebmdatalab/clinicaltrials-act-tracker/issues/121) with -some ideas about refactoring this process. - Static Pages ============ diff --git a/clinicaltrials/common/utils.py b/clinicaltrials/common/utils.py index 598d5c6..229006c 100644 --- a/clinicaltrials/common/utils.py +++ b/clinicaltrials/common/utils.py @@ -2,8 +2,10 @@ import django.core.exceptions -# Originally taken from openprescribing -def get_env_setting(setting, default=None): +Unset = object() # Explicit default value None is not the same as lacking a default. + + +def get_env_setting(setting, default=Unset): """ Get the environment setting. Return the default, or raise an exception if none supplied @@ -11,8 +13,8 @@ def get_env_setting(setting, default=None): try: return os.environ[setting] except KeyError: - if default: - return default - else: + if default is Unset: error_msg = "Set the %s env variable" % setting raise django.core.exceptions.ImproperlyConfigured(error_msg) + + return default diff --git a/clinicaltrials/frontend/management/commands/load_data.py b/clinicaltrials/frontend/management/commands/load_data.py index fb66532..46a51e4 100644 --- a/clinicaltrials/frontend/management/commands/load_data.py +++ b/clinicaltrials/frontend/management/commands/load_data.py @@ -58,7 +58,7 @@ def download_and_extract(): # download and extract container = tempfile.mkdtemp( - prefix=settings.STORAGE_PREFIX.rstrip(os.sep), dir=settings.WORKING_VOLUME) + prefix=settings.STORAGE_PREFIX, dir=settings.WORKING_VOLUME) try: data_file = os.path.join(container, "data.zip") wget_file(data_file, url) @@ -76,7 +76,7 @@ def upload_to_cloud(): client = StorageClient() bucket = client.get_bucket() blob = bucket.blob( - "{}{}".format(settings.STORAGE_PREFIX, raw_json_name()), + 
"{}/{}".format(settings.STORAGE_PREFIX, raw_json_name()), chunk_size=1024*1024 ) with open(os.path.join(settings.WORKING_DIR, raw_json_name()), 'rb') as f: @@ -89,7 +89,11 @@ def notify_slack(message): # Set the webhook_url to the one provided by Slack when you create # the webhook at # https://my.slack.com/services/new/incoming-webhook/ - webhook_url = os.environ['SLACK_GENERAL_POST_KEY'] + webhook_url = settings.SLACK_GENERAL_POST_KEY + + if webhook_url is None: + return + slack_data = {'text': message} response = requests.post(webhook_url, json=slack_data) @@ -164,7 +168,7 @@ def convert_and_download(): wait_for_job(job) - t1_exporter = TableExporter(tmp_table, settings.STORAGE_PREFIX + 'test_table-') + t1_exporter = TableExporter(tmp_table, settings.STORAGE_PREFIX + '/' + 'test_table-') t1_exporter.export_to_storage() with open(settings.INTERMEDIATE_CSV_PATH, 'w') as f: @@ -172,6 +176,8 @@ def get_env(path): + """Terrible hack to bridge using env vars to having settings files.""" + if not path: return {} env = os.environ.copy() with open(path) as e: for k, v in re.findall(r"^export ([A-Z][A-Z0-9_]*)=(\S*)", e.read(), re.MULTILINE): @@ -184,7 +190,7 @@ def process_data(): try: subprocess.check_output( [ - "{}python".format(settings.PROCESSING_VENV_BIN), + sys.executable, "{}/manage.py".format(settings.BASE_DIR), "process_data", "--input-csv={}".format(settings.INTERMEDIATE_CSV_PATH), diff --git a/clinicaltrials/frontend/management/commands/process_data.py b/clinicaltrials/frontend/management/commands/process_data.py index 5af0eef..68f7be4 100644 --- a/clinicaltrials/frontend/management/commands/process_data.py +++ b/clinicaltrials/frontend/management/commands/process_data.py @@ -109,22 +109,21 @@ def set_qa_metadata(trial): def _compute_ranks(): - sql = ("WITH ranked AS (SELECT date, ranking.id, RANK() OVER (" - " PARTITION BY date " - "ORDER BY percentage DESC" - ") AS computed_rank " - "FROM frontend_ranking ranking WHERE 
percentage IS NOT NULL " "AND date = %s" ") ") sql += ("UPDATE " " frontend_ranking " "SET " " rank = ranked.computed_rank " "FROM ranked " "WHERE ranked.id = frontend_ranking.id AND ranked.date = frontend_ranking.date") sql = """ + UPDATE frontend_ranking + SET rank = ( + SELECT + count(*) + FROM + frontend_ranking AS f + WHERE + f.percentagefaiv%ua8Uep)ai-zi!thee2xee8a') # SECURITY WARNING: don't run with debug turned on in production! -CLINICALTRIALS_DEBUG = common.utils.get_env_setting('CLINICALTRIALS_DEBUG') +CLINICALTRIALS_DEBUG = get_env_setting('CLINICALTRIALS_DEBUG', 'yes') assert CLINICALTRIALS_DEBUG in ['yes', 'no'], "CLINICALTRIALS_DEBUG was '{}'".format(CLINICALTRIALS_DEBUG) DEBUG = CLINICALTRIALS_DEBUG == 'yes' -ALLOWED_HOSTS = [ - 'staging-fdaaa.ebmdatalab.net', '127.0.0.1', '192.168.0.55', 'localhost', - 'fdaaa.trialstracker.net'] +ALLOWED_HOSTS = ['fdaaa.trialstracker.net'] +if DEBUG: + ALLOWED_HOSTS.extend(['staging-fdaaa.ebmdatalab.net', '127.0.0.1', '192.168.0.55', 'localhost']) + +if DEBUG: + DEFAULT_DB_NAME = "storage.sqlite3" + DEFAULT_DB_USER = None + DEFAULT_DB_PASSWORD = None + DEFAULT_DB_HOST = None + DEFAULT_DB_ENGINE = 'django.db.backends.sqlite3' +else: + DEFAULT_DB_NAME = Unset + DEFAULT_DB_USER = Unset + DEFAULT_DB_PASSWORD = Unset + DEFAULT_DB_HOST = 'localhost' + DEFAULT_DB_ENGINE = 'django.db.backends.postgresql_psycopg2' # Parameters -GOOGLE_TRACKING_ID = common.utils.get_env_setting('CLINICALTRIALS_GOOGLE_TRACKING_ID') +GOOGLE_TRACKING_ID = get_env_setting('CLINICALTRIALS_GOOGLE_TRACKING_ID', None if DEBUG else Unset) +SLACK_GENERAL_POST_KEY = get_env_setting('SLACK_GENERAL_POST_KEY', None if DEBUG else Unset) # Application definition @@ -181,16 +196,15 @@ WSGI_APPLICATION = 'frontend.wsgi.application' - # Database # https://docs.djangoproject.com/en/1.10/ref/settings/#databases DATABASES = { 'default': { - 'ENGINE': 'django.db.backends.postgresql_psycopg2', - 'NAME': 
common.utils.get_env_setting('CLINICALTRIALS_DB'), - 'USER': common.utils.get_env_setting('CLINICALTRIALS_DB_NAME'), - 'PASSWORD': common.utils.get_env_setting('CLINICALTRIALS_DB_PASS'), - 'HOST': 'localhost', + 'ENGINE': get_env_setting('CLINICALTRIALS_DB_ENGINE', DEFAULT_DB_ENGINE), + 'NAME': get_env_setting('CLINICALTRIALS_DB', DEFAULT_DB_NAME), + 'USER': get_env_setting('CLINICALTRIALS_DB_NAME', DEFAULT_DB_USER), + 'PASSWORD': get_env_setting('CLINICALTRIALS_DB_PASS', DEFAULT_DB_PASSWORD), + 'HOST': get_env_setting('CLINICALTRIALS_DB_HOST', DEFAULT_DB_HOST), 'CONN_MAX_AGE': 0 # Must be zero, see api/view_utils#db_timeout }, } @@ -263,16 +277,15 @@ # Twitter -TWITTER_CONSUMER_SECRET = common.utils.get_env_setting('TWITTER_CONSUMER_SECRET') -TWITTER_ACCESS_TOKEN_SECRET = common.utils.get_env_setting('TWITTER_ACCESS_TOKEN_SECRET') +TWITTER_CONSUMER_SECRET = get_env_setting('TWITTER_CONSUMER_SECRET', None if DEBUG else Unset) +TWITTER_ACCESS_TOKEN_SECRET = get_env_setting('TWITTER_ACCESS_TOKEN_SECRET', None if DEBUG else Unset) # Path to shell script of lines `export FOO=bar`. See environment-example for a sample. -PROCESSING_ENV_PATH = '/etc/profile.d/fdaaa_staging.sh' -PROCESSING_VENV_BIN = '/var/www/fdaaa_staging/venv/bin/' +PROCESSING_ENV_PATH = '/etc/profile.d/fdaaa_staging.sh' if not DEBUG else None PROCESSING_STORAGE_TABLE_NAME = 'current_raw_json' # Bucket in GCS to store data -STORAGE_PREFIX = 'clinicaltrials/' -WORKING_VOLUME = '/mnt/volume-lon1-01/' # should have at least 10GB space +STORAGE_PREFIX = 'trialdata' +WORKING_VOLUME = get_env_setting('WORKDIR', '/mnt/volume-lon1-01/' if not DEBUG else os.curdir) # should have at least 10GB space WORKING_DIR = os.path.join(WORKING_VOLUME, STORAGE_PREFIX) INTERMEDIATE_CSV_PATH = os.path.join(WORKING_VOLUME, STORAGE_PREFIX, 'clinical_trials.csv')