From 818019303245da364ed0eb9918ee315c32f1df88 Mon Sep 17 00:00:00 2001 From: Jason Baumohl Date: Fri, 17 Nov 2023 23:35:34 +0000 Subject: [PATCH 01/11] adding adam categories for weekly app categories --- README.md | 16 ++++- bin/dump_weekly_ADAM_app_categories.sh | 3 + source/custom_scripts/dump_query_results.py | 5 ++ .../dump_weekly_ADAM_app_categories.py | 56 +++++++++++++++++++ 4 files changed, 76 insertions(+), 4 deletions(-) create mode 100755 bin/dump_weekly_ADAM_app_categories.sh create mode 100644 source/custom_scripts/dump_weekly_ADAM_app_categories.py diff --git a/README.md b/README.md index 283a7d5..a612fcd 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,15 @@ source/daily/upload_public_narratives_count.py source/daily/make_reporting_tables.py +------------------- +Within the logstash Dockerfile there is: +https://github.com/kbase/logstash/blob/41778da1238129a65296bdddcb6ff26e9c694779/Dockerfile#L24-L29 +The rm at the end is, I believe, just cleaning up after itself. This was set up by Steve for Cheyenne's work. +This is used by this code: +https://github.com/kbase/metrics/blob/master/source/daily_cron_jobs/methods_upload_elasticsearch_sumrydicts.py + + + ------------------- CRON Jobs are run from mysql-metrics @@ -53,10 +62,10 @@ There are nightly CRON jobs that get run are located in bin/master_cron_shell.sh which runs scripts from the source/daily directory Then there are also monthly CRON jobs that get run are located in bin/upload_workspace_stats.sh -It used to be workspaces (user info needed first for FK potential issues), but now it also conatins scripts for -DOI metrics.) +It used to be just workspaces (user info needed first because of potential FK issues); it now runs scripts from the source/monthly directory. +There is also a doi_monthly CRON job for Credit Engine, located in bin/upload_doi_metrics.sh These create Logs to keep track of (note nightly metrics is calling master_cron_shell 01 17 * * * /root/metrics/nightly_metrics.sh >>/mnt/metrics_logs/crontab_nightly 2>&1 @@ -64,12 +73,11 @@ These create Logs to keep track of (note nightly metrics is calling master_cron_ 01 0 15 * * /root/metrics/monthly_metrics.sh >>/mnt/metrics_logs/crontab_doi_monthly 2>&1 01 07 * * * /root/metrics/nightly_errorlogs.sh >>/mnt/metrics_logs/crontab_errorlogs 2>&1 -From Docker03 the logs can be checked by going doing the following. (Note no y at end of monthly) +From Docker03 the logs can be checked by doing the following. cat /mnt/nfs3/data1/metrics/crontab_logs/crontab_nightly cat /mnt/nfs3/data1/metrics/crontab_logs/crontab_monthly cat /mnt/nfs3/data1/metrics/crontab_logs/crontab_doi_monthly - Can also confirm things ran by looking in the database (if not need to do backfills).
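+For the nightly jobs a quick sanity check is to look at the most recent load date in one of the daily tables
+(a sketch; point it at whichever table the job you care about loads, metrics.session_info is just one example):
+select max(record_date) from metrics.session_info;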
Example: (should be first of each month) select DATE_FORMAT(`record_date`,'%Y-%m') as narrative_cron_month, count(*) as narrative_count from metrics.workspaces ws group by narrative_cron_month; diff --git a/bin/dump_weekly_ADAM_app_categories.sh b/bin/dump_weekly_ADAM_app_categories.sh new file mode 100755 index 0000000..70bb542 --- /dev/null +++ b/bin/dump_weekly_ADAM_app_categories.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +python custom_scripts/dump_weekly_ADAM_app_categories.py diff --git a/source/custom_scripts/dump_query_results.py b/source/custom_scripts/dump_query_results.py index 1aa32ea..ab8486c 100644 --- a/source/custom_scripts/dump_query_results.py +++ b/source/custom_scripts/dump_query_results.py @@ -88,6 +88,11 @@ def dump_query_results(): # order by avg_hours_active desc, session_count, total_hours_active") #print("username\tsession_count\ttotal_hours_active\tavg_hours_active\tstd_hours_active\tfirst_seen\tlast_seen") + # Custom apps updates for RSV +# query = ("select app_name, git_commit_hash, min(finish_date) as first_run_date from user_app_usage \ +# group by app_name, git_commit_hash having first_run_date > '2021-01-01'") +# print("appname\tgit_commit_hash\tfirst_run_date") + #Blobstore cumulative sizes over users # query = ("select sum(total_size) as blobstore_size, bs.username from blobstore_stats bs \ # group by username order by blobstore_size") diff --git a/source/custom_scripts/dump_weekly_ADAM_app_categories.py b/source/custom_scripts/dump_weekly_ADAM_app_categories.py new file mode 100644 index 0000000..3fc67ae --- /dev/null +++ b/source/custom_scripts/dump_weekly_ADAM_app_categories.py @@ -0,0 +1,56 @@ +#!/usr/local/bin/python + +import os +import mysql.connector as mysql + +metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"] +sql_host = os.environ["SQL_HOST"] +metrics = os.environ["QUERY_ON"] + + +def dump_weekly_app_categories(): + # Dumps the weekly app catagory users report used in the quarterly report + + # connect to mysql + db_connection = mysql.connect( + host=sql_host, # "mysql1", #"localhost", + user="metrics", # "root", + passwd=metrics_mysql_password, + database="metrics", # "datacamp" + ) + + cursor = db_connection.cursor() + query = "use " + metrics + cursor.execute(query) + + # CHANGE QUERY HERE + query = ("select * from metrics_reporting.app_category_unique_users_weekly") + query = ("select in_query.week_run, in_query.master_category, count(*) as unique_users " + "from (select distinct DATE_FORMAT(`finish_date`,'%Y-%u') as week_run, " + "IFNULL(master_category,'None') as master_category, uau.username " + "from metrics.user_app_usage uau inner join " + "metrics.user_info ui on uau.username = ui.username " + "left outer join " + "metrics.adams_app_name_category_map anc on uau.app_name = anc.app_name " + "where ui.kb_internal_user = 0 " + "and func_name != 'kb_gtdbtk/run_kb_gtdbtk') as in_query " + "group by in_query.week_run, in_query.master_category;") + # CHANGE COLUMN HEADERS HERE TO MATCH QUERY HEADERS + print("week_run\tmaster_category\tunique_users") + + cursor.execute(query) + row_values = list() + + for row_values in cursor: + temp_string = "" + for i in range(len(row_values) - 1): + if row_values[i] is not None: + temp_string += str(row_values[i]) + temp_string += "\t" + if row_values[-1] is not None: + temp_string += str(row_values[-1]) + print(temp_string) + return 1 + + +dump_weekly_app_categories() From e036d3801a28379f1444cc1ad088093632c1c864 Mon Sep 17 00:00:00 2001 From: Jason Baumohl Date: Fri, 27 Sep 2024 02:31:46 +0000 Subject: 
[PATCH 02/11] blobstore_reports --- .../daily_cron_jobs/make_reporting_tables.py | 63 ++++++++++++++- .../sql_create_statements.txt | 21 ++++- .../sql_reporting_views_and_tables.sql | 77 ++++++++++++++++++- 3 files changed, 158 insertions(+), 3 deletions(-) diff --git a/source/daily_cron_jobs/make_reporting_tables.py b/source/daily_cron_jobs/make_reporting_tables.py index 49f112d..69f8024 100644 --- a/source/daily_cron_jobs/make_reporting_tables.py +++ b/source/daily_cron_jobs/make_reporting_tables.py @@ -340,10 +340,71 @@ def make_reporting_tables(): "where uip.exclude != 1 ") cursor.execute(user_super_summary_create_statement) print("user_super_summary_create_statement created") + + + # Blobstroe detial related tables + blobstore_detail_by_ws_create_statement = ( + "create or replace table blobstore_detail_by_ws as " + "(select in_q.ws_id, sum(in_q.orig_saver_count) as orig_saver_count, " + "sum(in_q.non_orig_saver_count) as non_orig_saver_count, " + "sum(in_q.orig_saver_size_GB) as orig_saver_size_GB, " + "sum(in_q.non_orig_saver_size_GB) as non_orig_saver_size_GB, " + "sum(in_q.total_blobstore_size_GB) as total_blobstore_size_GB " + "from (" + "select ws_id, DATE_FORMAT(`save_date`,'%Y-%m') as month, " + "sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, " + "sum(orig_saver * size)/1000000000 as orig_saver_size_GB, " + "0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, " + "sum(size)/1000000000 as total_blobstore_size_GB " + "from blobstore_detail bd " + "group by ws_id, month) in_q " + "group by ws_id ) ") + cursor.execute(blobstore_detail_by_ws_create_statement) + print("blobstore_detail_by_ws_create_statement created") + + blobstore_detail_by_user_monthly_create_statement = ( + "create or replace table blobstore_detail_by_user_monthly as " + "(select saver_username, DATE_FORMAT(`save_date`,'%Y-%m') as month, " + "sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, " + "sum(orig_saver * size)/1000000000 as orig_saver_size_GB, " + "0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, " + "sum(size)/1000000000 as total_blobstore_size_GB " + "from blobstore_detail bd " + "group by saver_username, month) ") + cursor.execute(blobstore_detail_by_user_monthly_create_statement) + print("blobstore_detail_by_user_monthly_create_statement created") - return + blobstore_detail_by_user_create_statement = ( + "create or replace table blobstore_detail_by_user as " + "(select saver_username, " + "sum(orig_saver_count) as orig_saver_count, sum(non_orig_saver_count) as non_orig_saver_count, " + "sum(orig_saver_size_GB) as orig_saver_size_GB, " + "sum(non_orig_saver_size_GB) as non_orig_saver_size_GB, " + "sum(total_blobstore_size_GB) as total_blobstore_size_GB " + "from blobstore_detail_by_user_monthly " + "group by saver_username) ") + cursor.execute(blobstore_detail_by_user_create_statement) + print("blobstore_detail_by_user_create_statement created") + + blobstore_detail_by_object_type_monthly_create_statement = ( + "create or replace table blobstore_detail_by_object_type_monthly as " + "(select LEFT(object_type,LOCATE('-',object_type) - 1) as object_type, " + "DATE_FORMAT(`save_date`,'%Y-%m') as month, " + "sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, " + "sum(orig_saver * size)/1000000000 as orig_saver_size_GB, " + "0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, " + "sum(size)/1000000000 as total_blobstore_size_GB " + 
"from blobstore_detail bd " + "group by object_type, month) ") + cursor.execute(blobstore_detail_by_object_type_monthly_create_statement) + print("blobstore_detail_by_object_type_monthly_create_statement created") + + + + return + import time import datetime diff --git a/sql_create_statements/sql_create_statements.txt b/sql_create_statements/sql_create_statements.txt index d48d982..48af83c 100644 --- a/sql_create_statements/sql_create_statements.txt +++ b/sql_create_statements/sql_create_statements.txt @@ -1,3 +1,4 @@ + --###################### --# user_info table create and indices. @@ -311,7 +312,7 @@ ON public_narrative_count(public_narrative_count,record_date); CREATE or replace TABLE metrics.session_info ( username VARCHAR(255) NOT NULL, record_date DATE NOT NULL, - ip_address VARCHAR(15) NOT NULL, + ip_address VARCHAR(40) NOT NULL, country_name VARCHAR(255) NOT NULL, country_code VARCHAR(3) NOT NULL, city VARCHAR(255) NOT NULL, @@ -827,3 +828,21 @@ CREATE TABLE `blobstore_detail` ( KEY `idx_bsd_objecttype` (`object_type`), CONSTRAINT `fk_bsd_username` FOREIGN KEY (`saver_username`) REFERENCES `user_info` (`username`) ON UPDATE CASCADE ) ENGINE=InnoDB DEFAULT CHARSET=utf8; + + +--################## +-- past narrative copies. Need to store, because other's do not want to touch the WS and correct the data. +-- stores the past narrative copy information that was able to be determined, before narrative copy/tracking was fixed/implemented. +--################## +CREATE TABLE `past_narrative_copies` ( + `source_narrative_id` int(15) NOT NULL, + `source_narrative_upa` varchar(255) NOT NULL, + `destination_narrative_id` int(15) NOT NULL, + `destination_narrative_upa` varchar(255) NOT NULL, + `destination_narrative_save_date` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', + UNIQUE KEY `uk_destination_narrative_id_pnc` (`destination_narrative_id`), + KEY `idx_source_narrative_id_pnc` (`source_narrative_id`), + KEY `idx_source_narrative_upa_pnc` (`source_narrative_upa`), + KEY `idx_destination_narrative_save_date_pnc` (`destination_narrative_save_date`) +) ENGINE=InnoDB; + diff --git a/sql_create_statements/sql_reporting_views_and_tables.sql b/sql_create_statements/sql_reporting_views_and_tables.sql index cf72a3a..1d5c016 100644 --- a/sql_create_statements/sql_reporting_views_and_tables.sql +++ b/sql_create_statements/sql_reporting_views_and_tables.sql @@ -1450,7 +1450,7 @@ and is_temporary = 0 group by wc.username, ui.kb_internal_user; #------------------------------ -Final user_super_summary table +# Final user_super_summary table #------------------------------ # NEEDS A CRON JOB @@ -1606,3 +1606,78 @@ from metrics.doi_ws_map dwm inner join metrics_reporting.doi_metrics_current dmc on dwm.ws_id = dmc.ws_id inner join metrics_reporting.workspaces_current wc on dmc.ws_id = wc.ws_id order by dwm.doi_url,dwm.is_parent_ws desc); + + +#------------------------------ +# Blobstore_detail reports +# Note massive table. Some of these are done in CRON job as tables, other are views. 
+#------------------------------ + +create or replace table blobstore_detail_by_ws as +( +select in_q.ws_id, sum(in_q.orig_saver_count) as orig_saver_count, +sum(in_q.non_orig_saver_count) as non_orig_saver_count, +sum(in_q.orig_saver_size_GB) as orig_saver_size_GB, +sum(in_q.non_orig_saver_size_GB) as non_orig_saver_size_GB, +sum(in_q.total_blobstore_size_GB) as total_blobstore_size_GB +from +(select ws_id, DATE_FORMAT(`save_date`,'%Y-%m') as month, +sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, +sum(orig_saver * size)/1000000000 as orig_saver_size_GB, +0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, +sum(size)/1000000000 as total_blobstore_size_GB +from blobstore_detail bd +group by ws_id, month) in_q +group by ws_id ); +Query OK, 108871 rows affected (6 min 52.38 sec) +Records: 108871 Duplicates: 0 Warnings: 0 + +create or replace view blobstore_detail_by_ws_monthly as +(select ws_id, DATE_FORMAT(`save_date`,'%Y-%m') as month, +sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, +sum(orig_saver * size)/1000000000 as orig_saver_size_GB, +0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, +sum(size)/1000000000 as total_blobstore_size_GB +from blobstore_detail bd +group by ws_id, month); + + +create or replace table blobstore_detail_by_user_monthly as +(select saver_username, DATE_FORMAT(`save_date`,'%Y-%m') as month, +sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, +sum(orig_saver * size)/1000000000 as orig_saver_size_GB, +0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, +sum(size)/1000000000 as total_blobstore_size_GB +from blobstore_detail bd +group by saver_username, month); + +create or replace table blobstore_detail_by_user as +(select saver_username, +sum(orig_saver_count) as orig_saver_count, sum(non_orig_saver_count) as non_orig_saver_count, +sum(orig_saver_size_GB) as orig_saver_size_GB, +sum(non_orig_saver_size_GB) as non_orig_saver_size_GB, +sum(total_blobstore_size_GB) as total_blobstore_size_GB +from blobstore_detail_by_user_monthly +group by saver_username); + + +create or replace table blobstore_detail_by_object_type_monthly as +(select LEFT(object_type,LOCATE('-',object_type) - 1) as object_type, +DATE_FORMAT(`save_date`,'%Y-%m') as month, +sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, +sum(orig_saver * size)/1000000000 as orig_saver_size_GB, +0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, +sum(size)/1000000000 as total_blobstore_size_GB +from blobstore_detail bd +group by object_type, month); + +create or replace view blobstore_detail_by_object_type as +(select object_type, +sum(orig_saver_count) as orig_saver_count, +sum(non_orig_saver_count) as non_orig_saver_count, +sum(orig_saver_size_GB) as orig_saver_size_GB, +sum(non_orig_saver_size_GB) as non_orig_saver_size_GB, +sum(total_blobstore_size_GB) as total_blobstore_size_GB +from blobstore_detail_by_object_type_monthly +group by object_type); + From 163b1a251c22e287cc0431f3633b527e914f1d90 Mon Sep 17 00:00:00 2001 From: Jason Baumohl Date: Tue, 1 Oct 2024 23:56:16 +0000 Subject: [PATCH 03/11] more code for blobstore details and narrative copy/duplicate info --- .../get_copy_info_for_narratives.py | 194 ++++++ .../get_duplicate_narrative_object_ids.py | 187 ++++++ ...pulate_orphaned_blobstore_nodes_handles.py | 293 ++++++++ .../upload_get_copy_info_for_narratives.py | 367 
+++++++++++ .../methods_upload_blobstore_details.py | 623 ++++++++++++++++++ .../upload_blobstore_details.py | 28 + 6 files changed, 1692 insertions(+) create mode 100644 source/custom_scripts/get_copy_info_for_narratives.py create mode 100644 source/custom_scripts/get_duplicate_narrative_object_ids.py create mode 100644 source/custom_scripts/populate_orphaned_blobstore_nodes_handles.py create mode 100644 source/custom_scripts/upload_get_copy_info_for_narratives.py create mode 100644 source/daily_cron_jobs/methods_upload_blobstore_details.py create mode 100644 source/daily_cron_jobs/upload_blobstore_details.py diff --git a/source/custom_scripts/get_copy_info_for_narratives.py b/source/custom_scripts/get_copy_info_for_narratives.py new file mode 100644 index 0000000..225df03 --- /dev/null +++ b/source/custom_scripts/get_copy_info_for_narratives.py @@ -0,0 +1,194 @@ +from pymongo import MongoClient +from pymongo import ReadPreference +#from biokbase.workspace.client import Workspace +#from installed_clients.AbstractHandleClient import AbstractHandle as HandleService +#from biokbase.service.Client import Client as ServiceClient +#import json as _json +import os +import mysql.connector as mysql +import requests +import time +#from splitting import split_sequence +from datetime import date +from datetime import datetime + +debug_mode = 1 + +if debug_mode == 1: + print("############################################") + print("############################################") + print("############################################") + print("START TIME (UTC): " + str(datetime.utcnow())) + +start_time = time.time() + +requests.packages.urllib3.disable_warnings() + +mongoDB_metrics_connection = os.environ["MONGO_PATH"] + +ws_url = os.environ["WS_URL"] +ws_user_token = os.environ["METRICS_WS_USER_TOKEN"] +to_workspace = os.environ["WRK_SUFFIX"] + +metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"] +sql_host = os.environ["SQL_HOST"] +metrics = os.environ["QUERY_ON"] + +# connect to mysql +db_connection = mysql.connect( + host=sql_host, # "mysql1", #"localhost", + user="metrics", # "root", + passwd=metrics_mysql_password, + database="metrics", # "datacamp" +) + +cursor = db_connection.cursor() +query = "use " + metrics +cursor.execute(query) + +workspaces_with_copied_reports_and_no_narratives = list() + +client = MongoClient(mongoDB_metrics_connection + to_workspace) +db = client.workspace + +# dict soucce_ws => {destination_ws => min_savedate (MIGHT NEED NARRATIVE OBJECT NUMBER) +source_ws_to_destination_ws_dict = dict() +destination_ws_set = set() + +# Key destination ws_id , key = object id of the narrative +destination_narrative_obj_id_lookup = dict() + +# Final results object; +# Key = narrative_obj_id , value = ws_obj_version of the source ws object: +destination_results_dict = dict() + +# get unique list of Report types: +query = ('select object_type, object_type_full from metrics_reporting.workspace_object_counts_current where object_type like "KBaseReport.Report%"') + +cursor.execute(query) +row_values = list() + +report_list = list() +for row_values in cursor: + report_list.append(row_values[1]) + +# GET THE INITIAL INFORMATION ABOUT COPIED REPORTS TO EXTRAPOLATE COPIED NARRATIVES: +ws_objVersions_copied_reports_cursor = db.workspaceObjVersions.find({"type":{"$in":report_list}, + "copied":{"$ne": None} + #, "ws":{"$in":[145373, 43266, 116952, 154109]} + }, + {"ws": 1, "_id": 0, "savedate": 1, "copied" : 1 }) + +for ws_objVersions_copied_report in ws_objVersions_copied_reports_cursor: + 
destination_ws = ws_objVersions_copied_report["ws"] + savedate = ws_objVersions_copied_report["savedate"] + copied_from = ws_objVersions_copied_report["copied"] + source_ws = int(copied_from.split("/")[0]) + destination_ws_set.add(destination_ws) + if source_ws not in source_ws_to_destination_ws_dict: + source_ws_to_destination_ws_dict[source_ws] = dict() + if destination_ws not in source_ws_to_destination_ws_dict[source_ws]: + source_ws_to_destination_ws_dict[source_ws][destination_ws] = dict() + if "creation_date" not in source_ws_to_destination_ws_dict[source_ws][destination_ws]: + source_ws_to_destination_ws_dict[source_ws][destination_ws]["creation_date"] = savedate + else: + if savedate < source_ws_to_destination_ws_dict[source_ws][destination_ws]["creation_date"]: + source_ws_to_destination_ws_dict[source_ws][destination_ws]["creation_date"] = savedate + +if debug_mode == 1: + print("source_ws_to_destination_ws_dict: " + str(source_ws_to_destination_ws_dict)) +# +#split the copy get source WS, fill in Datastructure, replace the min_date accordingly. + # + # + +# GET THE DESTINATION WS NARRATIVE OBJECT ID +# Has the obj id (middlw part of UPA) of the narrative obj in the new WS. Copied narratives are not object 1, but rather +# the max object id in source ws (at time of the copy) + 1 +destination_narratives_ids_lookup = dict() + +#get narrative typed objects +query = ('select object_type, object_type_full from metrics_reporting.workspace_object_counts_current where object_type like "KBaseNarrative.Narrative%"') + +cursor.execute(query) +row_values = list() + +narrative_type_list = list() +for row_values in cursor: + narrative_type_list.append(row_values[1]) + +destination_narrative_ids_cursor = db.workspaceObjVersions.find({"type":{"$in":narrative_type_list}, + "ws":{"$in":list(destination_ws_set)}, + "ver":1}, + {"ws":1, "id":1, "_id":0}) + +for dest_narrative_ws_id in destination_narrative_ids_cursor: + destination_narrative_obj_id_lookup[dest_narrative_ws_id["ws"]] = dest_narrative_ws_id["id"] + +if debug_mode == 1: + print("destination_narrative_obj_id_lookup : " + str(destination_narrative_obj_id_lookup)) + + +# GET THE COPIED FROM NARRATIVES TIMESTAMPS OF THEIR VERSIONS TO HAVE A LOOKUP FOR THE +for source_ws_id in source_ws_to_destination_ws_dict: + ordered_save_points = list() + source_version_save_points_cursor = db.workspaceObjVersions.find({"type":"KBaseNarrative.Narrative-4.0", + "ws":source_ws_id}, + {"id":1, "ver":1, "savedate":1, "_id":0}).sort("savedate") + for source_version_save_point in source_version_save_points_cursor: + source_obj_id = str(source_ws_id) + "/" + str(source_version_save_point["id"]) + "/" + str(source_version_save_point["ver"]) + savedate = source_version_save_point["savedate"] + ordered_save_points.append([savedate,source_obj_id]) + if debug_mode == 1: + print("ordered_save_points : " + str(ordered_save_points)) + + for destination_ws_id in source_ws_to_destination_ws_dict[source_ws_id]: + destination_ws_savedate = source_ws_to_destination_ws_dict[source_ws_id][destination_ws_id]["creation_date"] + source_obj_id_used = None + for ordered_save_point in ordered_save_points: + if ordered_save_point[0] <= destination_ws_savedate: + source_obj_id_used = ordered_save_point[1] + else: + break + if source_obj_id_used == None: + if debug_mode == 1: + print("ERROR: " + str(destination_ws_id) + " does not a source ws_obj that it found, could be due to saved REPORT indipendently") + if destination_ws_id not in destination_narrative_obj_id_lookup: + if 
debug_mode == 1: + print("It is a WS without a narrative object") + workspaces_with_copied_reports_and_no_narratives.append(destination_ws_id) + continue + destination_narrative_obj_id = str(destination_ws_id) + "/" + str(destination_narrative_obj_id_lookup[destination_ws_id]) + "/1" + destination_results_dict[destination_narrative_obj_id] = source_obj_id_used + +if debug_mode == 1: + print("destination_results_dict : " + str(destination_results_dict)) + print("===============================") + print("===============================") + print("===============================") + +destination_obj_id_is_none = list() + +narrative_copy_count = 0 +print("Destination_WS\tSource_WS") +for destination_obj_id in destination_results_dict: + if destination_results_dict[destination_obj_id] == None: + destination_obj_id_is_none.append(destination_obj_id) + continue + print(destination_obj_id + "\t" + destination_results_dict[destination_obj_id]) + narrative_copy_count += 1 + +if debug_mode == 1: + print("DESTINATION WORKSPACES HAVE NO NARRATIVE workspaces_with_copied_reports_and_no_narratives : " + str(workspaces_with_copied_reports_and_no_narratives)) + print("workspaces_with_copied_reports_and_no_narratives length " + str(len(workspaces_with_copied_reports_and_no_narratives))) + + print("SOURCE WS DOES NOT HAVE A NARRATIVE::::::::destination_obj_id_is_none : " + str(destination_obj_id_is_none)) + print("destination_obj_id_is_none length : " + str(len(destination_obj_id_is_none))) + + + print("destination_narrative_obj_id_lookup length: " + str(len(destination_narrative_obj_id_lookup))) + print("destination_results_dict length: " + str(len(destination_results_dict))) + + print("total narrative_copy_count : " + str(narrative_copy_count)) + print("--- total seconds %s seconds ---" % (time.time() - start_time)) +exit() diff --git a/source/custom_scripts/get_duplicate_narrative_object_ids.py b/source/custom_scripts/get_duplicate_narrative_object_ids.py new file mode 100644 index 0000000..b1ce05c --- /dev/null +++ b/source/custom_scripts/get_duplicate_narrative_object_ids.py @@ -0,0 +1,187 @@ +#import pymongo +from pymongo import MongoClient +from pymongo import ReadPreference +#from biokbase.workspace.client import Workspace +#from installed_clients.AbstractHandleClient import AbstractHandle as HandleService +#from biokbase.service.Client import Client as ServiceClient +#import json as _json +import os +import mysql.connector as mysql +import requests +import time +#from splitting import split_sequence +from datetime import date +from datetime import datetime + +debug_mode = 1 + +if debug_mode == 1: + print("############################################") + print("############################################") + print("############################################") + print("START TIME (UTC): " + str(datetime.utcnow())) + +start_time = time.time() + +requests.packages.urllib3.disable_warnings() + +mongoDB_metrics_connection = os.environ["MONGO_PATH"] + +ws_url = os.environ["WS_URL"] +ws_user_token = os.environ["METRICS_WS_USER_TOKEN"] +to_workspace = os.environ["WRK_SUFFIX"] + +metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"] +sql_host = os.environ["SQL_HOST"] +metrics = os.environ["QUERY_ON"] + +# connect to mysql +db_connection = mysql.connect( + host=sql_host, # "mysql1", #"localhost", + user="metrics", # "root", + passwd=metrics_mysql_password, + database="metrics", # "datacamp" +) + +cursor = db_connection.cursor() +query = "use " + metrics +cursor.execute(query) + +client = 
MongoClient(mongoDB_metrics_connection + to_workspace) +db = client.workspace + +workspaces_without_corresponding_versions_data = list() + +#workspaces_with_multiple_narrative_obj_ids = dict() + +############################## +# +# get the list of narrative_typed objects +# +############################ +def get_narrative_typed_objects_list(cursor): + # get list of narrative typed objects on the system + query = ('select object_type, object_type_full from metrics_reporting.workspace_object_counts_current where object_type like "KBaseNarrative.Narrative%"') + cursor.execute(query) + narrative_type_list = list() + for row_values in cursor: + narrative_type_list.append(row_values[1]) + return narrative_type_list + +########################3 +# +# get_ws_narratives with duplicate narrative_ids +# +##################### +def get_multiple_narratives_count_dict(cursor): + # get list of narrative typed objects on the system + query = ('select ws_id, num_nar_obj_ids from metrics_reporting.workspaces_current where num_nar_obj_ids > 1') + cursor.execute(query) + multiple_narrative_count_dict = dict() + for row_values in cursor: + multiple_narrative_count_dict[row_values[0]] = row_values[1] +# print(" multiple_narrative_count_dict : " + str(multiple_narrative_count_dict)) +# print(" multiple_narrative_count_dict length : " + str(len(multiple_narrative_count_dict))) + return multiple_narrative_count_dict + +#################### +# +# get active narrative for all of these workspaces (note may have by name) +# Then get the list of all non_active obj_ids for these narratives +# Confirm the length of each list is n-1 relative to the count list +# +################## +def get_non_active_narrative_object_ids(narrative_type_list, multiple_narrative_count_dict, db): + narrative_active_id_dict = dict() + list_of_workspace_to_check = list(multiple_narrative_count_dict.keys()) +# print("list_of_workspace_to_check len : " + str(len(list_of_workspace_to_check))) + + ws_narratives_dict = dict() +# + narrative_obj_ids_not_int_dict = dict() #key ws -> value the narrative value +# + narrative_obj_ids_not_int_never_resolved = dict() #key ws -> value the narrative value +# + meta_without_narrative_count = 0 +# + meta_with_multiple_narratives_count = 0 + + workspaces_with_meta_cursor = db.workspaces.find({"meta" : {"$exists": True}, "ws" : {"$in":list_of_workspace_to_check}},{"ws":1,"meta":1}) + workspaces_with_meta_cursor_count = 0 + for workspace_with_meta in workspaces_with_meta_cursor: + workspaces_with_meta_cursor_count += 1 + narrative_ws_id = workspace_with_meta["ws"] + meta_narrative = None + for meta_element in workspace_with_meta["meta"]: +# print(" meta_element : " + str(meta_element)) + if "narrative" == meta_element["k"]: +# print("narrative in meta element") + if meta_narrative is None: + meta_narrative = meta_element["v"] + else: + if meta_narrative != meta_element["v"]: + meta_with_multiple_narratives_count += 1 +# print(" workspace_with_meta multiple narratives : " + str( workspace_with_meta["meta"])) + if meta_narrative is None: + meta_without_narrative_count += 1 + else: + try: + narrative_active_id_dict[narrative_ws_id] = int(meta_narrative) + except ValueError: +# del(narrative_active_id_dict[narrative_ws_id]) + narrative_obj_ids_not_int_dict[narrative_ws_id] = meta_narrative + #NOW NEED TO RESOLVE THE narrative id indicator that is not an integer: + for narrative_obj_id_not_int in narrative_obj_ids_not_int_dict: +# print("narrative_obj_id_not_int : " + str(narrative_obj_id_not_int)) +# 
print("narrative_obj_ids_not_int_dict[narrative_obj_id_not_int] : " + str(narrative_obj_ids_not_int_dict[narrative_obj_id_not_int])) + workspaceObjectsName_cursor = db.workspaceObjects.find({"ws": narrative_obj_id_not_int, + "name": narrative_obj_ids_not_int_dict[narrative_obj_id_not_int]}, + {"ws":1,"id":1}) + record_found = 0 + for workspaceObjectsName in workspaceObjectsName_cursor: + record_found = 1 + narrative_active_id_dict[narrative_obj_id_not_int] = workspaceObjectsName["id"] + if record_found == 0: + narrative_obj_ids_not_int_never_resolved[narrative_obj_id_not_int] = narrative_obj_ids_not_int_dict[narrative_obj_id_not_int] + +# print("workspaces_with_meta_cursor count : " + str(workspaces_with_meta_cursor_count)) +# print("meta_without_narrative_count : " + str(meta_without_narrative_count)) +# print("meta_with_multiple_narratives_count : " + str(meta_with_multiple_narratives_count)) +# print("narrative_obj_ids_not_int_never_resolved : " + str(narrative_obj_ids_not_int_never_resolved)) +# print("narrative_obj_ids_not_int_never_resolved length : " + str(len(narrative_obj_ids_not_int_never_resolved))) +# print("narrative_active_id_dict length : " + str(len(narrative_active_id_dict))) +# print("narrative_active_id_dict : " + str(narrative_active_id_dict)) +# print("narrative_type_list : " + str(narrative_type_list)) + +# exit() + + # key narrative id -> value comma delimited string of non_active_ids + return_non_active_ids_dict = dict() + + for narrative_with_active_id in narrative_active_id_dict: + # now determine which obj_ids are non-active narrative objects. + # confirm the number gotten back metches the count in (multiple_narrative_count_dict - 1) + non_active_narrative_ids_set = set() + narrative_obj_ids_cursor = db.workspaceObjVersions.find({ "ws": narrative_with_active_id, "type" : {"$in":narrative_type_list}},{"id":1, "ws":1, "_id":0}) + for narrative_obj_ids_row in narrative_obj_ids_cursor: + narrative_obj_id = narrative_obj_ids_row["id"] +# print("narrative_obj_id : " + str(narrative_obj_id)) + if narrative_obj_id != narrative_active_id_dict[narrative_with_active_id] : + non_active_narrative_ids_set.add(narrative_obj_id) + if len(non_active_narrative_ids_set) != (multiple_narrative_count_dict[narrative_with_active_id] - 1): + print("narrative_with_active_id : " + str(narrative_with_active_id) + " has a length of non_actives of " + str(len(non_active_narrative_ids_set)) + + " but the multiple_narrative_count_dict has a value of : " + str(multiple_narrative_count_dict[narrative_with_active_id]) + + " here are the non actives : " + str(non_active_narrative_ids_set)) + else: + return_non_active_ids_dict[narrative_with_active_id] = ",".join(str(x) for x in list(non_active_narrative_ids_set)) + + for return_non_active_id in return_non_active_ids_dict: + print(str(return_non_active_id) + "\t" + return_non_active_ids_dict[return_non_active_id]) +# print("return_non_active_ids_dict : " + str(return_non_active_ids_dict)) + + +narrative_type_list = get_narrative_typed_objects_list(cursor) +multiple_narrative_count_dict = get_multiple_narratives_count_dict(cursor) +get_non_active_narrative_object_ids(narrative_type_list, multiple_narrative_count_dict, db) + + diff --git a/source/custom_scripts/populate_orphaned_blobstore_nodes_handles.py b/source/custom_scripts/populate_orphaned_blobstore_nodes_handles.py new file mode 100644 index 0000000..1776779 --- /dev/null +++ b/source/custom_scripts/populate_orphaned_blobstore_nodes_handles.py @@ -0,0 +1,293 @@ +from pymongo import 
MongoClient +from pymongo import ReadPreference +from biokbase.workspace.client import Workspace +#from installed_clients.AbstractHandleClient import AbstractHandle as HandleService +from biokbase.service.Client import Client as ServiceClient +import json as _json +import os +import mysql.connector as mysql +import requests +import time +#from splitting import split_sequence +from datetime import date +from datetime import datetime + +print("############################################") +print("############################################") +print("############################################") +print("START TIME (UTC): " + str(datetime.utcnow())) +start_time = time.time() + +requests.packages.urllib3.disable_warnings() + +mongoDB_metrics_connection = os.environ["MONGO_PATH"] + +ws_url = os.environ["WS_URL"] +ws_user_token = os.environ["METRICS_WS_USER_TOKEN"] +to_workspace = os.environ["WRK_SUFFIX"] + +to_blobstore = os.environ["BLOBSTORE_SUFFIX"] +to_handle_db = os.environ["HANDLE_DB_SUFFIX"] + + +client = MongoClient(mongoDB_metrics_connection + to_workspace) +db = client.workspace +handle_service_url = "https://kbase.us/services/handle_service" + +metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"] +sql_host = os.environ["SQL_HOST"] +query_on = os.environ["QUERY_ON"] +# connect to mysql +db_connection = mysql.connect( + host=sql_host, user="metrics", passwd=metrics_mysql_password, database="metrics" +) +cursor = db_connection.cursor() +query = "use " + query_on +cursor.execute(query) + +#wsadmin = Workspace(ws_url, token=ws_user_token) +#hs = HandleService(handle_service_url, token=ws_user_token) + +def get_blobstore_nodes (): + client_blobstore = MongoClient(mongoDB_metrics_connection + to_blobstore) + db_blobstore = client_blobstore.blobstore + + blobstore_nodes_set = set() + blobstore_dict = dict() + + nodes_query = db_blobstore.nodes.find({},{"_id": 0, "id": 1, "own.user": 1, "time": 1}) + for record in nodes_query: + blobstore_node_id = record["id"] + user = "empty" + if "own" in record and "user" in record["own"]: + user = record["own"]["user"] + save_date = record["time"] + blobstore_nodes_set.add(blobstore_node_id) + blobstore_dict[blobstore_node_id] = {"user": user, + "date": save_date, + } + return (blobstore_nodes_set, blobstore_dict) + +def get_handles_and_blobstore_ids (): + client_handle_db = MongoClient(mongoDB_metrics_connection + to_handle_db) + db_handle = client_handle_db.handle_db + + handles_set = set() + handles_blobstore_ids_set = set() + handles_by_hid_dict = dict() + handles_by_bsid_dict = dict() + + handles_query = db_handle.handle.find({},{"_id": 0, "id": 1, "hid": 1, "created_by":1, "creation_date":1}) + for record in handles_query: + blobstore_id = record["id"] + handle = record["hid"] + user = record["created_by"] + save_date = record["creation_date"] + handles_set.add(handle) + handles_blobstore_ids_set.add(blobstore_id) + handles_by_hid_dict[handle] = {"bsid": blobstore_id, + "user": user, + "date": save_date, + } + handles_by_bsid_dict[blobstore_id] = {"handle" : handle, + "user": user, + "date": save_date, + } + + return (handles_set, handles_blobstore_ids_set, handles_by_hid_dict, handles_by_bsid_dict) + +def get_workspace_handles (): + workspace_handles_set = set() + workspace_dict = dict() + ws_obj_vers_cursor = db.workspaceObjVersions.find( + {#"ws":312, + "extids.handle" : { "$exists": True }}, + { + "type": 1, + "ws": 1, + "id": 1, + "ver": 1, + "extids": 1, + "savedate": 1, + "savedby": 1, + "_id": 0, + }, + no_cursor_timeout=True + ) + 
for ws_obj_ver in ws_obj_vers_cursor: + obj_type = ws_obj_ver["type"] + ws = ws_obj_ver["ws"] + obj_id = ws_obj_ver["id"] + ver = ws_obj_ver["ver"] + savedate = ws_obj_ver["savedate"] + savedby = ws_obj_ver["savedby"] + extids = ws_obj_ver["extids"] + handles = extids["handle"] + full_obj_id = str(ws) + "/" + str(obj_id) + "/" + str(ver) + for handle in handles: + (kbh_prefix, str_handle_id) = handle.split("_") + int_handle = int(str_handle_id) + workspace_handles_set.add(int_handle) + if int_handle not in workspace_dict : + workspace_dict[int_handle] = dict() + workspace_dict[int_handle][full_obj_id] = { "ws" : ws, + "date" : savedate, + "user" : savedby, + "type" : obj_type + } + return (workspace_handles_set, workspace_dict) + +(blobstore_nodes_set, blobstore_dict) = get_blobstore_nodes() +print("blobstore_nodes_set length : " + str(len(blobstore_nodes_set))) +(handles_set, handles_blobstore_ids_set, handles_by_hid_dict, handles_by_bsid_dict) = get_handles_and_blobstore_ids() +print("handles_set length : " + str(len(handles_set))) +print("handle_blobstore_ids_set length : " + str(len(handles_blobstore_ids_set))) +(workspace_handles_set, workspaces_dict) = get_workspace_handles() +print("workspace_handles_set length : " + str(len(workspace_handles_set))) + +blobstore_nodes_not_in_handles_set = blobstore_nodes_set.difference(handles_blobstore_ids_set) +handles_blobstores_not_in_blobstore_nodes = handles_blobstore_ids_set.difference(blobstore_nodes_set) + +handles_not_in_worspace_handles_set = handles_set.difference(workspace_handles_set) +workspace_handles_not_in_handles_set = workspace_handles_set.difference(handles_set) + + +wsov_handle_ids_not_in_handle_insert_cursor = db_connection.cursor(prepared=True) +wsov_handle_ids_not_in_handle_insert_statement = ( + "insert into metrics.wsov_handle_ids_not_in_handle " + "(ws_obj_ver_id, save_date, ws_id, handle_id, username, type) " + "values(%s, %s, %s, %s, %s, %s)" +) + + +for handle_id in workspace_handles_not_in_handles_set: + for full_obj_id in workspaces_dict[handle_id]: + ws_id = workspaces_dict[handle_id][full_obj_id]["ws"] + save_date = workspaces_dict[handle_id][full_obj_id]["date"] + user = workspaces_dict[handle_id][full_obj_id]["user"] + obj_type = workspaces_dict[handle_id][full_obj_id]["type"] + + input_vals = ( + full_obj_id, + save_date, + ws_id, + handle_id, + user, + obj_type, + ) + wsov_handle_ids_not_in_handle_insert_cursor.execute(wsov_handle_ids_not_in_handle_insert_statement, input_vals) + +handle_ids_not_in_ws_obj_ver_insert_cursor = db_connection.cursor(prepared=True) +handle_ids_not_in_ws_obj_ver_insert_statement = ( + "insert into metrics.handle_ids_not_in_ws_obj_ver " + "(blobstore_id, handle_id, username, save_date) " + "values(%s, %s, %s, %s) " +) + +for handle_id in handles_not_in_worspace_handles_set: + bsid = handles_by_hid_dict[handle_id]["bsid"] + user = handles_by_hid_dict[handle_id]["user"] + if user is None: + print("Entry for handle_id " + str(handle_id) + " :: " + str(handles_by_hid_dict[handle_id])) + user = "No User Found" + save_date = handles_by_hid_dict[handle_id]["date"] + input_vals = ( + bsid, + handle_id, + user, + save_date, + ) + handle_ids_not_in_ws_obj_ver_insert_cursor.execute(handle_ids_not_in_ws_obj_ver_insert_statement, input_vals) + +handles_blobstore_ids_not_in_nodes_insert_cursor = db_connection.cursor(prepared=True) +handles_blobstore_ids_not_in_nodes_insert_statement = ( + "insert into metrics.handles_blobstore_ids_not_in_nodes " + "(blobstore_id, handle_id, username, save_date) " + 
"values(%s, %s, %s, %s) " +) + +for bsid in handles_blobstores_not_in_blobstore_nodes: + handle_id = handles_by_bsid_dict[bsid]["handle"] + user = handles_by_bsid_dict[bsid]["user"] + if user is None: + print("Entry for bsid " + str(bsid) + " :: " + str(handles_by_bsid_dict[bsid])) + user = "No User Found" + save_date = handles_by_bsid_dict[bsid]["date"] + input_vals = ( + bsid, + handle_id, + user, + save_date, + ) + handles_blobstore_ids_not_in_nodes_insert_cursor.execute(handles_blobstore_ids_not_in_nodes_insert_statement, input_vals) + + +blobstore_ids_not_in_handle_insert_cursor = db_connection.cursor(prepared=True) +blobstore_ids_not_in_handle_insert_statement = ( + "insert into metrics.blobstore_ids_not_in_handle " + "(blobstore_id, username, save_date) " + "values(%s, %s, %s) " + ) + +for blobstore_id in blobstore_nodes_not_in_handles_set: + user = blobstore_dict[blobstore_id]["user"] + if user is None: + print("Entry for bsid " + str(blobstore_id) + " :: " + str(blobstore_dict[blobstore_id])) + user = "No User Found" + save_date = blobstore_dict[blobstore_id]["date"] + input_vals = ( + blobstore_id, + user, + save_date, + ) + blobstore_ids_not_in_handle_insert_cursor.execute(blobstore_ids_not_in_handle_insert_statement, input_vals) + + +i = 0 +print("Blobstore_dict :") +for bs_id in blobstore_dict: + i += 1 + if i > 4: + break + print("Blobstore : " + bs_id + " ::: " + str(blobstore_dict[bs_id])) + +i = 0 +print("handle_by_hid_dict :") +for hid in handles_by_hid_dict: + i += 1 + if i > 4: + break + print("Handle : " + str(hid) + " ::: " + str(handles_by_hid_dict[hid])) + +i = 0 +print("handle_by_bsid_dict :") +for bsid in handles_by_bsid_dict: + i += 1 + if i > 4: + break + print("BSID : " + str(bsid) + " ::: " + str(handles_by_bsid_dict[bsid])) + +i = 0 +print("workspaces_dict :") +for hid in workspaces_dict: + i += 1 + if i > 4: + break + print("Handle : " + str(hid) + " ::: " + str(workspaces_dict[hid])) + +print("blobstore_nodes_set length : " + str(len(blobstore_nodes_set))) +print("handle_blobstore_ids_set length : " + str(len(handles_blobstore_ids_set))) +print("handles_set length : " + str(len(handles_set))) +print("workspace_handles_set length : " + str(len(workspace_handles_set))) + +print("blobstore_nodes_not_in_handles_set length : " + str(len(blobstore_nodes_not_in_handles_set))) +print("handles_blobstores_not_in_blobstore_nodes length : " + str(len(handles_blobstores_not_in_blobstore_nodes))) +print("handles_not_in_worspace_handles_set length : " + str(len(handles_not_in_worspace_handles_set))) +print("workspace_handles_not_in_handles_set : " + str(len(workspace_handles_not_in_handles_set))) + +print("--- total seconds %s seconds ---" % (time.time() - start_time)) + +db_connection.commit() +db_connection.close() + +exit() diff --git a/source/custom_scripts/upload_get_copy_info_for_narratives.py b/source/custom_scripts/upload_get_copy_info_for_narratives.py new file mode 100644 index 0000000..251a830 --- /dev/null +++ b/source/custom_scripts/upload_get_copy_info_for_narratives.py @@ -0,0 +1,367 @@ +#import pymongo +from pymongo import MongoClient +from pymongo import ReadPreference +#from biokbase.workspace.client import Workspace +#from installed_clients.AbstractHandleClient import AbstractHandle as HandleService +#from biokbase.service.Client import Client as ServiceClient +#import json as _json +import os +import mysql.connector as mysql +import requests +import time +#from splitting import split_sequence +from datetime import date +from datetime import datetime + 
+debug_mode = 1 + +if debug_mode == 1: + print("############################################") + print("############################################") + print("############################################") + print("START TIME (UTC): " + str(datetime.utcnow())) + +start_time = time.time() + +requests.packages.urllib3.disable_warnings() + +mongoDB_metrics_connection = os.environ["MONGO_PATH"] + +ws_url = os.environ["WS_URL"] +ws_user_token = os.environ["METRICS_WS_USER_TOKEN"] +to_workspace = os.environ["WRK_SUFFIX"] + +metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"] +sql_host = os.environ["SQL_HOST"] +metrics = os.environ["QUERY_ON"] + +# connect to mysql +db_connection = mysql.connect( + host=sql_host, # "mysql1", #"localhost", + user="metrics", # "root", + passwd=metrics_mysql_password, + database="metrics", # "datacamp" +) + +cursor = db_connection.cursor() +query = "use " + metrics +cursor.execute(query) + +client = MongoClient(mongoDB_metrics_connection + to_workspace) +db = client.workspace + +workspaces_without_corresponding_versions_data = list() + +#workspaces_with_multiple_narrative_obj_ids = dict() + +############################## +# +# get the list of narrative_typed objects +# +############################ +def get_narrative_typed_objects_list(cursor): + # get list of narrative typed objects on the system + query = ('select object_type, object_type_full from metrics_reporting.workspace_object_counts_current where object_type like "KBaseNarrative.Narrative%"') + cursor.execute(query) + narrative_type_list = list() + for row_values in cursor: + narrative_type_list.append(row_values[1]) + return narrative_type_list + +############################### +# +# Get a dict of Workspaces that contain a narrative - with its corresponding info +# {key: ws_id => {"id" => Object ID of the version 1 of the narratove, +# "savedate" => date that version 1 of the narrative was created. 
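+#                     e.g. {12345: {"id": 1, "savedate": <datetime of the first narrative version>}}  (hypothetical ws_id and values)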
+# +############################### +def get_ws_narratives(db): +#def get_ws_narratives(db, narrative_type_list): + ws_narratives_dict = dict() +# workspaces_with_multiple_narrative_obj_ids = dict() + narrative_obj_ids_not_int_dict = dict() #key ws -> value the narrative value + narrative_obj_ids_not_int_never_resolved = dict() #key ws -> value the narrative value + + meta_without_narrative_count = 0 + meta_with_multiple_narratives_count = 0 + + workspaces_with_meta_cursor = db.workspaces.find({"meta" : {"$exists": True}},{"ws":1,"meta":1}) + for workspace_with_meta in workspaces_with_meta_cursor: + narrative_ws_id = workspace_with_meta["ws"] +# if narrative_ws_id != 100417: +# continue + meta_narrative = None +# print("narrative_ws_id : " + str(narrative_ws_id)) +# print(" workspace_with_meta meta : " + str( workspace_with_meta["meta"])) + for meta_element in workspace_with_meta["meta"]: +# print(" meta_element : " + str(meta_element)) + if "narrative" == meta_element["k"]: +# print("narrative in meta element") + if meta_narrative is None: + meta_narrative = meta_element["v"] + else: + if meta_narrative != meta_element["v"]: + meta_with_multiple_narratives_count += 1 +# print(" workspace_with_meta multiple narratives : " + str( workspace_with_meta["meta"])) + if meta_narrative is None: + meta_without_narrative_count += 1 + else: + ws_narratives_dict[narrative_ws_id] = dict() + try: + ws_narratives_dict[narrative_ws_id]["id"] = int(meta_narrative) + except ValueError: + del(ws_narratives_dict[narrative_ws_id]) + narrative_obj_ids_not_int_dict[narrative_ws_id] = meta_narrative + #NOW NEED TO RESOLVE THE narrative id indicator that is not an integer: + for narrative_obj_id_not_int in narrative_obj_ids_not_int_dict: +# print("narrative_obj_id_not_int : " + str(narrative_obj_id_not_int)) +# print("narrative_obj_ids_not_int_dict[narrative_obj_id_not_int] : " + str(narrative_obj_ids_not_int_dict[narrative_obj_id_not_int])) + workspaceObjectsName_cursor = db.workspaceObjects.find({"ws": narrative_obj_id_not_int, + "name": narrative_obj_ids_not_int_dict[narrative_obj_id_not_int]}, + {"ws":1,"id":1}) + record_found = 0 + for workspaceObjectsName in workspaceObjectsName_cursor: + record_found = 1 + ws_narratives_dict[narrative_obj_id_not_int] = dict() + ws_narratives_dict[narrative_obj_id_not_int]["id"] = workspaceObjectsName["id"] + if record_found == 0: + narrative_obj_ids_not_int_never_resolved[narrative_obj_id_not_int] = narrative_obj_ids_not_int_dict[narrative_obj_id_not_int] + narrative_obj_id_not_int + + print("meta_without_narrative_count : " + str(meta_without_narrative_count)) + print("meta_with_multiple_narratives_count : " + str(meta_with_multiple_narratives_count)) + print("ws_narratives_dict_length : " + str(len(ws_narratives_dict))) + print("narrative_obj_ids_not_int_never_resolved : " + str(narrative_obj_ids_not_int_never_resolved)) + print("narrative_obj_ids_not_int_never_resolved length : " + str(len(narrative_obj_ids_not_int_never_resolved))) + print("ws_narratives_dict length 1 : " + str(len(ws_narratives_dict))) + +# exit() + + processed_narratives_count = 0 + test_ws_narratives_dict = dict() + + # NOW DETERMINE THE SAVEDATE + for narrative_ws_id in ws_narratives_dict: + processed_narratives_count += 1 +# if processed_narratives_count < 140000: +# continue +# test_list = [ 13129,59769,56261,100417 ] +# if narrative_ws_id in test_list: + # NOW GET THE SAVE DATE FOR THE FIRST NARRATIVE VERSION +# print("Narrative ws id : " + str(narrative_ws_id)) +# obj_id = 
ws_narratives_dict[narrative_ws_id]["id"] +# print("id : " + str(ws_narratives_dict[narrative_ws_id]["id"])) + get_narrative_savedate_cursor = db.workspaceObjVersions.find({"ws": narrative_ws_id, "id":ws_narratives_dict[narrative_ws_id]["id"], "ver":1},{"ws":1, "id":1, "savedate":1, "_id":0}) + found_object_ver = 0 + for narrative_savedate_record in get_narrative_savedate_cursor: + ws_narratives_dict[narrative_ws_id]["savedate"] = narrative_savedate_record["savedate"] + found_object_ver = 1 +# test_ws_narratives_dict[narrative_ws_id] = ws_narratives_dict[narrative_ws_id] + if found_object_ver == 0: + workspaces_without_corresponding_versions_data.append(narrative_ws_id) + if processed_narratives_count % 1000 == 0: + print("Processed savedate for : " + str(processed_narratives_count) + " narratives") +# print("test_ws_narratives_dict : " + str(test_ws_narratives_dict)) +# print("test_ws_narratives_dict length 2: " + str(len(test_ws_narratives_dict))) +# print("test_ws_narratives_dict : " + str(test_ws_narratives_dict)) + + for ws_id_to_delete in workspaces_without_corresponding_versions_data: + del(ws_narratives_dict[ws_id_to_delete]) + print("ws_narratives_dict length 2: " + str(len(ws_narratives_dict))) + + return ws_narratives_dict + + +############################# +# +# Determine if the narrative was created from a copied operation +# Grab all ws_obj_versions that have a savedate <= the savedate of the first version of the narratoive object +# If all those objects have copied and from the same source WS, all are version 1, and all have a lower object id than the narrative object. +# Then it was copied from that WS. Now determine which version of that narrative was it copied from. +# Then look at versions of source narrative and take correct one with max date that is less than destination narrative savedate +# +############################# +def determine_if_narratives_are_copies(db, ws_narratives_dict, narrative_type_list): + ws_that_were_narrative_copy_list = list() + copied_ws_narratives_dict = dict() + source_ws_id_to_copied_ws_ids = dict() + + multiple_workspace_source_count = 0 + multiple_workspace_source_set = set() + fresh_narrative_count = 0 + not_all_pre_objects_copied_count = 0 + not_all_pre_objects_copied_set = set() + final_else_count = 0 + +# temp_ws_narratives_dict = dict() +# temp_ws_narratives_dict[103334] = ws_narratives_dict[103334] +# ws_narratives_dict = temp_ws_narratives_dict + + print("ws_narratives_dict length : " + str(len(ws_narratives_dict))) + + for potential_narrative_ws in sorted(ws_narratives_dict): + objects_to_check_count = 0 + objects_copied_count = 0 + workspace_ids_copied_from_set = set() + print("potential_narrative_ws : " + str(potential_narrative_ws) + " Dict: " + str(ws_narratives_dict[potential_narrative_ws])) + object_to_check_cursor = db.workspaceObjVersions.find({"savedate":{"$lt":ws_narratives_dict[potential_narrative_ws]["savedate"]}, + "ws":potential_narrative_ws}, + {"ws":1, "id":1, "copied":1,"savedate":1, "ver":1, "type":1, "_id":0}); + + for object_to_check in object_to_check_cursor: + object_type = object_to_check["type"] + if object_type in narrative_type_list: + # skip narrative objects + continue + copied_from = object_to_check["copied"] +# print("copied_from : " + str(copied_from)) + if copied_from is not None: + source_ws = int(copied_from.split("/")[0]) +# if objects_copied_count == 2: +# source_ws = 111 + workspace_ids_copied_from_set.add(source_ws) + objects_copied_count += 1 + objects_to_check_count += 1 + if 
objects_copied_count > 0 and (objects_to_check_count == objects_copied_count) and (len(workspace_ids_copied_from_set) == 1): + copied_ws_narratives_dict[potential_narrative_ws] = ws_narratives_dict[potential_narrative_ws] + source_ws_id = list(workspace_ids_copied_from_set)[0] + copied_ws_narratives_dict[potential_narrative_ws]["copied_from"] = source_ws_id + if source_ws_id not in source_ws_id_to_copied_ws_ids: + source_ws_id_to_copied_ws_ids[source_ws_id] = list() + source_ws_id_to_copied_ws_ids[source_ws_id].append(potential_narrative_ws) +# print("IT WAS COPIED : WS : " + str(potential_narrative_ws) + " copied from : " + str(workspace_ids_copied_from_set)) + elif len(workspace_ids_copied_from_set) > 1: +# print("NOT COPIED FROM ONE WS : " + str(workspace_ids_copied_from_set)) + multiple_workspace_source_count += 1 + multiple_workspace_source_set.add(potential_narrative_ws) + elif objects_copied_count == 0: +# print("This was a fresh narrative") + fresh_narrative_count += 1 + elif objects_copied_count != objects_to_check_count: +# print("Not all objectswere copied") + not_all_pre_objects_copied_count += 1 + not_all_pre_objects_copied_set.add(potential_narrative_ws) + else: +# print("Should hopefully never get here") + final_else_count += 1 + print("Processed ws : " + str(potential_narrative_ws)) +# print("copied_ws_narratives_dict : " + str(copied_ws_narratives_dict)) + + + + print("multiple_workspace_source_count : " + str(multiple_workspace_source_count)) + print("multiple_workspace_source_set : " + str(sorted(multiple_workspace_source_set))) + print("fresh_narrative_count : " + str(fresh_narrative_count)) + print("not_all_pre_objects_copied_count : " + str(not_all_pre_objects_copied_count)) + print("not_all_pre_objects_copied_set : " + str(sorted(not_all_pre_objects_copied_set))) + print("final_else_count : " + str(final_else_count)) + +# multiple_workspace_source_in_multi_narrative_count = 0 +# for temp_ws_id in multiple_workspace_source_set: +# if temp_ws_id in workspaces_with_multiple_narrative_obj_ids: +# multiple_workspace_source_in_multi_narrative_count += 1 +# print("multiple_workspace_source_in_multi_narrative_count : " + str(multiple_workspace_source_in_multi_narrative_count)) + +# not_all_pre_objects_copied_in_multi_narrative_count = 0 +# for temp_ws_id in not_all_pre_objects_copied_set: +# if temp_ws_id in workspaces_with_multiple_narrative_obj_ids: +# not_all_pre_objects_copied_in_multi_narrative_count += 1 +# print("not_all_pre_objects_copied_in_multi_narrative_count : " + str(not_all_pre_objects_copied_in_multi_narrative_count)) + + return (copied_ws_narratives_dict,source_ws_id_to_copied_ws_ids) + +def determine_source_narrative_version(db, copied_ws_narratives_dict, source_ws_id_to_copied_ws_ids, narrative_type_list): + destination_upa_from_source_upa_dict = dict() + returned_copied_ws_narratives_dict = dict() + unable_to_find_source_upa = list() + for source_ws_id in source_ws_id_to_copied_ws_ids: + ordered_save_points = list() + source_version_save_points_cursor = db.workspaceObjVersions.find({"type":{"$in":narrative_type_list}, + "ws":source_ws_id}, + {"id":1, "ver":1, "savedate":1, "_id":0}).sort("savedate") + for source_version_save_point in source_version_save_points_cursor: + source_obj_id = str(source_ws_id) + "/" + str(source_version_save_point["id"]) + "/" + str(source_version_save_point["ver"]) + savedate = source_version_save_point["savedate"] + ordered_save_points.append([savedate,source_obj_id]) + + for destination_ws_id in 
source_ws_id_to_copied_ws_ids[source_ws_id]:
+            destination_ws_savedate = copied_ws_narratives_dict[destination_ws_id]["savedate"]
+            source_obj_id_used = None
+            for ordered_save_point in ordered_save_points:
+                if ordered_save_point[0] <= destination_ws_savedate:
+                    source_obj_id_used = ordered_save_point[1]
+                else:
+                    break
+            if source_obj_id_used is None:
+                unable_to_find_source_upa.append(destination_ws_id)
+            else:
+                destination_upa = str(destination_ws_id) + "/" + str(copied_ws_narratives_dict[destination_ws_id]["id"]) + "/1"
+                destination_upa_from_source_upa_dict[destination_upa] = source_obj_id_used
+                returned_copied_ws_narratives_dict[destination_ws_id] = copied_ws_narratives_dict[destination_ws_id]
+                returned_copied_ws_narratives_dict[destination_ws_id]["destination_narrative_upa"] = destination_upa
+                returned_copied_ws_narratives_dict[destination_ws_id]["source_narrative_upa"] = source_obj_id_used
+    return (destination_upa_from_source_upa_dict,returned_copied_ws_narratives_dict)
+
+def upload_past_narrative_copies(returned_copied_ws_narratives_dict):
+    prep_cursor = db_connection.cursor(prepared=True)
+    past_narrative_copies_insert_statement = (
+        "insert into past_narrative_copies "
+        "(source_narrative_id, source_narrative_upa, destination_narrative_id, destination_narrative_upa, destination_narrative_save_date) "
+        "values(%s, %s, %s, %s, %s);")
+    for copied_narrative_ws_id in returned_copied_ws_narratives_dict:
+        input = (returned_copied_ws_narratives_dict[copied_narrative_ws_id]['copied_from'],
+                 returned_copied_ws_narratives_dict[copied_narrative_ws_id]['source_narrative_upa'],
+                 copied_narrative_ws_id,
+                 returned_copied_ws_narratives_dict[copied_narrative_ws_id]['destination_narrative_upa'],
+                 returned_copied_ws_narratives_dict[copied_narrative_ws_id]['savedate'])
+        prep_cursor.execute(past_narrative_copies_insert_statement, input)
+    db_connection.commit()
+
+narrative_type_list = get_narrative_typed_objects_list(cursor)
+#ws_narratives_dict = get_ws_narratives(db, narrative_type_list)
+ws_narratives_dict = get_ws_narratives(db)
+print("ws_narratives_dict length : " + str(len(ws_narratives_dict)))
+
+# NEED TO CODE UP A WS ADMINISTER CALL (TO DO) AND THEN REPOPULATE WS_NARRATIVES_DICT WITH THE PROPER NARRATIVE
+# SEE methods_upload_workspace_stats line 337 to 339. 
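+# Added explanatory sketch (comments only, not executed; the helper name is illustrative): per workspace,
+# determine_if_narratives_are_copies() boils down to this rule -- every non-narrative object saved
+# before the narrative's first version must carry a "copied" reference, and all of those references
+# must point at the same source workspace:
+#
+#     def looks_like_copied_narrative(pre_existing_objects):
+#         # pre_existing_objects: non-narrative objects with savedate < the narrative's first savedate,
+#         # each a dict whose "copied" field is "source_ws/obj/ver" or None
+#         copied = [o for o in pre_existing_objects if o.get("copied")]
+#         sources = {int(o["copied"].split("/")[0]) for o in copied}
+#         return len(copied) > 0 and len(copied) == len(pre_existing_objects) and len(sources) == 1
+#
+# determine_source_narrative_version() then picks the latest version of the source narrative whose
+# savedate is <= the destination narrative's savedate.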
+#ws_narratives_dict = cleanup_multiple_narrative_object_ids(db, ws_narratives_dict, workspaces_with_multiple_narrative_obj_ids) +(copied_ws_narratives_dict,source_ws_id_to_copied_ws_ids) = determine_if_narratives_are_copies(db, ws_narratives_dict, narrative_type_list) +(destination_upa_from_source_upa_dict,returned_copied_ws_narratives_dict) = determine_source_narrative_version(db, copied_ws_narratives_dict, source_ws_id_to_copied_ws_ids, narrative_type_list) +upload_past_narrative_copies(returned_copied_ws_narratives_dict) + +print("copied_ws_narratives_dict length : " + str(len(copied_ws_narratives_dict))) +print("source_ws_id_to_copied_ws_ids length : " + str(len(source_ws_id_to_copied_ws_ids))) +print("destination_upa_from_source_upa_dict length : " + str(len(destination_upa_from_source_upa_dict))) +print("workspaces_without_corresponding_versions_data : " + str(workspaces_without_corresponding_versions_data)) +print("workspaces_without_corresponding_versions_data length : " + str(len(workspaces_without_corresponding_versions_data))) + +i = 0 +for destination_upa in destination_upa_from_source_upa_dict : + if i < 5: + print(destination_upa + "\t" + destination_upa_from_source_upa_dict[destination_upa]) + else: + break + i += 1 + +print("returned_copied_ws_narratives_dict examples:") +i = 0 +for copied_narrative_ws_id in returned_copied_ws_narratives_dict: + if i < 5: + print(str(copied_narrative_ws_id) + "\t" + str(returned_copied_ws_narratives_dict[copied_narrative_ws_id])) + else: + break + i += 1 + +# loop through each of the sources, get all versions timestamps +# THen determine which version of the source for each distination copy event + + + + +################## +# +# Still need to do determination of which source narrative version. +# +# Need to do a reverse lookup object source_narrative_id -> [list of destination narratives] +# +##################### diff --git a/source/daily_cron_jobs/methods_upload_blobstore_details.py b/source/daily_cron_jobs/methods_upload_blobstore_details.py new file mode 100644 index 0000000..0ea3f5a --- /dev/null +++ b/source/daily_cron_jobs/methods_upload_blobstore_details.py @@ -0,0 +1,623 @@ +from pymongo import MongoClient +from pymongo import ReadPreference +from biokbase.workspace.client import Workspace +#from installed_clients.AbstractHandleClient import AbstractHandle as HandleService +from biokbase.service.Client import Client as ServiceClient +import json as _json +import os +import mysql.connector as mysql +import methods_upload_user_stats +import requests +#import time +#from splitting import split_sequence +#from datetime import date +#from datetime +import datetime, time + +requests.packages.urllib3.disable_warnings() + +mongoDB_metrics_connection = os.environ["MONGO_PATH"] + +ws_url = os.environ["WS_URL"] +ws_user_token = os.environ["METRICS_WS_USER_TOKEN"] +to_workspace = os.environ["WRK_SUFFIX"] + +to_blobstore = os.environ["BLOBSTORE_SUFFIX"] +to_handle_db = os.environ["HANDLE_DB_SUFFIX"] + +client = MongoClient(mongoDB_metrics_connection + to_workspace) +db = client.workspace +handle_service_url = "https://kbase.us/services/handle_service" + +metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"] +sql_host = os.environ["SQL_HOST"] +query_on = os.environ["QUERY_ON"] + +yesterday = datetime.date.today() - datetime.timedelta(days=1) +start_time = time.time() + +################# +# +# Creates lookup for size by blobstore_id +# +################ +def make_blobstore_lookup (): + client_blobstore = 
MongoClient(mongoDB_metrics_connection + to_blobstore) + db_blobstore = client_blobstore.blobstore + + blobstore_nodes_size_lookup = dict() + + nodes_query = db_blobstore.nodes.find({},{"_id": 0, "id": 1, "size": 1}) + for record in nodes_query: + blobstore_node_id = record["id"] + size = record["size"] + blobstore_nodes_size_lookup[blobstore_node_id] = size + return blobstore_nodes_size_lookup + +################### +# +# Create a lookup for blobstore_id by handle_id +# +################### +def make_handle_id_lookup (): + client_handle_db = MongoClient(mongoDB_metrics_connection + to_handle_db) + db_handle = client_handle_db.handle_db + + handle_id_lookup = dict() + + handles_query = db_handle.handle.find({},{"_id": 0, "id": 1, "hid": 1}) + for record in handles_query: + blobstore_node_id = record["id"] + handle = record["hid"] + handle_id_lookup[handle] = blobstore_node_id + return handle_id_lookup + +#################### +# +# GETS EXISTING BLOBSTORE RECORDS to see if new insert needs to be done +# +################### +def get_existing_blobstore_details_records (db_connection): + existing_bs_details_cursor = db_connection.cursor(buffered=True) + existing_bs_details_statement = ("select blobstore_id, ws_obj_id, core_ws_obj_id, is_deleted from blobstore_detail") + existing_bs_details_cursor.execute(existing_bs_details_statement) + existing_records_set = set() + existing_deleted_blobstore_details_set = set() + for (blobstore_id, ws_obj_id, core_ws_obj_id, is_deleted) in existing_bs_details_cursor: + lookup_key = blobstore_id + "::" + ws_obj_id + existing_records_set.add(lookup_key) + if is_deleted == 1: + existing_deleted_blobstore_details_set.add(core_ws_obj_id) + return (existing_records_set, existing_deleted_blobstore_details_set) + +################# +# +# Lookup for the first save date for each blobstore id +# +################ +def get_existing_bsid_first_save_date (db_connection): + bsid_first_save_date_cursor = db_connection.cursor(buffered=True) + bsid_first_save_date_statement = ("select blobstore_id, min(save_date) as first_save_date from blobstore_detail group by blobstore_id") + bsid_first_save_date_cursor.execute(bsid_first_save_date_statement) + bsid_first_save_date_dict = {} + for (blobstore_id, first_save_date) in bsid_first_save_date_cursor: + bsid_first_save_date_dict[blobstore_id] = first_save_date + return bsid_first_save_date_dict + +################ +# +# Populates user_info table, this gets triggered when an user is not in the user_info table. +# This insures the foreign key does not fail. +# +################ +def populate_user_info_table(): + print("Blobstore refreshing of User Stats Upload (UTC)") + user_stats_dict = methods_upload_user_stats.get_user_info_from_auth2() + user_stats_dict = methods_upload_user_stats.get_internal_users(user_stats_dict) + user_stats_dict = methods_upload_user_stats.get_user_orgs_count(user_stats_dict) + user_stats_dict = methods_upload_user_stats.get_user_narrative_stats(user_stats_dict) + #user_stats_dict = methods_upload_user_stats.get_institution_and_country(user_stats_dict) + user_stats_dict = methods_upload_user_stats.get_profile_info(user_stats_dict) + print("--- gather data %s seconds ---" % (time.time() - start_time)) + methods_upload_user_stats.upload_user_data(user_stats_dict) + print("Refresh of Upload user stats completed") + +############## +# +# Creates set of usernames in user_info. 
this is used to make sure the username that is seen in the
+# wsObjVersion records is already in the user_info table
+#
+#############
+def get_usernames (db_connection):
+    usernames_cursor = db_connection.cursor(buffered=True)
+    usernames_statement = ("select username, user_id from metrics.user_info")
+    usernames_cursor.execute(usernames_statement)
+    temp_usernames_set = set()
+    for (username, user_id) in usernames_cursor:
+        temp_usernames_set.add(username)
+    print("Usernames length : " + str(len(temp_usernames_set)))
+    return temp_usernames_set
+
+#############
+#
+# creates set of deleted objects in the workspace collection
+#
+############
+#def get_deleted_workspace_objects_set():
+#    deleted_objects = set()
+#    ws_obj_deleted_cursor = db.workspaceObjects.find({"del":True},{"_id":0, "ws": 1,"id":1})
+#    for ws_obj_deleted in ws_obj_deleted_cursor:
+#        deleted_temp_ws_id = ws_obj_deleted["ws"]
+#        deleted_obj_id = ws_obj_deleted["id"]
+#        deleted_ws_obj_id = str(deleted_temp_ws_id) + "/" + str(deleted_obj_id)
+#        deleted_objects.add(deleted_ws_obj_id)
+#    return deleted_objects
+
+##############
+#
+# creates sets of deleted ws_obj_ids, and of the deleted objects that also have a handle
+#
+#############
+def get_deleted_workspace_objects_set():
+    deleted_workspace_objects_set = set()
+    ws_obj_deleted_cursor = db.workspaceObjects.find({"del":True},{"_id":0, "ws": 1,"id":1})
+    for ws_obj_deleted in ws_obj_deleted_cursor:
+        deleted_temp_ws_id = ws_obj_deleted["ws"]
+        deleted_obj_id = ws_obj_deleted["id"]
+        deleted_ws_obj_id = str(deleted_temp_ws_id) + "/" + str(deleted_obj_id)
+        deleted_workspace_objects_set.add(deleted_ws_obj_id)
+
+    deleted_objects_with_handles_set = set()
+    ws_obj_vers_cursor = db.workspaceObjVersions.find(
+        {
+            "extids.handle" : { "$exists": True },
+        },
+        {
+            "ws": 1,
+            "id": 1,
+            "_id": 0,
+        },
+        no_cursor_timeout=True
+    )
+    for ws_obj_ver in ws_obj_vers_cursor:
+        object_id = str(ws_obj_ver["ws"]) + "/" + str(ws_obj_ver["id"])
+        if object_id in deleted_workspace_objects_set:
+            deleted_objects_with_handles_set.add(object_id)
+    return (deleted_workspace_objects_set,deleted_objects_with_handles_set)
+
+#############
+#
+# creates set of deleted objects already in the blobstore_detail MySQL table
+#
+############
+def get_existing_blobstore_detail_ws_objects (db_connection):
+    deleted_ws_obj_cursor = db_connection.cursor(buffered=True)
+    deleted_ws_obj_statement = ("select core_ws_obj_id, is_deleted from metrics.blobstore_detail where is_deleted = 1")
+    deleted_ws_obj_cursor.execute(deleted_ws_obj_statement)
+    existing_bs_deleted_objects_set = set()
+    for (core_ws_obj_id, is_deleted) in deleted_ws_obj_cursor:
+        existing_bs_deleted_objects_set.add(core_ws_obj_id)
+    print("Existing Blobstore deleted ws_obj set length : " + str(len(existing_bs_deleted_objects_set)))
+    return existing_bs_deleted_objects_set
+
+
+############
+#
+# Gets all the blobstore information and uploads it into the blobstore_detail table
+# Defaults to the previous full day if no start_date and end_date are passed
+# Allows for backfilling records if specific dates are chosen
+# Note this contains logic to ensure all saving users are present in user_info
+# It will not duplicate existing records (so it is safe to use a date range that was previously processed)
+# It will always figure out what was the original saver object for a blobstore based on the records present
+# in the upload and existing records in the blobstore details table
+############
+def upload_blobstore_details_data(start_date, end_date):
+    """
+    Upload blobstore detail data
+    """
+    # object_id -> {handle=>handle, node=node, 
type=object_type, savedate=> sd} + objects_with_problem_nodes_with_no_size = dict() + objects_with_problem_handles_with_no_nodes = dict() + + running_size_total = 0 + + deleted_object_with_data_found_count = 0 + deleted_object_without_data_found_count = 0 + deleted_object_without_data_found_set = set() + + #exit() + + # blobstore_id => {ws_obj_id => (save_date, saver)} + blobstore_object_results = dict() + + # blobstore_id =>{first_saver_ws_obj_id => blah, + # first_save_date = date} + #blobstore_id_first_saver = dict() + + #ws_ids = [146324] # small + #ws_ids = [28129] # fungal phytosome s + #ws_ids = [146324,28129] # fungal phytosome and small ws, took 203 mins + #ws_ids = [19217] # refseq reference + +# #for ws_id in ws_ids: +# deleted_objects = set() +# ws_obj_deleted_cursor = db.workspaceObjects.find({"del":True},{"_id":0, "ws": 1,"id":1}) +# for ws_obj_deleted in ws_obj_deleted_cursor: +# deleted_temp_ws_id = ws_obj_deleted["ws"] +# deleted_obj_id = ws_obj_deleted["id"] +# deleted_ws_obj_id = str(deleted_temp_ws_id) + "/" + str(deleted_obj_id) +# deleted_objects.add(deleted_ws_obj_id) + +# deleted_workspace_objects = get_deleted_workspace_objects_set() + (deleted_workspace_objects, deleted_objects_with_handles_set) = get_deleted_workspace_objects_set() + + print("TOTAL DELETED OBJECT LENGTH: " + str(len(deleted_workspace_objects))) + print("TOTAL DELETED OBJECT LENGTH: " + str(len(deleted_objects_with_handles_set))) + print("--- total time for the deleted objects lookup %s seconds ---" % (time.time() - start_time)) + + ws_obj_vers_cursor = db.workspaceObjVersions.find( + {#"ws":312, + "extids.handle" : { "$exists": True }, + "savedate": {"$gt": start_date, "$lt": end_date}, + }, + { + "type": 1, + "ws": 1, + "id": 1, + "ver": 1, + "savedate": 1, + "savedby": 1, + "extids": 1, + "_id": 0, + }, + no_cursor_timeout=True + ) + i = 0 + ws_obj_info = dict() + deleted_ext_ids_counter = 0 + + for ws_obj_ver in ws_obj_vers_cursor: + is_deleted = 0 + object_type_full = ws_obj_ver["type"] + (object_type, object_spec_version) = object_type_full.split("-") + #if (object_type != "KBaseNarrative.Narrative" and object_type != "KBaseReport.Report"): + ws_id = ws_obj_ver["ws"] + obj_id = ws_obj_ver["id"] + temp_ws_obj_id = str(ws_id) + "/" + str(obj_id) + if temp_ws_obj_id in deleted_workspace_objects: + deleted_ext_ids_counter += 1 + is_deleted = 1 + # continue + obj_ver = ws_obj_ver["ver"] + obj_save_date = ws_obj_ver["savedate"] + savedby = ws_obj_ver["savedby"] + extids = ws_obj_ver["extids"] + handles = extids["handle"] +# for handle in handles: +# handles_set.add(handle) +# obj_copied = 0 + full_obj_id = str(ws_id) + "/" + str(obj_id) + "/" + str(obj_ver) +# print("Full obj id : " + full_obj_id) +# print("Object Type : " + object_type_full) +# if (object_type != "KBaseNarrative.Narrative" and object_type != "KBaseReport.Report"): +# if (object_type == "KBaseNarrative.Narrative" or object_type == "KBaseReport.Report"): + + ws_obj_info[full_obj_id] = {"save_date" : obj_save_date, + "savedby" : savedby, + "obj_type" : object_type_full, + "handles" : handles, + "is_deleted" : is_deleted} + + print("--- total time for the ws_object_version objects query %s seconds ---" % (time.time() - start_time)) + + ########################################################################## + print("BLOBSTORE LOOKUP:") + blobstore_lookup = make_blobstore_lookup() +# test_counter = 0 +# for temp_key in blobstore_lookup: +# if test_counter < 10: +# print("ID: " + str(temp_key) + " ::: size: " + 
str(blobstore_lookup[temp_key])) +# else: +# break +# test_counter = test_counter + 1 + print("Total BLOBSTORE Lookuplength: " + str(len(blobstore_lookup))) + + print("--- total time for the blobstore size lookup creation %s seconds ---" % (time.time() - start_time)) + + handle_id_lookup = make_handle_id_lookup() +# test_counter = 0 +# for temp_key in handle_id_lookup: +# if test_counter < 10: +# print("ID: " + str(temp_key) + " ::: blobstore_id: " + str(handle_id_lookup[temp_key])) +# else: +# break +# test_counter = test_counter + 1 + print("Total HANDLE ID lookup length: " + str(len(handle_id_lookup))) + + print("--- total time for the blobstore size lookup creation %s seconds ---" % (time.time() - start_time)) +############################################## + + for full_obj_id in ws_obj_info: +# print("ws_obj_info[full_obj_id][handles] : " + str(ws_obj_info[full_obj_id]["handles"])) + for handle in ws_obj_info[full_obj_id]["handles"]: + blobstore_id = None + (kbh_prefix, str_handle_id) = handle.split("_") + if int(str_handle_id) in handle_id_lookup: + blobstore_id = handle_id_lookup[int(str_handle_id)] + else: + objects_with_problem_handles_with_no_nodes[full_obj_id] = ws_obj_info[full_obj_id] + if ws_obj_info[full_obj_id]["is_deleted"] == 1: + deleted_object_without_data_found_count += 1 + (temp_core_object_id, temp_ver) = full_obj_id.rsplit("/",1) + deleted_object_without_data_found_set.add(temp_core_object_id) + + if blobstore_id and blobstore_id in blobstore_lookup: + if blobstore_id not in blobstore_object_results: + blobstore_object_results[blobstore_id] = dict() + blobstore_object_results[blobstore_id][full_obj_id] = (ws_obj_info[full_obj_id]["save_date"], + ws_obj_info[full_obj_id]["savedby"]) +# print("Blobstore lookup file_size : " + str(blobstore_lookup[blobstore_id])) + if ws_obj_info[full_obj_id]["is_deleted"] == 1: + deleted_object_with_data_found_count += 1 + file_size = blobstore_lookup[blobstore_id] + running_size_total = running_size_total + file_size + else: +# print("HUGE PROBLEM: obj_id : " + full_obj_id + " blobstore_id: " + str(blobstore_id) + " IS NOT IN THE LOOKUP") +# del blobstore_object_results[blobstore_id] + objects_with_problem_nodes_with_no_size[full_obj_id] = ws_obj_info[full_obj_id] + if ws_obj_info[full_obj_id]["is_deleted"] == 1: + deleted_object_without_data_found_count += 1 + (temp_core_object_id, temp_ver) = full_obj_id.rsplit("/",1) + deleted_object_without_data_found_set.add(temp_core_object_id) + + db_connection = mysql.connect( + host=sql_host, user="metrics", passwd=metrics_mysql_password, database="metrics" + ) + cursor = db_connection.cursor() + query = "use " + query_on + cursor.execute(query) +# update_zero_orig_saver_cursor = db_connection.cursor(prepared=True) +# blobstore_detail_zero_orig_saver_update_statement = ( +# "update metrics.blobstore_detail " +# "set orig_saver = 0 where blobstore_id = %s;" +# ) + +# update_cursor = db_connection.cursor(prepared=True) +# blobstore_detail_update_statement = ( +# "update metrics.blobstore_detail " +# "set orig_saver = 1 where blobstore_id = %s and ws_obj_id = %s;" +# ) + + bsid_first_save_date_dict = get_existing_bsid_first_save_date(db_connection) + existing_blobstore_records, existing_deleted_blobstore_details_set = get_existing_blobstore_details_records(db_connection) + usernames_set = get_usernames(db_connection) + print("Usernames length = " + str(len(usernames_set))) + db_connection.close() + + insert_count = 0 + needed_existing_update_orig_saver_count = 0 + 
skip_insert_because_exists_count = 0 + +# loop over all the blobstore details and pull together all the needed information and do the inserts + for blobstore_id in blobstore_object_results: + db_connection = mysql.connect( + host=sql_host, user="metrics", passwd=metrics_mysql_password, database="metrics" + ) + cursor = db_connection.cursor() + query = "use " + query_on + cursor.execute(query) + bsid_new_first_save_date = None + bsid_new_first_save_date_ws_obj_id = None + existing_bsid_first_save_date = None + insert_cursor = db_connection.cursor(prepared=True) + blobstore_detail_insert_statement = ( + "insert into metrics.blobstore_detail " + "(blobstore_id, ws_obj_id, save_date, ws_id, size, saver_username, orig_saver, object_type, core_ws_obj_id) " + "values(%s, %s, %s, %s, %s, %s, 0, %s, %s)" + ) + + update_zero_orig_saver_cursor = db_connection.cursor(prepared=True) + blobstore_detail_zero_orig_saver_update_statement = ( + "update metrics.blobstore_detail " + "set orig_saver = 0 where blobstore_id = %s;" + ) + + update_cursor = db_connection.cursor(prepared=True) + blobstore_detail_update_statement = ( + "update metrics.blobstore_detail " + "set orig_saver = 1 where blobstore_id = %s and ws_obj_id = %s;" + ) + + had_a_reference_ws = 0 + if blobstore_id in bsid_first_save_date_dict: + existing_bsid_first_save_date = bsid_first_save_date_dict[blobstore_id] + for full_ws_obj_id in blobstore_object_results[blobstore_id]: + (ws_id, obj_id, version_number) = full_ws_obj_id.split("/") + save_date = blobstore_object_results[blobstore_id][full_ws_obj_id][0] + saver = blobstore_object_results[blobstore_id][full_ws_obj_id][1] + + lookup_key = blobstore_id + "::" + full_ws_obj_id + if lookup_key in existing_blobstore_records: + skip_insert_because_exists_count += 1 + continue + + # TO GET ONLY REFERENCE GENOME WORKSPACES +# if int(ws_id) in (19217, 16026, 28129, 80490): +# had_a_reference_ws = 1 + # DO INSERT SET ORIG_SAVER = 0 + + if saver not in usernames_set: + print("Usernames pre length = " + str(len(usernames_set))) + populate_user_info_table() + usernames_set = get_usernames(db_connection) + print("Usernames post length = " + str(len(usernames_set))) + + size = blobstore_lookup[blobstore_id] + object_type = ws_obj_info[full_ws_obj_id]["obj_type"] + temp = full_ws_obj_id.split("/") + core_ws_obj_id = "/".join(temp[:-1]) + + input_vals = ( + blobstore_id, + full_ws_obj_id, + save_date, + ws_id, + size, + saver, + object_type, + core_ws_obj_id, + ) + insert_cursor.execute(blobstore_detail_insert_statement, input_vals) + insert_count += 1 + + # record is fresh and needs to be inserted. + #DO SAVE DATE LOGIC LOOKING FOR MIN_DATE + if (existing_bsid_first_save_date and save_date < existing_bsid_first_save_date): + bsid_new_first_save_date = save_date + bsid_new_first_save_date_ws_obj_id = full_ws_obj_id + if existing_bsid_first_save_date is None: + if (bsid_new_first_save_date is None or save_date < bsid_new_first_save_date): + bsid_new_first_save_date = save_date + bsid_new_first_save_date_ws_obj_id = full_ws_obj_id + + +# if had_a_reference_ws == 1: + # AFTER ALL THE INSERTS DONE (update the record that is now the min_date, potentially change min_date from an existing or-ig_saver + if existing_bsid_first_save_date is not None and bsid_new_first_save_date is not None: + #meand a new seen record has lower save date than an existing one. Should not occur. 
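+            # Added note: rows inserted above start with orig_saver = 0, so when this happens the
+            # original-saver flag is re-pointed: every existing row for the blobstore_id is reset to
+            # orig_saver = 0 here, and the ws_obj_id with the new minimum save_date is then marked
+            # orig_saver = 1 in the block just below.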
+ update_vals = (blobstore_id,) + update_zero_orig_saver_cursor.execute(blobstore_detail_zero_orig_saver_update_statement, update_vals) + needed_existing_update_orig_saver_count += 1 + if bsid_new_first_save_date_ws_obj_id is not None: + update_cursor = db_connection.cursor(prepared=True) + blobstore_detail_update_statement = ( + "update metrics.blobstore_detail " + "set orig_saver = 1 where blobstore_id = %s and ws_obj_id = %s;" + ) + update_vals = (blobstore_id, bsid_new_first_save_date_ws_obj_id) + update_cursor.execute(blobstore_detail_update_statement, update_vals) + insert_cursor.close() + db_connection.commit() + db_connection.close() + + # RESOLVE THE MISSING DELETED OBJECTS + deleted_objects_to_update_set = deleted_objects_with_handles_set.difference(existing_deleted_blobstore_details_set) + if len(deleted_objects_to_update_set) > 0: + print("Length of core obj ids that need to be marked as deleted : " + str(len(deleted_objects_to_update_set))) + print("length of deleted_object_without_data_found_set : " + str(len(deleted_object_without_data_found_set))) + db_connection = mysql.connect( + host=sql_host, user="metrics", passwd=metrics_mysql_password, database="metrics" + ) + cursor = db_connection.cursor() + query = "use " + query_on + cursor.execute(query) + + update_deleted_objects_cursor = db_connection.cursor() + update_deleted_objects_statement = ("update metrics.blobstore_detail set is_deleted = 1 where core_ws_obj_id = %s;") + for core_deleted_obj_id in deleted_objects_to_update_set: + update_deleted_objects_vals = (core_deleted_obj_id,) + update_deleted_objects_cursor.execute(update_deleted_objects_statement, update_deleted_objects_vals) + update_deleted_objects_cursor.close + db_connection.commit() + db_connection.close() + + # UNDELETE THE OBJECTS THAT HAVE BEEN UNDELETED + undeleted_objects_to_update_set = existing_deleted_blobstore_details_set.difference(deleted_objects_with_handles_set) + if len(undeleted_objects_to_update_set) > 0: + print("Length of core obj ids that need to be marked as undeleted : " + str(len(undeleted_objects_to_update_set))) + db_connection = mysql.connect( + host=sql_host, user="metrics", passwd=metrics_mysql_password, database="metrics" + ) + cursor = db_connection.cursor() + query = "use " + query_on + cursor.execute(query) + + update_undeleted_objects_cursor = db_connection.cursor() + update_undeleted_objects_statement = ("update metrics.blobstore_detail set is_deleted = 0 where core_ws_obj_id = %s;") + for core_undeleted_obj_id in undeleted_objects_to_update_set: + update_undeleted_objects_vals = (core_undeleted_obj_id,) + update_undeleted_objects_cursor.execute(update_undeleted_objects_statement, update_undeleted_objects_vals) + update_undeleted_objects_cursor.close + db_connection.commit() + db_connection.close() + + + #print("objects_with_problem_nodes_with_no_size : " + str(objects_with_problem_nodes_with_no_size)) + print("TOTAL objects_with_problem_nodes_with_no_size : " + str(len(objects_with_problem_nodes_with_no_size))) + + #print("objects_with_problem_handles_with_no_nodes : " + str(objects_with_problem_handles_with_no_nodes)) + print("TOTAL objects_with_problem_handles_with_no_nodes : " + str(len(objects_with_problem_handles_with_no_nodes))) + + print("deleted_object_with_data_found_count :" + str(deleted_object_with_data_found_count)) + print("deleted_object_without_data_found_count :" + str(deleted_object_without_data_found_count)) + +# print("blobstore_object_results : " + str(blobstore_object_results)) +# for blobstore_id in 
blobstore_object_results: +# if len( blobstore_object_results[blobstore_id]) > 5: +# print("blobstore ID : " + str(blobstore_id)) +# print(str(blobstore_object_results[blobstore_id])) + print("blobstore_object_results length : " + str(len(blobstore_object_results))) + print("RUNNING TOTAL SIZE : " + str(running_size_total)) + + obj_id_set = set() + for blobstore_id in blobstore_object_results : + for obj_id in blobstore_object_results[blobstore_id]: + obj_id_set.add(obj_id) + print("Total number of objects with handles that could be fully determined : " + str(len(obj_id_set))) + + print("Total ext_ids objects that were deleted : " + str(deleted_ext_ids_counter)) + + #print("blobstore_object_results : " + str(blobstore_object_results)) + + + print("Insert Count = " + str(insert_count)) + print("needed_existing_update_orig_saver_count = " + str(needed_existing_update_orig_saver_count)) + print("skip_insert_because_exists_count = " + str(skip_insert_because_exists_count)) + + print("--- total seconds %s seconds ---" % (time.time() - start_time)) + #db_connection.commit() + #db_connection.cLOSE() + + #################### + # END upload_blobstore_details_data + ################### + + + +##################### +# +# Essentially the main caller program that deals with start and end date information +# Whether there were passed values or the defaut of the previous full day +# +#################### +def process_blobstore_details_data( + start_date=datetime.datetime.combine(yesterday, datetime.datetime.min.time()), + end_date=datetime.datetime.combine(yesterday, datetime.datetime.max.time()), + ): + # get mongo set up + # client_blobstore = MongoClient(mongoDB_metricsro_connection + to_blobstore) + client_blobstore = MongoClient(mongoDB_metrics_connection + to_blobstore) + db_blobstore = client_blobstore.blobstore + + print("############################################") + print("START TIME (UTC): " + str(datetime.datetime.utcnow())) + start_time = time.time() + + # From str to datetime, defaults to zero time. 
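+    # Added note: string arguments of the form "YYYY-MM-DD" are widened to whole days below --
+    # the start date gets 00:00:00 and the end date gets 23:59:59.999999 -- so a backfill call such as
+    #     process_blobstore_details_data("2024-09-07", "2024-09-28")
+    # picks up every ws_obj_version saved in that window, while calling it with no arguments
+    # processes only the previous full day (the defaults in the signature above).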
+ if type(start_date) == str: + start_date_partial = datetime.datetime.strptime(start_date, "%Y-%m-%d") + start_date = datetime.datetime.combine( + start_date_partial, datetime.datetime.min.time() + ) + end_date_partial = datetime.datetime.strptime(end_date, "%Y-%m-%d") + end_date = datetime.datetime.combine( + end_date_partial, datetime.datetime.max.time() + ) + + print("Start date : " + str(start_date)) + print("End date : " + str(end_date)) + + upload_blobstore_details_data(start_date, end_date) + print("############################################") +#exit() diff --git a/source/daily_cron_jobs/upload_blobstore_details.py b/source/daily_cron_jobs/upload_blobstore_details.py new file mode 100644 index 0000000..a30d1c4 --- /dev/null +++ b/source/daily_cron_jobs/upload_blobstore_details.py @@ -0,0 +1,28 @@ +# UploadBlobstoreDetails +# +import methods_upload_blobstore_details +import time +import datetime + +yesterday = datetime.date.today() - datetime.timedelta(days=1) +print("############################################") +print("############################################") +print("############################################") +print("Blobstore Detais Upload (UTC): " + str(datetime.datetime.utcnow())) +print("START TIME (UTC): " + str(datetime.datetime.utcnow())) +start_time = time.time() + + +start_time = time.time() +start_date = "2024-09-07" +end_date = "2024-09-28" +methods_upload_blobstore_details.process_blobstore_details_data(start_date,end_date) +#methods_upload_blobstore_details.process_blobstore_details_data() +#print("Uploading blobstore details took ", time.time() - start_time, " seconds to run") + + +start_date=datetime.datetime.combine(yesterday, datetime.datetime.min.time()) +end_date=datetime.datetime.combine(yesterday, datetime.datetime.max.time()) + +print("Start date: " + str(start_date)) +print("End date: " + str(end_date)) From 3a6d0859dab9f14bd9200b4d3c23e0942b76b7fe Mon Sep 17 00:00:00 2001 From: Jason Baumohl Date: Fri, 4 Oct 2024 02:56:29 +0000 Subject: [PATCH 04/11] added blobstore detail information to user_super_summary --- source/custom_scripts/dump_query_results.py | 4 +++- source/daily_cron_jobs/make_reporting_tables.py | 6 ++++++ sql_create_statements/sql_reporting_views_and_tables.sql | 8 +++++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/source/custom_scripts/dump_query_results.py b/source/custom_scripts/dump_query_results.py index ab8486c..ba42ea1 100644 --- a/source/custom_scripts/dump_query_results.py +++ b/source/custom_scripts/dump_query_results.py @@ -42,7 +42,9 @@ def dump_query_results(): "last_narrative_modified_date\ttotal_narrative_objects_count\ttop_lvl_narrative_objects_count\ttotal_narrative_objects_size\t" "top_lvl_narrative_objects_size\ttotal_narrative_count\ttotal_public_narrative_count\tdistinct_static_narratives_count\t" "static_narratives_created_count\ttotal_visible_app_cells\ttotal_code_cells_count\tfirst_file_date\tlast_file_date\t" - "total_file_sizes_MB\ttotal_file_count\tmost_used_app\tdistinct_apps_used\ttotal_apps_run_all_time\ttotal_apps_run_last365\t" + "total_file_sizes_MB\ttotal_file_count\tblobstore_orig_saver_count\tblobstore_non_orig_saver_count\t" + "blobstore_orig_saver_size_GB\tblobstore_non_orig_saver_size_GB\t" + "most_used_app\tdistinct_apps_used\ttotal_apps_run_all_time\ttotal_apps_run_last365\t" "total_apps_run_last90\ttotal_apps_run_last30\ttotal_app_errors_all_time\tfirst_app_run\tlast_app_run\ttotal_run_time_hours\t" 
"total_queue_time_hours\ttotal_CPU_hours\tsession_count_all_time\tsession_count_last_year\tsession_count_last_90\tsession_count_last_30" ) diff --git a/source/daily_cron_jobs/make_reporting_tables.py b/source/daily_cron_jobs/make_reporting_tables.py index 69f8024..8bade7b 100644 --- a/source/daily_cron_jobs/make_reporting_tables.py +++ b/source/daily_cron_jobs/make_reporting_tables.py @@ -302,6 +302,10 @@ def make_reporting_tables(): "uns.total_visible_app_cells, uns.total_code_cells_count, " "bus.first_file_date, bus.last_file_date, " "bus.total_file_sizes_MB, bus.total_file_count, " + "bdu.orig_saver_count as blobstore_orig_saver_count, " + "bdu.non_orig_saver_count as blobstore_non_orig_saver_count, " + "bdu.orig_saver_size_GB as blobstore_orig_saver_size_GB, " + "bdu.non_orig_saver_size_GB as blobstore_non_orig_saver_size_GB, " "umua.mu_func_name as most_used_app, " "udauc.distinct_apps_used, " "uapc.total_apps_run_all_time, uapc.total_apps_run_last365, " @@ -337,6 +341,8 @@ def make_reporting_tables(): "on uip.username = usc90.username " "left outer join metrics.hv_user_session_count_last_30 usc30 " "on uip.username = usc30.username " + "left outer join metrics.blobstore_detail_by_user bdu " + "on uip.username = bdu.saver_username " "where uip.exclude != 1 ") cursor.execute(user_super_summary_create_statement) print("user_super_summary_create_statement created") diff --git a/sql_create_statements/sql_reporting_views_and_tables.sql b/sql_create_statements/sql_reporting_views_and_tables.sql index 1d5c016..294cf9c 100644 --- a/sql_create_statements/sql_reporting_views_and_tables.sql +++ b/sql_create_statements/sql_reporting_views_and_tables.sql @@ -1475,7 +1475,11 @@ uns.total_narrative_count, uns.total_public_narrative_count, uns.distinct_static_narratives_count, uns.static_narratives_created_count, uns.total_visible_app_cells, uns.total_code_cells_count, bus.first_file_date, bus.last_file_date, -bus.total_file_sizes_MB, bus.total_file_count, +bus.total_file_sizes_MB, bus.total_file_count, +bdu.orig_saver_count as blobstore_orig_saver_count, +bdu.non_orig_saver_count as blobstore_non_orig_saver_count, +bdu.orig_saver_size_GB as blobstore_orig_saver_size_GB, +bdu.non_orig_saver_size_GB as blobstore_non_orig_saver_size_GB, umua.mu_func_name as most_used_app, udauc.distinct_apps_used, uapc.total_apps_run_all_time, uapc.total_apps_run_last365, @@ -1511,6 +1515,8 @@ left outer join metrics.hv_user_session_count_last_90 usc90 on uip.username = usc90.username left outer join metrics.hv_user_session_count_last_30 usc30 on uip.username = usc30.username +left outer join metrics.blobstore_detail_by_user bdu +on uip.username = bdu.saver_username where uip.exclude != 1; # END OF USER_SUPER_SUMMARY From 99e219639d0f9105bfefbf4104e57ac246bae2b9 Mon Sep 17 00:00:00 2001 From: Jason Baumohl Date: Fri, 4 Oct 2024 03:00:52 +0000 Subject: [PATCH 05/11] added blobstore detail information to user_super_summary --- .../daily_cron_jobs/make_reporting_tables.py | 119 +++++++++--------- 1 file changed, 58 insertions(+), 61 deletions(-) diff --git a/source/daily_cron_jobs/make_reporting_tables.py b/source/daily_cron_jobs/make_reporting_tables.py index 8bade7b..cf1a0ed 100644 --- a/source/daily_cron_jobs/make_reporting_tables.py +++ b/source/daily_cron_jobs/make_reporting_tables.py @@ -185,6 +185,64 @@ def make_reporting_tables(): cursor.execute(narrative_app_flows_create_statement) print("narrative_app_flows created") + # Blobstroe detial related tables + blobstore_detail_by_ws_create_statement = ( + "create or 
replace table blobstore_detail_by_ws as " + "(select in_q.ws_id, sum(in_q.orig_saver_count) as orig_saver_count, " + "sum(in_q.non_orig_saver_count) as non_orig_saver_count, " + "sum(in_q.orig_saver_size_GB) as orig_saver_size_GB, " + "sum(in_q.non_orig_saver_size_GB) as non_orig_saver_size_GB, " + "sum(in_q.total_blobstore_size_GB) as total_blobstore_size_GB " + "from (" + "select ws_id, DATE_FORMAT(`save_date`,'%Y-%m') as month, " + "sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, " + "sum(orig_saver * size)/1000000000 as orig_saver_size_GB, " + "0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, " + "sum(size)/1000000000 as total_blobstore_size_GB " + "from blobstore_detail bd " + "group by ws_id, month) in_q " + "group by ws_id ) ") + cursor.execute(blobstore_detail_by_ws_create_statement) + print("blobstore_detail_by_ws_create_statement created") + + blobstore_detail_by_user_monthly_create_statement = ( + "create or replace table blobstore_detail_by_user_monthly as " + "(select saver_username, DATE_FORMAT(`save_date`,'%Y-%m') as month, " + "sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, " + "sum(orig_saver * size)/1000000000 as orig_saver_size_GB, " + "0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, " + "sum(size)/1000000000 as total_blobstore_size_GB " + "from blobstore_detail bd " + "group by saver_username, month) ") + cursor.execute(blobstore_detail_by_user_monthly_create_statement) + print("blobstore_detail_by_user_monthly_create_statement created") + + blobstore_detail_by_user_create_statement = ( + "create or replace table blobstore_detail_by_user as " + "(select saver_username, " + "sum(orig_saver_count) as orig_saver_count, sum(non_orig_saver_count) as non_orig_saver_count, " + "sum(orig_saver_size_GB) as orig_saver_size_GB, " + "sum(non_orig_saver_size_GB) as non_orig_saver_size_GB, " + "sum(total_blobstore_size_GB) as total_blobstore_size_GB " + "from blobstore_detail_by_user_monthly " + "group by saver_username) ") + cursor.execute(blobstore_detail_by_user_create_statement) + print("blobstore_detail_by_user_create_statement created") + + blobstore_detail_by_object_type_monthly_create_statement = ( + "create or replace table blobstore_detail_by_object_type_monthly as " + "(select LEFT(object_type,LOCATE('-',object_type) - 1) as object_type, " + "DATE_FORMAT(`save_date`,'%Y-%m') as month, " + "sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, " + "sum(orig_saver * size)/1000000000 as orig_saver_size_GB, " + "0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, " + "sum(size)/1000000000 as total_blobstore_size_GB " + "from blobstore_detail bd " + "group by object_type, month) ") + cursor.execute(blobstore_detail_by_object_type_monthly_create_statement) + print("blobstore_detail_by_object_type_monthly_create_statement created") + + ################## # a whole bunch of tables related user_super_summary (some helpers that can also be used stadn alone) ################## @@ -348,67 +406,6 @@ def make_reporting_tables(): print("user_super_summary_create_statement created") - # Blobstroe detial related tables - blobstore_detail_by_ws_create_statement = ( - "create or replace table blobstore_detail_by_ws as " - "(select in_q.ws_id, sum(in_q.orig_saver_count) as orig_saver_count, " - "sum(in_q.non_orig_saver_count) as non_orig_saver_count, " - "sum(in_q.orig_saver_size_GB) as orig_saver_size_GB, " - 
"sum(in_q.non_orig_saver_size_GB) as non_orig_saver_size_GB, " - "sum(in_q.total_blobstore_size_GB) as total_blobstore_size_GB " - "from (" - "select ws_id, DATE_FORMAT(`save_date`,'%Y-%m') as month, " - "sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, " - "sum(orig_saver * size)/1000000000 as orig_saver_size_GB, " - "0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, " - "sum(size)/1000000000 as total_blobstore_size_GB " - "from blobstore_detail bd " - "group by ws_id, month) in_q " - "group by ws_id ) ") - cursor.execute(blobstore_detail_by_ws_create_statement) - print("blobstore_detail_by_ws_create_statement created") - - blobstore_detail_by_user_monthly_create_statement = ( - "create or replace table blobstore_detail_by_user_monthly as " - "(select saver_username, DATE_FORMAT(`save_date`,'%Y-%m') as month, " - "sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, " - "sum(orig_saver * size)/1000000000 as orig_saver_size_GB, " - "0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, " - "sum(size)/1000000000 as total_blobstore_size_GB " - "from blobstore_detail bd " - "group by saver_username, month) ") - cursor.execute(blobstore_detail_by_user_monthly_create_statement) - print("blobstore_detail_by_user_monthly_create_statement created") - - blobstore_detail_by_user_create_statement = ( - "create or replace table blobstore_detail_by_user as " - "(select saver_username, " - "sum(orig_saver_count) as orig_saver_count, sum(non_orig_saver_count) as non_orig_saver_count, " - "sum(orig_saver_size_GB) as orig_saver_size_GB, " - "sum(non_orig_saver_size_GB) as non_orig_saver_size_GB, " - "sum(total_blobstore_size_GB) as total_blobstore_size_GB " - "from blobstore_detail_by_user_monthly " - "group by saver_username) ") - cursor.execute(blobstore_detail_by_user_create_statement) - print("blobstore_detail_by_user_create_statement created") - - blobstore_detail_by_object_type_monthly_create_statement = ( - "create or replace table blobstore_detail_by_object_type_monthly as " - "(select LEFT(object_type,LOCATE('-',object_type) - 1) as object_type, " - "DATE_FORMAT(`save_date`,'%Y-%m') as month, " - "sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, " - "sum(orig_saver * size)/1000000000 as orig_saver_size_GB, " - "0 - sum((orig_saver - 1) * size)/1000000000 as non_orig_saver_size_GB, " - "sum(size)/1000000000 as total_blobstore_size_GB " - "from blobstore_detail bd " - "group by object_type, month) ") - cursor.execute(blobstore_detail_by_object_type_monthly_create_statement) - print("blobstore_detail_by_object_type_monthly_create_statement created") - - - - - return import time From 24a9a912948a73881c1b7f23128be915908e63fe Mon Sep 17 00:00:00 2001 From: Jason Baumohl Date: Fri, 4 Oct 2024 21:44:15 +0000 Subject: [PATCH 06/11] code to explore and dump using Adams custom app category mappings --- bin/dump_weekly_app_categories_v2.sh | 3 + .../dump_weekly_app_categories_v2.py | 46 ++++ ...s_upload_all_tags_app_category_mappings.py | 229 +++++++++++++++++ ...thods_upload_function_category_mappings.py | 185 ++++++++++++++ ...methods_upload_v2_app_category_mappings.py | 237 ++++++++++++++++++ 5 files changed, 700 insertions(+) create mode 100755 bin/dump_weekly_app_categories_v2.sh create mode 100644 source/custom_scripts/dump_weekly_app_categories_v2.py create mode 100644 source/daily_cron_jobs/methods_upload_all_tags_app_category_mappings.py create mode 
100644 source/daily_cron_jobs/methods_upload_function_category_mappings.py create mode 100644 source/daily_cron_jobs/methods_upload_v2_app_category_mappings.py diff --git a/bin/dump_weekly_app_categories_v2.sh b/bin/dump_weekly_app_categories_v2.sh new file mode 100755 index 0000000..3561f07 --- /dev/null +++ b/bin/dump_weekly_app_categories_v2.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +python custom_scripts/dump_weekly_app_categories_v2.py diff --git a/source/custom_scripts/dump_weekly_app_categories_v2.py b/source/custom_scripts/dump_weekly_app_categories_v2.py new file mode 100644 index 0000000..e97ec93 --- /dev/null +++ b/source/custom_scripts/dump_weekly_app_categories_v2.py @@ -0,0 +1,46 @@ +#!/usr/local/bin/python + +import os +import mysql.connector as mysql + +metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"] +sql_host = os.environ["SQL_HOST"] +metrics = os.environ["QUERY_ON"] + + +def dump_weekly_app_categories(): + # Dumps the weekly app catagory users report used in the quarterly report + + # connect to mysql + db_connection = mysql.connect( + host=sql_host, # "mysql1", #"localhost", + user="metrics", # "root", + passwd=metrics_mysql_password, + database="metrics", # "datacamp" + ) + + cursor = db_connection.cursor() + query = "use " + metrics + cursor.execute(query) + + # CHANGE QUERY HERE + query = ("select * from metrics_reporting.app_category_unique_users_weekly_v2") + # CHANGE COLUMN HEADERS HERE TO MATCH QUERY HEADERS + print("week_run\tapp_category\tunique_users") + + cursor.execute(query) + row_values = list() + + for row_values in cursor: + temp_string = "" + for i in range(len(row_values) - 1): + if row_values[i] is not None: + temp_string += str(row_values[i]) + temp_string += "\t" + if row_values[-1] is not None: + temp_string += str(row_values[-1]) + print(temp_string) + return 1 + + +dump_weekly_app_categories() diff --git a/source/daily_cron_jobs/methods_upload_all_tags_app_category_mappings.py b/source/daily_cron_jobs/methods_upload_all_tags_app_category_mappings.py new file mode 100644 index 0000000..082adb1 --- /dev/null +++ b/source/daily_cron_jobs/methods_upload_all_tags_app_category_mappings.py @@ -0,0 +1,229 @@ +import os +import requests +import pandas as pd +import mysql.connector as mysql +import time +import datetime +from biokbase.catalog.Client import Catalog +from biokbase.narrative_method_store.client import NarrativeMethodStore + +metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"] +sql_host = os.environ["SQL_HOST"] +query_on = os.environ["QUERY_ON"] + +specific_string = "empty" + +# Configure App Data: Function + +tags = ("release","beta","dev") + +def create_function_dictionary( tag ): + # Create App Dictionary: Main function + requests.packages.urllib3.disable_warnings() + catalog = Catalog(url=os.environ["CATALOG_URL"]) + nms = NarrativeMethodStore(url=os.environ["NARRATIVE_METHOD_STORE"]) + + apps = nms.list_methods({"tag": tag}) +# apps = nms.list_methods({"tag": "release"}) +# apps = nms.list_methods({"tag": "beta"}) +# apps = nms.list_methods({"tag": "dev"}) +# apps = nms.list_methods({}) + + global specific_string + + print("APPS : "+ str(apps)) + print("============================") + + category_app_dict = dict() + #key category,=> dict("active"=>[list of apps], "inactive"=>[list_of_apps], "unknown" => [list of apps]) + + apps_with_both_list = list() + apps_with_none_list = list() + apps_with_no_cats_list = list() + + for temp_app_dict in apps: +# if temp_app_dict["id"] == "kb_uploadmethods/batch_import_assembly_from_staging": 
+#PRESENT +# if temp_app_dict["id"] == "kb_uploadmethods/batch_import_assemblies_from_staging": +#NOT PRESENT +# temp_specific_string = str(temp_app_dict) +# specific_string = temp_specific_string + "\n" + + if temp_app_dict["id"] == "view_expression_gene_table_heatmap": + print("DETAIL : " + str(temp_app_dict)) + + + app_id = temp_app_dict["id"] + app_cat_list = temp_app_dict["categories"] + + if app_id == "BBTools/RQCFilter": + print("BBTools/RQCFilter app categories : " + str(app_cat_list)) + + if app_id == "view_expression_heatmap": + print("view_expression_heatmap : " + str(app_cat_list)) + + active_type = None + active_flag_has_both = 0 + active_inactive_count = 0 + if "active" in app_cat_list: + active_inactive_count += 1 + if "inactive" in app_cat_list: + active_inactive_count += 1 + if "active" in app_cat_list and "inactive" in app_cat_list: + active_flag_has_both = 1 + print("UH OH!!!!!!!! : " + str(app_id) + " is both active and inactive") + apps_with_both_list.append(app_id) + active_type = "both" +# exit(0) +# else: + elif "active" in app_cat_list: + #CURRENTLY SET IF APP HAS BOTH IS SEEN AS ACTIVE + active_type = "active" + elif "inactive" in app_cat_list: + active_type = "inactive" + if active_type == None: + print("UH OH!!!!!!!! : " + str(app_id) + " is not active or inactive") + apps_with_none_list.append(app_id) + active_type = "none" +# exit(0) + if (len(app_cat_list) - active_inactive_count) <= 0: + apps_with_no_cats_list.append(app_id) + for category_name in app_cat_list: + if category_name == "active" or category_name == "inactive": + continue + if category_name not in category_app_dict: + category_app_dict[category_name] = dict() + if active_type not in category_app_dict[category_name]: + category_app_dict[category_name][active_type] = list() + category_app_dict[category_name][active_type].append(app_id) + + # Deal with apps that have empty category list + if len(apps_with_no_cats_list) > 0: + category_app_dict["Empty Category"] = dict() + category_app_dict["Empty Category"]["no_category"] = apps_with_no_cats_list + + print("FINAL category_app_dict : " + str(category_app_dict)) + total_count = 0 + category_count = 0 +# for temp_cat in app_dict: + for temp_cat in sorted(category_app_dict): + for active_type in category_app_dict[temp_cat]: + temp_count = len(category_app_dict[temp_cat][active_type]) + total_count += temp_count + category_count += 1 + print("Total count : " + str(total_count)) + print("category count : " + str(category_count)) +# print("specific_string : " + str(specific_string)) + print("apps_with_none_list : " + str(apps_with_none_list)) + print("apps_with_none count : " + str(len(apps_with_none_list))) + print("apps_with_both_list : " + str(apps_with_both_list)) + print("apps_with_both count : " + str(len(apps_with_both_list))) + print("apps_with_no_cats_list : " + str(apps_with_no_cats_list)) + print("apps_with_no_cats_list count : " + str(len(apps_with_no_cats_list))) + return category_app_dict + + +def update_app_category_mappings(): +# print("EXITING") +# exit() + + # connect to mysql + db_connection = mysql.connect( + host=sql_host, user="metrics", passwd=metrics_mysql_password, database="metrics" + ) + cursor = db_connection.cursor() + query = "use " + query_on + cursor.execute(query) + + + for tag in tags: + # get app catagory mappings + cat_app_dict = create_function_dictionary(tag) + + # get existing mappings + existing_records_list = list() + existing_name_cat_dict = dict() + # query = "select concat(app_name, '::', app_category, '::', 
is_active) from app_name_category_map_v3;" + query = "select app_name, app_category, is_active from app_name_category_map_v3 where tag = \'" + tag + "\';" +# input = (tag) +# cursor.execute(query, tag) + cursor.execute(query) + for row in cursor: + full_key = row[0] + "::" + row[1] + "::" + str(row[2]) + name_cat_key = row[0] + "::" + row[1] + existing_records_list.append(full_key) + existing_name_cat_dict[name_cat_key] = row[2] + existing_count = len(existing_records_list) + + # insert statement + insert_prep_cursor = db_connection.cursor(prepared=True) + + insert_statement = ( + "insert into app_name_category_map_v3 " + "(app_name, app_category, is_active, tag) " + "values(%s, %s, %s, %s);" + ) + + # update statement + update_prep_cursor = db_connection.cursor(prepared=True) + + update_statement = ( + "update app_name_category_map_v3 " + "set is_active = %s where app_name = %s and app_category = %s and tag = %s;" + ) + + # cleanup/delete statement + cleanup_prep_cursor = db_connection.cursor(prepared=True) + cleanup_statement = ( + "delete from app_name_category_map_v3 " + "where app_name = %s and app_category = %s and is_active = %s and tag = %s;" + ) + + insert_count = 0 + update_count = 0 + activity_dict = {'active': 1, 'inactive': 0, 'both': 2, "none":-1, "no_category":-2} + for category_name in cat_app_dict: + for active_type in cat_app_dict[category_name]: + for app_name in cat_app_dict[category_name][active_type]: + temp_key = app_name + "::" + category_name + "::" + str(activity_dict[active_type]) + temp_name_cat_key = app_name + "::" + category_name + if temp_name_cat_key in existing_name_cat_dict: + if activity_dict[active_type] != existing_name_cat_dict[temp_name_cat_key]: + # record needs to be updated + input = (activity_dict[active_type], app_name, category_name, tag) + update_prep_cursor.execute(update_statement, input) + update_count += 1 + if temp_key in existing_records_list: + existing_records_list.remove(temp_key) + elif temp_key in existing_records_list: + existing_records_list.remove(temp_key) + #REMOVE FOM EXISTING TO FIND LEFT OVERS + else: + # do insert + # print("INPUT : " + str(input)) + input = (app_name, category_name, activity_dict[active_type], tag) + insert_prep_cursor.execute(insert_statement, input) + insert_count += 1 + + #Clean up that no longer exist + cleanup_count = 0 + for temp_key in existing_records_list: + cleanup_count += 1 + temp_app_name, temp_cat_name, temp_is_active = temp_key.split('::') + input = (temp_app_name, temp_cat_name, int(temp_is_active), tag) + cleanup_prep_cursor.execute(cleanup_statement, input) + + db_connection.commit() + print("RESULTS FOR TAG : " + tag) + print("Existing_count : " + str(existing_count)) + print("Insert_count : " + str(insert_count)) + print("Update_count : " + str(update_count)) + print("Cleanup_count : " + str(cleanup_count)) + + + +print("############################################") +print("App Category Mapping Upload (UTC): " + str(datetime.datetime.utcnow())) +start_time = time.time() +update_app_category_mappings() +print("--- app_cat_mapping time : %s seconds ---" % (time.time() - start_time)) diff --git a/source/daily_cron_jobs/methods_upload_function_category_mappings.py b/source/daily_cron_jobs/methods_upload_function_category_mappings.py new file mode 100644 index 0000000..b8f6ffc --- /dev/null +++ b/source/daily_cron_jobs/methods_upload_function_category_mappings.py @@ -0,0 +1,185 @@ +import os +import requests +import pandas as pd +import mysql.connector as mysql +import time +import datetime 
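+# Module overview (added note): data_configure() filters out apps tagged "viewers" and strips the
+# "active" marker from each category list, create_function_dictionary() groups the released app ids
+# by category, and update_app_category_mappings() is meant to upsert that mapping into the
+# app_name_category_map table.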
+from biokbase.catalog.Client import Catalog +from biokbase.narrative_method_store.client import NarrativeMethodStore + +metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"] +sql_host = os.environ["SQL_HOST"] +query_on = os.environ["QUERY_ON"] + +specific_string = "empty" + +# Configure App Data: Function +def data_configure(app_df): + category_mess = list(app_df.categories) +# filters = ["inactive", "viewers"] +# filters = ["inactive"] + filters = ["viewers"] +# filters = [] + my_idx_list, categories, app_ids = [], [], [] + + for idx, lst in enumerate(category_mess): + if any([True for e in lst if e in filters]): + my_idx_list.append(idx) + else: + lst = [x for x in lst if "active" != x] + if lst: + categories.append(lst) + else: + my_idx_list.append(idx) + + modDF = app_df.drop(my_idx_list) + modDF.categories = categories + return modDF + + +def create_function_dictionary(): + # Create App Dictionary: Main function + requests.packages.urllib3.disable_warnings() + catalog = Catalog(url=os.environ["CATALOG_URL"]) + nms = NarrativeMethodStore(url=os.environ["NARRATIVE_METHOD_STORE"]) + + apps = nms.list_methods({"tag": "release"}) +# apps = nms.list_methods({"tag": "beta"}) +# apps = nms.list_methods({"tag": "dev"}) +# apps = nms.list_methods({}) + + global specific_string + + print("APPS : "+ str(apps)) + print("============================") + + for temp_app_dict in apps: + if temp_app_dict["id"] == "kb_uploadmethods/batch_import_assembly_from_staging": +#PRESENT +# if temp_app_dict["id"] == "kb_uploadmethods/batch_import_assemblies_from_staging": +#NOT PRESENT + temp_specific_string = str(temp_app_dict) + specific_string = temp_specific_string + "\n" + + apps_datastruc = pd.DataFrame.from_dict(apps) + ModDfApps = data_configure(apps_datastruc) + ModDfApps.drop( + [ + "app_type", + "authors", + "git_commit_hash", + "icon", + "input_types", + "module_name", + "name", + "namespace", + "output_types", + "subtitle", + "tooltip", + "ver", + ], + axis=1, + inplace=True, + ) + keys = list( + set([item for sublist in list(ModDfApps.categories) for item in sublist]) + ) + print("KEYS : " + str(keys)) + print("============================") + app_dict = {k: [] for k in keys} + + print("app_dict : " + str(app_dict)) + print("============================") + + for i in ModDfApps.index.values: + app_category_lst = ModDfApps["categories"][i] + for category in app_category_lst: + if category in app_dict.keys(): + app_dict[category].append(ModDfApps["id"][i]) + app_dict[category] = list(set(app_dict[category])) + else: + raise KeyError("{} not a KBase app category".format(category)) + print("FINAL app_dict : " + str(app_dict)) + total_count = 0 + category_count = 0 +# for temp_cat in app_dict: + for temp_cat in sorted(app_dict): + temp_count = len(app_dict[temp_cat]) + print(temp_cat + " : " + str(temp_count)) + total_count += temp_count + category_count += 1 + print("Total count : " + str(total_count)) + print("category count : " + str(category_count)) + print("specific_string : " + str(specific_string)) + return app_dict + + +def update_app_category_mappings(): + # connect to mysql + db_connection = mysql.connect( + host=sql_host, user="metrics", passwd=metrics_mysql_password, database="metrics" + ) + cursor = db_connection.cursor() + query = "use " + query_on + cursor.execute(query) + + # get existing mappings + existing_records_list = list() + query = "select concat(app_name, '::', app_category) " "from app_name_category_map" + cursor.execute(query) + for row in cursor: + 
existing_records_list.append(row[0]) + + # update all existing records to be inactive + update_query = "update app_name_category_map set is_active = False" + cursor.execute(update_query) + db_connection.commit() + + cat_app_dict = create_function_dictionary() + + print("EXITING") + exit() + + # update active records if they exist or insert new row if did not exist + # update statement + update_prep_cursor = db_connection.cursor(prepared=True) + update_statement = ( + "update app_name_category_map " + "set is_active = True " + "where app_name = %s and " + "app_category = %s " + ) + # insert statement + insert_prep_cursor = db_connection.cursor(prepared=True) + existing_count = len(existing_records_list) + insert_statement = ( + "insert into app_name_category_map " + "(app_name, app_category, is_active) " + "values(%s, %s, True);" + ) + insert_count = 0 + update_count = 0 + for category_name in cat_app_dict: + for app_name in cat_app_dict[category_name]: + input = (app_name, category_name) + if app_name + "::" + category_name in existing_records_list: + # do update + update_prep_cursor.execute(update_statement, input) + update_count += 1 + else: + # do insert + insert_prep_cursor.execute(insert_statement, input) + insert_count += 1 + + db_connection.commit() + print("Existing_count : " + str(existing_count)) + print("Insert_count : " + str(insert_count)) + print("Update_count : " + str(update_count)) + + + +print("############################################") +print("App Category Mapping Upload (UTC): " + str(datetime.datetime.utcnow())) +start_time = time.time() +update_app_category_mappings() +print("--- app_cat_mapping time : %s seconds ---" % (time.time() - start_time)) diff --git a/source/daily_cron_jobs/methods_upload_v2_app_category_mappings.py b/source/daily_cron_jobs/methods_upload_v2_app_category_mappings.py new file mode 100644 index 0000000..4929543 --- /dev/null +++ b/source/daily_cron_jobs/methods_upload_v2_app_category_mappings.py @@ -0,0 +1,237 @@ +import os +import requests +import pandas as pd +import mysql.connector as mysql +import time +import datetime +from biokbase.catalog.Client import Catalog +from biokbase.narrative_method_store.client import NarrativeMethodStore + +metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"] +sql_host = os.environ["SQL_HOST"] +query_on = os.environ["QUERY_ON"] + +specific_string = "empty" + +# Configure App Data: Function + +def create_function_dictionary(): + # Create App Dictionary: Main function + requests.packages.urllib3.disable_warnings() + catalog = Catalog(url=os.environ["CATALOG_URL"]) + nms = NarrativeMethodStore(url=os.environ["NARRATIVE_METHOD_STORE"]) + + apps = nms.list_methods({"tag": "release"}) +# apps = nms.list_methods({"tag": "beta"}) +# apps = nms.list_methods({"tag": "dev"}) +# apps = nms.list_methods({}) + + global specific_string + + print("APPS : "+ str(apps)) + print("============================") + + category_app_dict = dict() + #key category,=> dict("active"=>[list of apps], "inactive"=>[list_of_apps], "unknown" => [list of apps]) + + apps_with_both_list = list() + apps_with_none_list = list() + apps_with_no_cats_list = list() + + for temp_app_dict in apps: +# if temp_app_dict["id"] == "kb_uploadmethods/batch_import_assembly_from_staging": +#PRESENT +# if temp_app_dict["id"] == "kb_uploadmethods/batch_import_assemblies_from_staging": +#NOT PRESENT +# temp_specific_string = str(temp_app_dict) +# specific_string = temp_specific_string + "\n" + + if temp_app_dict["id"] == "view_expression_gene_table_heatmap": 
+ print("DETAIL : " + str(temp_app_dict)) + + + app_id = temp_app_dict["id"] + app_cat_list = temp_app_dict["categories"] + + if app_id == "BBTools/RQCFilter": + print("BBTools/RQCFilter app categories : " + str(app_cat_list)) + + if app_id == "view_expression_heatmap": + print("view_expression_heatmap : " + str(app_cat_list)) + + active_type = None + active_flag_has_both = 0 + active_inactive_count = 0 + if "active" in app_cat_list: + active_inactive_count += 1 + if "inactive" in app_cat_list: + active_inactive_count += 1 + if "active" in app_cat_list and "inactive" in app_cat_list: + active_flag_has_both = 1 + print("UH OH!!!!!!!! : " + str(app_id) + " is both active and inactive") + apps_with_both_list.append(app_id) + active_type = "both" +# exit(0) +# else: + elif "active" in app_cat_list: + #CURRENTLY SET IF APP HAS BOTH IS SEEN AS ACTIVE + active_type = "active" + elif "inactive" in app_cat_list: + active_type = "inactive" + if active_type == None: + print("UH OH!!!!!!!! : " + str(app_id) + " is not active or inactive") + apps_with_none_list.append(app_id) + active_type = "none" +# exit(0) + if (len(app_cat_list) - active_inactive_count) <= 0: + apps_with_no_cats_list.append(app_id) + for category_name in app_cat_list: + if category_name == "active" or category_name == "inactive": + continue + if category_name not in category_app_dict: + category_app_dict[category_name] = dict() + if active_type not in category_app_dict[category_name]: + category_app_dict[category_name][active_type] = list() + category_app_dict[category_name][active_type].append(app_id) + + # Deal with apps that have empty category list + if len(apps_with_no_cats_list) > 0: + category_app_dict["Empty Category"] = dict() + category_app_dict["Empty Category"]["no_category"] = apps_with_no_cats_list + + print("FINAL category_app_dict : " + str(category_app_dict)) + total_count = 0 + category_count = 0 +# for temp_cat in app_dict: + for temp_cat in sorted(category_app_dict): + for active_type in category_app_dict[temp_cat]: + temp_count = len(category_app_dict[temp_cat][active_type]) + total_count += temp_count + category_count += 1 + print("Total count : " + str(total_count)) + print("category count : " + str(category_count)) +# print("specific_string : " + str(specific_string)) + print("apps_with_none_list : " + str(apps_with_none_list)) + print("apps_with_none count : " + str(len(apps_with_none_list))) + print("apps_with_both_list : " + str(apps_with_both_list)) + print("apps_with_both count : " + str(len(apps_with_both_list))) + print("apps_with_no_cats_list : " + str(apps_with_no_cats_list)) + print("apps_with_no_cats_list count : " + str(len(apps_with_no_cats_list))) + return category_app_dict + + +def update_app_category_mappings(): + # get app catagory mappings + cat_app_dict = create_function_dictionary() + +# print("EXITING") +# exit() + + # connect to mysql + db_connection = mysql.connect( + host=sql_host, user="metrics", passwd=metrics_mysql_password, database="metrics" + ) + cursor = db_connection.cursor() + query = "use " + query_on + cursor.execute(query) + + # get existing mappings + existing_records_list = list() + existing_name_cat_dict = dict() +# query = "select concat(app_name, '::', app_category, '::', is_active) from app_name_category_map_v2;" + query = "select app_name, app_category, is_active from app_name_category_map_v2;" + cursor.execute(query) + for row in cursor: + full_key = row[0] + "::" + row[1] + "::" + str(row[2]) + name_cat_key = row[0] + "::" + row[1] + 
existing_records_list.append(full_key) + existing_name_cat_dict[name_cat_key] = row[2] + existing_count = len(existing_records_list) + + # update all existing records to be inactive +# update_query = "update app_name_category_map_v2 set is_active = False" +# cursor.execute(update_query) +# db_connection.commit() + + # update active records if they exist or insert new row if did not exist + # update statement +# update_prep_cursor = db_connection.cursor(prepared=True) +# update_statement = ( +# "update app_name_category_map_v2 " +# "set is_active = %s " +# "where app_name = %s and " +# "app_category = %s " +# ) + + + # insert statement + insert_prep_cursor = db_connection.cursor(prepared=True) + + insert_statement = ( + "insert into app_name_category_map_v2 " + "(app_name, app_category, is_active) " + "values(%s, %s, %s);" + ) + + # update statement + update_prep_cursor = db_connection.cursor(prepared=True) + + update_statement = ( + "update app_name_category_map_v2 " + "set is_active = %s where app_name = %s and app_category = %s;" + ) + + # cleanup/delete statement + cleanup_prep_cursor = db_connection.cursor(prepared=True) + cleanup_statement = ( + "delete from app_name_category_map_v2 " + "where app_name = %s and app_category = %s and is_active = %s;" + ) + + insert_count = 0 + update_count = 0 + activity_dict = {'active': 1, 'inactive': 0, 'both': 2, "none":-1, "no_category":-2} + for category_name in cat_app_dict: + for active_type in cat_app_dict[category_name]: + for app_name in cat_app_dict[category_name][active_type]: + temp_key = app_name + "::" + category_name + "::" + str(activity_dict[active_type]) + temp_name_cat_key = app_name + "::" + category_name + if temp_name_cat_key in existing_name_cat_dict: + if activity_dict[active_type] != existing_name_cat_dict[temp_name_cat_key]: + # record needs to be updated + input = (activity_dict[active_type], app_name, category_name,) + update_prep_cursor.execute(update_statement, input) + update_count += 1 + if temp_key in existing_records_list: + existing_records_list.remove(temp_key) + elif temp_key in existing_records_list: + existing_records_list.remove(temp_key) + #REMOVE FOM EXISTING TO FIND LEFT OVERS + else: + # do insert +# print("INPUT : " + str(input)) + input = (app_name, category_name, activity_dict[active_type]) + insert_prep_cursor.execute(insert_statement, input) + insert_count += 1 + + #Clean up that no longer exist + cleanup_count = 0 + for temp_key in existing_records_list: + cleanup_count += 1 + temp_app_name, temp_cat_name, temp_is_active = temp_key.split('::') + input = (temp_app_name, temp_cat_name, int(temp_is_active)) + cleanup_prep_cursor.execute(cleanup_statement, input) + + db_connection.commit() + print("Existing_count : " + str(existing_count)) + print("Insert_count : " + str(insert_count)) + print("Update_count : " + str(update_count)) + print("Cleanup_count : " + str(cleanup_count)) + + + +print("############################################") +print("App Category Mapping Upload (UTC): " + str(datetime.datetime.utcnow())) +start_time = time.time() +update_app_category_mappings() +print("--- app_cat_mapping time : %s seconds ---" % (time.time() - start_time)) From f616b53c69a49b77247fc1416cfe3b735aa594ce Mon Sep 17 00:00:00 2001 From: Jason Baumohl Date: Tue, 8 Oct 2024 04:02:02 +0000 Subject: [PATCH 07/11] mostly adding blobstore_details to reports and views --- .../backfill_blobstore_details.py | 242 --------- .../dump_weekly_ADAM_app_categories.py | 5 +- .../daily_cron_jobs/make_reporting_tables.py | 25 +- 
.../get_downloaders_lookup_ongoing.py | 487 ++++++++++++++++++ .../sql_reporting_views_and_tables.sql | 25 + 5 files changed, 539 insertions(+), 245 deletions(-) delete mode 100644 source/custom_scripts/backfill_blobstore_details.py create mode 100644 source/monthly_cron_jobs/get_downloaders_lookup_ongoing.py diff --git a/source/custom_scripts/backfill_blobstore_details.py b/source/custom_scripts/backfill_blobstore_details.py deleted file mode 100644 index 8111e82..0000000 --- a/source/custom_scripts/backfill_blobstore_details.py +++ /dev/null @@ -1,242 +0,0 @@ -from pymongo import MongoClient -from pymongo import ReadPreference -from biokbase.workspace.client import Workspace -#from installed_clients.AbstractHandleClient import AbstractHandle as HandleService -from biokbase.service.Client import Client as ServiceClient -import json as _json -import os -import mysql.connector as mysql -import requests -import time -#from splitting import split_sequence -from datetime import date -from datetime import datetime - -print("############################################") -print("############################################") -print("############################################") -print("START TIME (UTC): " + str(datetime.utcnow())) -start_time = time.time() - -requests.packages.urllib3.disable_warnings() - -mongoDB_metrics_connection = os.environ["MONGO_PATH"] - -ws_url = os.environ["WS_URL"] -ws_user_token = os.environ["METRICS_WS_USER_TOKEN"] -to_workspace = os.environ["WRK_SUFFIX"] - -to_blobstore = os.environ["BLOBSTORE_SUFFIX"] -to_handle_db = os.environ["HANDLE_DB_SUFFIX"] - - -client = MongoClient(mongoDB_metrics_connection + to_workspace) -db = client.workspace -handle_service_url = "https://kbase.us/services/handle_service" - -#wsadmin = Workspace(ws_url, token=ws_user_token) -#hs = HandleService(handle_service_url, token=ws_user_token) - -def make_blobstore_lookup (): - client_blobstore = MongoClient(mongoDB_metrics_connection + to_blobstore) - db_blobstore = client_blobstore.blobstore - - blobstore_nodes_size_lookup = dict() - - nodes_query = db_blobstore.nodes.find({},{"_id": 0, "id": 1, "size": 1}) - for record in nodes_query: - blobstore_node_id = record["id"] - size = record["size"] - blobstore_nodes_size_lookup[blobstore_node_id] = size - return blobstore_nodes_size_lookup - -def make_handle_id_lookup (): - client_handle_db = MongoClient(mongoDB_metrics_connection + to_handle_db) - db_handle = client_handle_db.handle_db - - handle_id_lookup = dict() - - handles_query = db_handle.handle.find({},{"_id": 0, "id": 1, "hid": 1}) - for record in handles_query: - blobstore_node_id = record["id"] - handle = record["hid"] - handle_id_lookup[handle] = blobstore_node_id - return handle_id_lookup - - - -# object_id -> {handle=>handle, node=node, type=object_type, savedate=> sd} -objects_with_problem_nodes_with_no_size = dict() -objects_with_problem_handles_with_no_nodes = dict() - -running_size_total = 0 - -deleted_object_with_data_found_count = 0 -deleted_object_without_data_found_count = 0 - -#exit() - - -# blobstore_id => {ws_obj_id => (save_date, saver)} -blobstore_object_results = dict() - -# blobstore_id =>{first_saver_ws_obj_id => blah, -# first_save_date = date} -#blobstore_id_first_saver = dict() - -#ws_ids = [146324] # small -#ws_ids = [28129] # fungal phytosome s -#ws_ids = [146324,28129] # fungal phytosome and small ws, took 203 mins -#ws_ids = [19217] # refseq reference - - - -#for ws_id in ws_ids: -deleted_objects = set() -ws_obj_deleted_cursor = 
db.workspaceObjects.find({"del":True},{"_id":0, "ws": 1,"id":1}) -for ws_obj_deleted in ws_obj_deleted_cursor: - deleted_temp_ws_id = ws_obj_deleted["ws"] - deleted_obj_id = ws_obj_deleted["id"] - deleted_ws_obj_id = str(deleted_temp_ws_id) + "/" + str(deleted_obj_id) - deleted_objects.add(deleted_ws_obj_id) - -print("TOTAL DELETED OBJECT LENGTH: " + str(len(deleted_objects))) -print("--- total time for the deleted objects lookup %s seconds ---" % (time.time() - start_time)) - -ws_obj_vers_cursor = db.workspaceObjVersions.find( - {#"ws":312, - "extids.handle" : { "$exists": True }}, - { - "type": 1, - "ws": 1, - "id": 1, - "ver": 1, - "savedate": 1, - "savedby": 1, - "extids": 1, - "_id": 0, - }, - no_cursor_timeout=True - ) -i = 0 -ws_obj_info = dict() -deleted_ext_ids_counter = 0 - -for ws_obj_ver in ws_obj_vers_cursor: - is_deleted = 0 - object_type_full = ws_obj_ver["type"] - (object_type, object_spec_version) = object_type_full.split("-") - #if (object_type != "KBaseNarrative.Narrative" and object_type != "KBaseReport.Report"): - ws_id = ws_obj_ver["ws"] - obj_id = ws_obj_ver["id"] - temp_ws_obj_id = str(ws_id) + "/" + str(obj_id) - if temp_ws_obj_id in deleted_objects: - deleted_ext_ids_counter += 1 - is_deleted = 1 -# continue - obj_ver = ws_obj_ver["ver"] - obj_save_date = ws_obj_ver["savedate"] - savedby = ws_obj_ver["savedby"] - extids = ws_obj_ver["extids"] - handles = extids["handle"] -# for handle in handles: -# handles_set.add(handle) -# obj_copied = 0 - full_obj_id = str(ws_id) + "/" + str(obj_id) + "/" + str(obj_ver) -# print("Full obj id : " + full_obj_id) -# print("Object Type : " + object_type_full) -# if (object_type != "KBaseNarrative.Narrative" and object_type != "KBaseReport.Report"): -# if (object_type == "KBaseNarrative.Narrative" or object_type == "KBaseReport.Report"): - - ws_obj_info[full_obj_id] = {"save_date" : obj_save_date, - "savedby" : savedby, - "obj_type" : object_type_full, - "handles" : handles, - "is_deleted" : is_deleted} - -print("--- total time for the ws_object_version objects query %s seconds ---" % (time.time() - start_time)) - -########################################################################## -print("BLOBSTORE LOOKUP:") -blobstore_lookup = make_blobstore_lookup() -test_counter = 0 -for temp_key in blobstore_lookup: - if test_counter < 10: - print("ID: " + str(temp_key) + " ::: size: " + str(blobstore_lookup[temp_key])) - else: - break - test_counter = test_counter + 1 -print("Total BLOBSTORE Lookuplength: " + str(len(blobstore_lookup))) - -print("--- total time for the blobstore size lookup creation %s seconds ---" % (time.time() - start_time)) - -handle_id_lookup = make_handle_id_lookup() -test_counter = 0 -for temp_key in handle_id_lookup: - if test_counter < 10: - print("ID: " + str(temp_key) + " ::: blobstore_id: " + str(handle_id_lookup[temp_key])) - else: - break - test_counter = test_counter + 1 -print("Total HANDLE ID lookup length: " + str(len(handle_id_lookup))) - -print("--- total time for the blobstore size lookup creation %s seconds ---" % (time.time() - start_time)) -############################################## - -for full_obj_id in ws_obj_info: -# print("ws_obj_info[full_obj_id][handles] : " + str(ws_obj_info[full_obj_id]["handles"])) - for handle in ws_obj_info[full_obj_id]["handles"]: - blobstore_id = None - (kbh_prefix, str_handle_id) = handle.split("_") - if int(str_handle_id) in handle_id_lookup: - blobstore_id = handle_id_lookup[int(str_handle_id)] - else: - objects_with_problem_handles_with_no_nodes[full_obj_id] 
= ws_obj_info[full_obj_id] - if ws_obj_info[full_obj_id]["is_deleted"] == 1: - deleted_object_without_data_found_count += 1 - - if blobstore_id and blobstore_id in blobstore_lookup: - if blobstore_id not in blobstore_object_results: - blobstore_object_results[blobstore_id] = dict() - blobstore_object_results[blobstore_id][full_obj_id] = (ws_obj_info[full_obj_id]["save_date"], - ws_obj_info[full_obj_id]["savedby"]) -# print("Blobstore lookup file_size : " + str(blobstore_lookup[blobstore_id])) - if ws_obj_info[full_obj_id]["is_deleted"] == 1: - deleted_object_with_data_found_count += 1 - file_size = blobstore_lookup[blobstore_id] - running_size_total = running_size_total + file_size - else: -# print("HUGE PROBLEM: obj_id : " + full_obj_id + " blobstore_id: " + str(blobstore_id) + " IS NOT IN THE LOOKUP") -# del blobstore_object_results[blobstore_id] - objects_with_problem_nodes_with_no_size[full_obj_id] = ws_obj_info[full_obj_id] - if ws_obj_info[full_obj_id]["is_deleted"] == 1: - deleted_object_without_data_found_count += 1 - -print("objects_with_problem_nodes_with_no_size : " + str(objects_with_problem_nodes_with_no_size)) -print("TOTAL objects_with_problem_nodes_with_no_size : " + str(len(objects_with_problem_nodes_with_no_size))) - -print("objects_with_problem_handles_with_no_nodes : " + str(objects_with_problem_handles_with_no_nodes)) -print("TOTAL objects_with_problem_handles_with_no_nodes : " + str(len(objects_with_problem_handles_with_no_nodes))) - -print("deleted_object_with_data_found_count :" + str(deleted_object_with_data_found_count)) -print("deleted_object_without_data_found_count :" + str(deleted_object_without_data_found_count)) - -print("blobstore_object_results length : " + str(len(blobstore_object_results))) -#print("blobstore_object_results : " + str(blobstore_object_results)) -print("RUNNING TOTAL SIZE : " + str(running_size_total)) - -obj_id_set = set() -for blobstore_id in blobstore_object_results : - for obj_id in blobstore_object_results[blobstore_id]: - obj_id_set.add(obj_id) -print("Total number of objects with handles that could be fully determined : " + str(len(obj_id_set))) - -print("Total ext_ids objects that were deleted : " + str(deleted_ext_ids_counter)) - -#print("blobstore_object_results : " + str(blobstore_object_results)) - -print("--- total seconds %s seconds ---" % (time.time() - start_time)) - - - -exit() diff --git a/source/custom_scripts/dump_weekly_ADAM_app_categories.py b/source/custom_scripts/dump_weekly_ADAM_app_categories.py index 3fc67ae..9ac842b 100644 --- a/source/custom_scripts/dump_weekly_ADAM_app_categories.py +++ b/source/custom_scripts/dump_weekly_ADAM_app_categories.py @@ -24,7 +24,10 @@ def dump_weekly_app_categories(): cursor.execute(query) # CHANGE QUERY HERE - query = ("select * from metrics_reporting.app_category_unique_users_weekly") +# Regular weekly app categories +# query = ("select * from metrics_reporting.app_category_unique_users_weekly") + + # ADAM's special cagtegory mappings from late 2023 early 2024 query = ("select in_query.week_run, in_query.master_category, count(*) as unique_users " "from (select distinct DATE_FORMAT(`finish_date`,'%Y-%u') as week_run, " "IFNULL(master_category,'None') as master_category, uau.username " diff --git a/source/daily_cron_jobs/make_reporting_tables.py b/source/daily_cron_jobs/make_reporting_tables.py index cf1a0ed..91ffc0d 100644 --- a/source/daily_cron_jobs/make_reporting_tables.py +++ b/source/daily_cron_jobs/make_reporting_tables.py @@ -167,7 +167,24 @@ def make_reporting_tables(): 
cursor.execute(app_category_run_hours_weekly_create_statement) print("app_category_run_hours_weekly created") + ############### + workspaces_current_create_statement = ( + "CREATE OR REPLACE table metrics.workspaces_current as " + "(select ws.* " + "from metrics.workspaces ws inner join " + "metrics.hv_workspaces_max_date wsmd " + "on ws.ws_id = wsmd.ws_id and " + "ws.record_date = wsmd.record_date) " + ) + cursor.execute(workspaces_current_create_statement) + print("workspaces_current created") + workspaces_current_index_create_statement = ( + "alter table metrics.workspaces_current add unique (ws_id)" + ) + cursor.execute(workspaces_current_index_create_statement) + print("workspaces_current_index created") + ################ narrative_app_flows_create_statement = ( "create or replace table metrics_reporting.narrative_app_flows as " @@ -175,7 +192,7 @@ def make_reporting_tables(): "from metrics.user_info ui " "inner join metrics.user_app_usage uau " "on ui.username = uau.username " - "inner join metrics_reporting.workspaces_current wc " + "inner join metrics.workspaces_current wc " "on wc.ws_id = uau.ws_id " "where ui.kb_internal_user = 0 " "and uau.is_error = 0 " @@ -205,6 +222,10 @@ def make_reporting_tables(): cursor.execute(blobstore_detail_by_ws_create_statement) print("blobstore_detail_by_ws_create_statement created") + blobstore_detail_by_ws_index_statement = "alter table blobstore_detail_by_ws add index (ws_id)" + cursor.execute(blobstore_detail_by_ws_index_statement) + print("blobstore_detail_by_ws_index_statement created") + blobstore_detail_by_user_monthly_create_statement = ( "create or replace table blobstore_detail_by_user_monthly as " "(select saver_username, DATE_FORMAT(`save_date`,'%Y-%m') as month, " @@ -328,7 +349,7 @@ def make_reporting_tables(): "sum(static_narratives_count) as static_narratives_created_count, " "sum(visible_app_cells_count) as total_visible_app_cells, " "sum(code_cells_count) as total_code_cells_count " - "from metrics_reporting.workspaces_current wc " + "from metrics.workspaces_current wc " "inner join metrics.user_info ui " "on wc.username = ui.username " "where narrative_version > 0 " diff --git a/source/monthly_cron_jobs/get_downloaders_lookup_ongoing.py b/source/monthly_cron_jobs/get_downloaders_lookup_ongoing.py new file mode 100644 index 0000000..fb964be --- /dev/null +++ b/source/monthly_cron_jobs/get_downloaders_lookup_ongoing.py @@ -0,0 +1,487 @@ +# GetAppStats +# +import requests +import os +import time +from pymongo import MongoClient +from pymongo import ReadPreference + +from datetime import date, timedelta, datetime +import mysql.connector as mysql +from biokbase.narrative_method_store.client import NarrativeMethodStore +#from source.daily_cron_jobs.installed_clients.execution_engine2Client import execution_engine2 +from installed_clients.execution_engine2Client import execution_engine2 + +################################################ +# +# This code is to pull the needed downloader app runs that may have been downloaders for DOI objects +# +################################################ + +requests.packages.urllib3.disable_warnings() + +to_workspace = os.environ["WRK_SUFFIX"] + +ee2_url = os.environ["EE2_URL"] +# GetEE2AppStats +ee2 = execution_engine2( + url=ee2_url, + token=os.environ["METRICS_USER_TOKEN"], +) + +nms = NarrativeMethodStore(url=os.environ["NARRATIVE_METHOD_STORE"]) +sql_host = os.environ["SQL_HOST"] +query_on = os.environ["QUERY_ON"] + +mongoDB_metrics_connection = os.environ["MONGO_PATH"] 
+metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"] + +db_connection = mysql.connect( + host=sql_host, # "mysql1", #"localhost", + user="metrics", # "root", + passwd=metrics_mysql_password, + database="metrics", # "datacamp" +) + +cursor = db_connection.cursor() + +models_bulk_dl_list = list() + +def get_minimum_date_for_new_doi_workspaces(cursor): + #First Determine the default being the start of the previous month + #get first day of the month: + query = ( + "select min(initial_save_date) from metrics_reporting.workspaces_current where ws_id in ( " + " select ws_id from metrics.copy_doi_ws_map " + " where ws_id not in (select unique ws_id from metrics.copy_doi_metrics)) " + ) + cursor.execute(query) + min_new_doi_ws_date = None + for row_values in cursor: + min_new_doi_ws_date = row_values[0] + print("MIN NEW DOI WS DATE:" + str(min_new_doi_ws_date)) + return min_new_doi_ws_date + +def get_existing_problem_refs(cursor): + # builds data structure for problematic references previously resolved + query = ( + "select job_id, original_ref_id, resolved_ref_id " + "from downloaders_problematic_obj_ids" + ) + cursor.execute(query) + problem_refs_lookup = dict() + for row_values in cursor: + job_id = row_values[0] + original_ref_id = row_values[1] + resolved_ref_id = row_values[2] + if job_id not in problem_refs_lookup: + problem_refs_lookup[job_id] = dict() + problem_refs_lookup[job_id][original_ref_id] = resolved_ref_id + return problem_refs_lookup + +def get_minimum_date_for_doi_workspaces(cursor): + # gets earliest initial save out of all the doi workspaces + query = ( + "select min(initial_save_date) from metrics_reporting.workspaces_current where ws_id in ( " + " select ws_id from metrics.copy_doi_ws_map) ") + cursor.execute(query) + min_new_doi_ws_date = None + for row_values in cursor: + min_doi_ws_date = row_values[0] + print("MIN DOI WS DATE:" + str(min_new_doi_ws_date)) + return min_doi_ws_date + + +def get_downloaders_set(cursor): + #returns a set of downloadwer apps +# query = "select downloader_app_name, 1 from metrics.downloader_apps"; + query = ( + "select downloader_app_name as app_name from downloader_apps da " + "union select distinct uau.func_name from user_app_usage uau " + "where (uau.func_name like '%export%' or uau.func_name like '%download%' or " + "uau.app_name like '%export%' or uau.app_name like '%download%' or " + "uau.func_name like 'kb_ObjectInfo%' or uau.app_name like 'kb_ObjectInfo%') ") +# query = ( +# "select downloader_app_name as app_name from downloader_apps da " +# "union select uau.func_name from user_app_usage uau " +# "where (uau.func_name like '%export%' or uau.func_name like '%download%' " +# "or uau.app_name like '%export%' or uau.app_name like '%download%') ") + + cursor.execute(query) + downloaders_set = set() + for row_values in cursor: + downloaders_set.add(row_values[0]) + print(str(downloaders_set)) + print("Number of downloaders : " + str(len(downloaders_set))) + return downloaders_set + +def pull_downloading_jobs(downloaders_set, problem_refs_lookup): + + client = MongoClient(mongoDB_metrics_connection + to_workspace) + db = client.workspace + + prep_cursor = db_connection.cursor(prepared=True) + downloaders_problematic_obj_ids_insert_statement = ( + "insert into downloaders_problematic_obj_ids " + "(original_ref_id, resolved_ref_id, job_id) " + "values(%s,%s, %s);") + insert_prob_refs_count = 0 + + statuses = ["queued", "terminated", "running", "created", "estimated","error"] + finished_job_count = 0 + downloaders_count = 0 + 
downloading_jobs_with_orphaned_refs_count = 0 + downloading_triples_not_digits_count = 0 + downloaders_with_ws_id_count = 0 + in_if_count = 0 + + downloaders_dict = dict() + for downloader in downloaders_set: + downloaders_dict[downloader] = dict() + downloaders_dict[downloader]["has_input_ref_count"] = 0 + downloaders_dict[downloader]["no_input_ref_count"] = 0 + + downloader_results = dict() + # the data structure looks like downloaded_ws_obj_id => { downloader_username => [job_id]} + has_2_elements_count = 0 + + + earliest_year = 2023 +# earliest_year = 2016 + today = date.today() + current_year = int(today.year) + part_of_year_list = (1,2,3,4,5,6,7,8,9,10,11,12) + + years_to_do = range(earliest_year,(current_year + 1)) + + print("Current year : " + str(current_year)) + print("Years to do: " + str(years_to_do)) + + fba_tools_bulk_export_objects_jobs = list() + DataFileUtil_download_web_file_jobs = list() + + for year_to_do in years_to_do: + # NEED TO CHUNK UP THE RESULTS BY QUARTER, OTHERWISE EE@ TIMESOUT. + for part_of_year in part_of_year_list: + if part_of_year == 1: + begin = int(datetime(year_to_do, 1, 1, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 1, 31, 23, 59).timestamp()) * 1000 + elif part_of_year == 2: + begin = int(datetime(year_to_do, 2, 1, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 3, 1, 23, 59).timestamp()) * 1000 + elif part_of_year == 3: + begin = int(datetime(year_to_do, 3, 2, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 3, 31, 23, 59).timestamp()) * 1000 + elif part_of_year == 4: + begin = int(datetime(year_to_do, 4, 1, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 4, 30, 23, 59).timestamp()) * 1000 + elif part_of_year == 5: + begin = int(datetime(year_to_do, 5, 1, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 5, 31, 23, 59).timestamp()) * 1000 + elif part_of_year == 6: + begin = int(datetime(year_to_do, 6, 1, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 6, 30, 23, 59).timestamp()) * 1000 + elif part_of_year == 7: + begin = int(datetime(year_to_do, 7, 1, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 7, 31, 23, 59).timestamp()) * 1000 + elif part_of_year == 8: + begin = int(datetime(year_to_do, 8, 1, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 8, 30, 23, 59).timestamp()) * 1000 + elif part_of_year == 9: + begin = int(datetime(year_to_do, 9, 1, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 9, 30, 23, 59).timestamp()) * 1000 + elif part_of_year == 10: + begin = int(datetime(year_to_do, 10, 1, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 10, 31, 23, 59).timestamp()) * 1000 + elif part_of_year == 11: + begin = int(datetime(year_to_do, 11, 1, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 11, 30, 23, 59).timestamp()) * 1000 + elif part_of_year == 12: + begin = int(datetime(year_to_do, 12, 1, 0, 0).timestamp()) * 1000 + end = int(datetime(year_to_do, 12, 31, 23, 59).timestamp()) * 1000 + + yearly_start_time = time.time() + print("Year_Month to do start: " + str(year_to_do) + "_" + str(part_of_year) + " :: " + str(yearly_start_time)) + + params = {"start_time": begin, "end_time": end, "ascending": 0, "limit": 1000000000} + stats = ee2.check_jobs_date_range_for_all(params=params) + + yearly_finished_count = 0 + yearly_downloader_count = 0 + + example_counter = 0 + download_job_without_input_ref_count = 0 + + kbObjectInfo_dict = dict() + + fba_tools_bulk_export_objects_job_count = 0 + + for job in stats["jobs"]: + if 
job["status"] in statuses or "finished" not in job: + continue + else: + # only want non errored finished jobs + if "job_input" in job and "job_id" in job and "user" in job: + in_if_count += 1 + method = job["job_input"]["method"] + app_id = job["job_input"]["app_id"] + method = method.replace(".", "/") + if method in downloaders_set or app_id in downloaders_set: + if method == "DataFileUtil/download_web_file": + DataFileUtil_download_web_file_jobs.append(job) + if "bulk_export_objects" in method: +# if method == "fba_tools/bulk_export_objects": + fba_tools_bulk_export_objects_jobs.append(job) + fba_tools_bulk_export_objects_job_count += 1 + downloaders_count += 1 + yearly_downloader_count += 1 + ws_obj_id = None + ws_obj_ids_list = list() + job_id = job["job_id"] + needs_to_be_added_to_the_db = 1 + if "fba_tools/bulk_export_objects" in method: + # need to get references differently as it is a bulk job + # need to loop over result list and then download_refs list in 'job_output': {'result': [{'downloaded_refs':[] + print("IN fba_tools.bulk_export_objects") + if "job_output" in job: + if job_id == "64d17b0f97c8caf1da9316ed ": + print("job id : " + str(job_id) + " Job output: " + str(job("job_output"))) + if "result" in job["job_output"]: + for job_result in job["job_output"]["result"]: + if 'downloaded_refs' in job_result: + ws_obj_ids_list.extend(job_result['downloaded_refs']) + models_bulk_dl_list.extend(job_result['downloaded_refs']) +#NEED TO CHANGE ALL THE WS_OBJ_LIST STUFF OVER TO LIST (see if empty etc). + if "kb_ObjectInfo" in method: + # need to find input ref differently + found_kb_info_ref = 0 + for param_key in job["job_input"]["params"][0]: + if "input_ref" in param_key: + ws_obj_ids_list.append(job["job_input"]["params"][0][param_key]) + kbObjectInfo_dict[job_id] = ws_obj_ids_list + found_kb_info_ref = 1 + #print("IN kbObjectInfo_dict checking ws_obj_id: " + str(ws_obj_ids_list)) + if found_kb_info_ref == 0: + print("######################") + print("UNABLE TO FIND kbinfo job_id : " + str(job_id)) + print("######################") + elif len(job["job_input"]['params']) > 0: + for param in job["job_input"]['params']: + if "input_ref" in param: + ws_obj_ids_list.append(param["input_ref"]) + + if len(ws_obj_ids_list) > 0: + downloaders_dict[method]["has_input_ref_count"] += 1 + #job_id = job["job_id"] + username = job["user"] + + if len(ws_obj_ids_list) > 1: + print("ws_obj_ids_list : " + str(ws_obj_ids_list)) + + for ws_obj_id in ws_obj_ids_list: + + + used_ws_obj_id = None + #print("ws_obj_id : " + ws_obj_id) + elements = ws_obj_id.split("/") + if len(elements) == 3: + if elements[0].isdigit() and elements[1].isdigit() and elements[2].isdigit(): + used_ws_obj_id = ws_obj_id + needs_to_be_added_to_the_db = 0 + else: + # had no cases of this as of this point will treat as orphaned? + # need to check at end to see if this code needs to be added. 
+ print("Unexpected triplet ref format not with digits: " + ws_obj_id) + downloading_jobs_with_orphaned_refs_count += 1 + downloading_triples_not_digits_count += 1 + elif job_id in problem_refs_lookup and ws_obj_id in problem_refs_lookup[job_id]: + if problem_refs_lookup[job_id][ws_obj_id] is None: + # Do nothing can not resolve the correct id + continue + else: + used_ws_obj_id = problem_refs_lookup[job_id][ws_obj_id] + needs_to_be_added_to_the_db = 0 + else: + # THE incomplete Reference needs to be tried to be resolved and then inserted into the DB + if len(elements) == 2: + has_2_elements_count += 1 + #print("in elements == 2") + ws_id = None + obj_id = None + if elements[0].isdigit(): + # the ws_id is a number. it is good to go + ws_id = int(elements[0]) + else: + # means the ws is identified by name and not by id + # Need to search the worksaceObjects table to get the id. + # Note there is no mechanism for users to change this value + # There are no dupicate named workspaces other than null (which has 2) + workspaces_cursor = db.workspaces.find({"name":elements[0]},{"ws":1}); + for record in workspaces_cursor: + ws_id = int(record["ws"]) + #print("ws_id resolved: " + str(ws_id)) + if elements[1].isdigit(): + obj_id = int(elements[1]) + else: + #print("IN resolve object name") + # means the obj portion of the reference is identified by a name + # NOTE THIS NAME CAN BE CHANGED BY THE USER + # IF THE USER CHANGED THE NAME SINCE THE TIME OF THE DOWNLOAD + # THEN THAT REFERENCE IS ORPHANED + # Need to query the workspaceOBjects mongo collection + # using the name and ws_id to determine the object id + workspaceObjects_cursor = db.workspaceObjects.find({"name":elements[1],"ws":ws_id},{"id":1}); + #print("elements[1] = " + elements[1]) + #print("ws id : " + str(ws_id)) + #print("workspaceObjects_cursor" + str(workspaceObjects_cursor)) + for record in workspaceObjects_cursor: + #print("Found wsObjects record : " + str(record) ) + obj_id = int(record["id"]) + #print("ws_obj_id : " + ws_obj_id + " resolved to : " + str(obj_id)) + + if obj_id is not None and ws_id is not None: + # Need to do time machine to determine which object version was active + # at the time of the Downloading job start time + #print("Found input ref: " + ws_obj_id) + job_start_epoch = job["running"] / 1000 + #print("job_start_epoch : " + str(job_start_epoch)) + max_save_date_epoch = 0 + max_save_date_version = 0 + workspaceObjVersions_cursor = db.workspaceObjVersions.find({"ws": int(ws_id), "id": int(obj_id)}, + {"ws": 1, "id": 1, "ver": 1, "savedate": 1, "_id": 0}) + for record in workspaceObjVersions_cursor: + iso_savedate = record["savedate"] + iso_savedate_string = str(iso_savedate) + iso_savedate_string_elements = iso_savedate_string.split(".") + if len(iso_savedate_string_elements) == 1: + iso_savedate_string = iso_savedate_string + ".000000" + utc_dt = datetime.strptime(iso_savedate_string,'%Y-%m-%d %H:%M:%S.%f') + #'%Y-%m-%dT%H:%M:%S.%fZ') + savedate_epoch = (utc_dt - datetime(1970, 1, 1)).total_seconds() + #print("savedate_epoch : " + str(savedate_epoch)) + if (job_start_epoch > savedate_epoch and savedate_epoch > max_save_date_epoch): + max_save_date_epoch = savedate_epoch + max_save_date_version = record["ver"] + + + + + #if (max_save_date_version > 1): + # print("FINAL VERSION saved : " + str(max_save_date_version)) + used_ws_obj_id = str(ws_id) + "/" + str(obj_id) + "/" + str(max_save_date_version) + # print("used_ws_obj_id : " + used_ws_obj_id) + else: + # One of the ws_id or obj_id is None most likely means 
orphaned reference due to + #object name change + used_ws_obj_id = None + downloading_jobs_with_orphaned_refs_count += 1 + else: + print("WS OBJ ID was a different format then expected") + used_ws_obj_id = None + downloading_jobs_with_orphaned_refs_count += 1 + # END OF TRYING TO DETERMINE FULL WS_OBJ_ID + + # ENTER RECORD INTO DOWNLOADER_RESULTS + if used_ws_obj_id not in downloader_results: + downloader_results[used_ws_obj_id] = dict() + if username not in downloader_results[used_ws_obj_id]: + downloader_results[used_ws_obj_id][username] = list() + downloader_results[used_ws_obj_id][username].append(job_id) + + if needs_to_be_added_to_the_db == 1 : + #need to do insert + input = (ws_obj_id, used_ws_obj_id, job_id) + prep_cursor.execute(downloaders_problematic_obj_ids_insert_statement, input) + insert_prob_refs_count += 1 + + #downloader_results[ws_obj_id][job_id] = username + #downloader_results[used_ws_obj_id] +=1 + #downloader_results[ws_obj_id].add(username) + downloaders_with_ws_id_count += 1 + #if example_counter < 10: + # print("Example input_ws_obj_id : " + ws_obj_id + " resolved to used_ws_obj_id : " + used_ws_obj_id) + # example_counter += 1 + #else: + # print("EARLY EXIT: DOWNLOADER RESULTS : " + str(downloader_results)) + # exit() + else: + download_job_without_input_ref_count += 1 + downloaders_dict[method]["no_input_ref_count"] += 1 + finished_job_count += 1 + yearly_finished_count += 1 + print("Yearly downloader_count : " + str(yearly_downloader_count)) + print("Yearly finished_count : " + str(yearly_finished_count)) + print("Yearly download_job_without_input_ref_count : " + str(download_job_without_input_ref_count)) + print("Year to do end: " + str(year_to_do) + "_" + str(part_of_year) + " :: " + str(time.time() - yearly_start_time) + " seconds") + print("kbObjectInfo_dict : " + str(kbObjectInfo_dict)) + print("kbObjectInfo_dict len : " + str(len(kbObjectInfo_dict))) + + + print(str(downloaders_dict)) + +# i = 0 +# while i < 3: +# print("DataFileUtil_download_web_file_jobs number : " + str(i)) +# print(DataFileUtil_download_web_file_jobs[i]) +# i += 1 + +# i = 0 +# while i < 3: +# print("fba_tools_bulk_export_objects_jobs number : " + str(i)) +# print(fba_tools_bulk_export_objects_jobs[i]) +# i += 1 + +# i = -10 +# while i < 0: +# print("fba_tools_bulk_export_objects_jobs number : " + str(i)) +# print(fba_tools_bulk_export_objects_jobs[i]) +# i += 1 + + + print("TOTAL length of fba_tools_bulk_export_objects_jobs : " + str(len(fba_tools_bulk_export_objects_jobs))) + print("counter : " + str(fba_tools_bulk_export_objects_job_count)) +# print("DOWNLOADER RESULTS:") +# print(str(downloader_results)) +# loop_count = 0 + db_connection.commit() + print("Finished job count : " + str(finished_job_count)) + print("In If count : " + str(in_if_count)) + print("Downloaders job count : " + str(downloaders_count)) + + print("Downloaders with ws_id count : " + str(downloaders_with_ws_id_count)) + print("Has 2 elements count : has_2_elements_count : " + str(has_2_elements_count)) + print("FINAL DOWNLADER METHODS WITH AND WITHOUT INPUT REFS : ") + + print("insert_prob_refs_count : " + str(insert_prob_refs_count)) + + print("Downloaders_results length : " + str(len(downloader_results))) + return downloader_results + +def get_downloaders_lookup(): + + start_time = time.time() + main_function_start_time = time.time() + + downloaders_set = get_downloaders_set(cursor) + problem_refs_lookup = get_existing_problem_refs(cursor) + downloader_results = pull_downloading_jobs(downloaders_set, 
problem_refs_lookup) + print("--- Total TIME for building downloading lookups %s seconds ---" % (time.time() - start_time)) + return downloader_results + + + +#downloader_results = get_downloaders_lookup() +#print("pre bulk dl print") +#print("Length models_bulk_dl_list : " + str(len(models_bulk_dl_list))) +#for bulk_dl_id in models_bulk_dl_list: +# print("Bulk downloader id : " + str(bulk_dl_id) + " --- bulk_dl downloader result : " + str(downloader_results[bulk_dl_id])) +#print("post bulk dl print") +#i = 0 +#for downloader_key in downloader_results: +# print("Downloader key : " + str(downloader_key) + " downloader_results : " + str(downloader_results[downloader_key])) +# if i > 10: +# break +# i = i + 1 +#print("Downloader_results : " + str(downloader_results['49114/8/1'])) + + diff --git a/sql_create_statements/sql_reporting_views_and_tables.sql b/sql_create_statements/sql_reporting_views_and_tables.sql index 294cf9c..b8ffa0c 100644 --- a/sql_create_statements/sql_reporting_views_and_tables.sql +++ b/sql_create_statements/sql_reporting_views_and_tables.sql @@ -785,6 +785,29 @@ select max(record_date) as record_date, ws_id from metrics.workspaces w group by ws_id; +#IN METRICS +CREATE OR REPLACE table metrics.workspaces_current as +(select ws.* +from metrics.workspaces ws inner join +metrics.hv_workspaces_max_date wsmd +on ws.ws_id = wsmd.ws_id and +ws.record_date = wsmd.record_date); + +alter table metrics.workspaces_current +add unique (ws_id); + +#IN METRICS +CREATE OR REPLACE table metrics.workspaces_current_plus_users as +(select wc.* , bdws.orig_saver_count, bdws.non_orig_saver_count, bdws.orig_saver_size_GB, bdws.non_orig_saver_size_GB +from metrics.user_info ui +inner join metrics.workspaces_current wc on ui.username = wc.username +left outer join blobstore_detail_by_ws bdws on wc.ws_id = bdws.ws_id +where ui.kb_internal_user = 0 +and wc.narrative_version > 0 +and is_deleted = 0 +and is_temporary = 0); + + #IN METRICS_REPORTING CREATE OR REPLACE VIEW metrics_reporting.workspaces_current as select ws.* @@ -1638,6 +1661,8 @@ group by ws_id ); Query OK, 108871 rows affected (6 min 52.38 sec) Records: 108871 Duplicates: 0 Warnings: 0 +alter table blobstore_detail_by_ws add index (ws_id); + create or replace view blobstore_detail_by_ws_monthly as (select ws_id, DATE_FORMAT(`save_date`,'%Y-%m') as month, sum(orig_saver) as orig_saver_count, 0 - sum((orig_saver - 1)) as non_orig_saver_count, From 6e5762e3f97044fb4c7edb5790b78920f18d1460 Mon Sep 17 00:00:00 2001 From: Jason Baumohl Date: Tue, 8 Oct 2024 04:07:20 +0000 Subject: [PATCH 08/11] Added blobstore columns to WS report --- source/custom_scripts/dump_narratives_results.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/source/custom_scripts/dump_narratives_results.py b/source/custom_scripts/dump_narratives_results.py index 1bc14f2..7af4ebd 100644 --- a/source/custom_scripts/dump_narratives_results.py +++ b/source/custom_scripts/dump_narratives_results.py @@ -30,12 +30,17 @@ def dump_narratives_results(): # CHANGE QUERY HERE # Query for Adam Narratives dump of information: - query = ("select wc.* from metrics.user_info ui inner join metrics_reporting.workspaces_current wc on ui.username = wc.username " - "where ui.kb_internal_user = 0 and wc.narrative_version > 0 and is_deleted = 0 and is_temporary = 0") + query = ("select * from metrics.workspaces_current_plus_users ") +# query = ("select wc.* from metrics.user_info ui inner join metrics_reporting.workspaces_current wc on ui.username = wc.username " 
+# "where ui.kb_internal_user = 0 and wc.narrative_version > 0 and is_deleted = 0 and is_temporary = 0") # Headers for Adam's narratives query (Note if more columns added, may need to update this print("ws_id\tusername\tmod_date\tinitial_save_date\trecord_date\ttop_lvl_object_count\ttotal_object_count\tvisible_app_cells_count\tcode_cells_count\t" "narrative_version\thidden_object_count\tdeleted_object_count\ttotal_size\ttop_lvl_size\tis_public\tis_temporary\tis_deleted\tnumber_of_shares\t" - "num_nar_obj_ids\tstatic_narratives_count\tstatic_narratives_views\tunique_object_types_count") + "num_nar_obj_ids\tstatic_narratives_count\tstatic_narratives_views\tunique_object_types_count\t" + "orig_saver_count\tnon_orig_saver_count\torig_saver_size_GB\tnon_orig_saver_size_GB") + ) + +# "num_nar_obj_ids\tstatic_narratives_count\tstatic_narratives_views\tunique_object_types_count") cursor.execute(query) row_values = list() From 657a8affb50b78f3502b531447e4f2390477fdcf Mon Sep 17 00:00:00 2001 From: Jason Baumohl Date: Tue, 8 Oct 2024 04:16:53 +0000 Subject: [PATCH 09/11] Added blobstore columns to WS report --- source/custom_scripts/dump_narratives_results.py | 1 - 1 file changed, 1 deletion(-) diff --git a/source/custom_scripts/dump_narratives_results.py b/source/custom_scripts/dump_narratives_results.py index 7af4ebd..1891bca 100644 --- a/source/custom_scripts/dump_narratives_results.py +++ b/source/custom_scripts/dump_narratives_results.py @@ -38,7 +38,6 @@ def dump_narratives_results(): "narrative_version\thidden_object_count\tdeleted_object_count\ttotal_size\ttop_lvl_size\tis_public\tis_temporary\tis_deleted\tnumber_of_shares\t" "num_nar_obj_ids\tstatic_narratives_count\tstatic_narratives_views\tunique_object_types_count\t" "orig_saver_count\tnon_orig_saver_count\torig_saver_size_GB\tnon_orig_saver_size_GB") - ) # "num_nar_obj_ids\tstatic_narratives_count\tstatic_narratives_views\tunique_object_types_count") From aa84b18f587f6941f55539236cc675d15340a7f9 Mon Sep 17 00:00:00 2001 From: Jason Baumohl Date: Thu, 10 Oct 2024 00:37:08 +0000 Subject: [PATCH 10/11] more git cleanup commits --- bin/dump_get_copy_info_for_narratives.sh | 3 +++ source/daily_cron_jobs/upload_app_stats.py | 4 ++-- source/daily_cron_jobs/upload_blobstore_details.py | 10 +++++----- source/daily_cron_jobs/upload_blobstore_stats.py | 4 ++-- .../upload_elasticsearch_usersmry_stats.py | 4 ++-- 5 files changed, 14 insertions(+), 11 deletions(-) create mode 100755 bin/dump_get_copy_info_for_narratives.sh diff --git a/bin/dump_get_copy_info_for_narratives.sh b/bin/dump_get_copy_info_for_narratives.sh new file mode 100755 index 0000000..55d639d --- /dev/null +++ b/bin/dump_get_copy_info_for_narratives.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +python custom_scripts/get_copy_info_for_narratives.py diff --git a/source/daily_cron_jobs/upload_app_stats.py b/source/daily_cron_jobs/upload_app_stats.py index 5be45fa..4e90f96 100644 --- a/source/daily_cron_jobs/upload_app_stats.py +++ b/source/daily_cron_jobs/upload_app_stats.py @@ -7,8 +7,8 @@ print("############################################") print("App Stats Upload (UTC): " + str(datetime.datetime.utcnow())) start_time = time.time() -#start_date = "2021-06-01" -#end_date = "2022-07-20" +#start_date = "2023-07-27" +#end_date = "2023-08-01" #methods_upload_app_stats.upload_user_app_stats(start_date, end_date) methods_upload_app_stats.upload_user_app_stats() print("Uploading app stats took ", time.time() - start_time, " seconds to run") diff --git 
a/source/daily_cron_jobs/upload_blobstore_details.py b/source/daily_cron_jobs/upload_blobstore_details.py index a30d1c4..1f0e798 100644 --- a/source/daily_cron_jobs/upload_blobstore_details.py +++ b/source/daily_cron_jobs/upload_blobstore_details.py @@ -14,11 +14,11 @@ start_time = time.time() -start_date = "2024-09-07" -end_date = "2024-09-28" -methods_upload_blobstore_details.process_blobstore_details_data(start_date,end_date) -#methods_upload_blobstore_details.process_blobstore_details_data() -#print("Uploading blobstore details took ", time.time() - start_time, " seconds to run") +#start_date = "2024-09-07" +#end_date = "2024-10-28" +#methods_upload_blobstore_details.process_blobstore_details_data(start_date,end_date) +methods_upload_blobstore_details.process_blobstore_details_data() +print("Uploading blobstore details took ", time.time() - start_time, " seconds to run") start_date=datetime.datetime.combine(yesterday, datetime.datetime.min.time()) diff --git a/source/daily_cron_jobs/upload_blobstore_stats.py b/source/daily_cron_jobs/upload_blobstore_stats.py index 3f3fcc2..ba4b2c1 100644 --- a/source/daily_cron_jobs/upload_blobstore_stats.py +++ b/source/daily_cron_jobs/upload_blobstore_stats.py @@ -7,8 +7,8 @@ print("############################################") print("Blobstore Stats Upload (UTC): " + str(datetime.datetime.utcnow())) start_time = time.time() -#start_date = "2021-09-10" -#end_date = "2021-10-31" +#start_date = "2023-07-27" +#end_date = "2023-08-01" #methods_upload_blobstore_stats.process_blobstore_stats_data(start_date,end_date) methods_upload_blobstore_stats.process_blobstore_stats_data() print("Uploading blobstore stats took ", time.time() - start_time, " seconds to run") diff --git a/source/daily_cron_jobs/upload_elasticsearch_usersmry_stats.py b/source/daily_cron_jobs/upload_elasticsearch_usersmry_stats.py index 881b0d6..40013bf 100644 --- a/source/daily_cron_jobs/upload_elasticsearch_usersmry_stats.py +++ b/source/daily_cron_jobs/upload_elasticsearch_usersmry_stats.py @@ -13,8 +13,8 @@ start_time = time.time() # start_date = "month-day-year" # end_date = "month-day-year" -#start_date = "10-10-2021" -#end_date = "10-28-2021" +#start_date = "07-27-2023" +#end_date = "08-01-2021" #return_capture = methods_upload_elasticsearch_sumrydicts.elastic_summary_dictionaries( # start_date, end_date #) From 11ef6dbd4f3987d53b5d3a4e30591526bbb8b815 Mon Sep 17 00:00:00 2001 From: Jason Baumohl Date: Thu, 10 Oct 2024 00:55:24 +0000 Subject: [PATCH 11/11] master cron job including blobstore_details --- bin/master_cron_shell.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/master_cron_shell.sh b/bin/master_cron_shell.sh index 2f05b37..ef60061 100755 --- a/bin/master_cron_shell.sh +++ b/bin/master_cron_shell.sh @@ -14,6 +14,8 @@ python daily_cron_jobs/upload_public_narratives_count.py python daily_cron_jobs/upload_user_orcid_count.py +python daily_cron_jobs/upload_blobstore_details.py + python daily_cron_jobs/make_reporting_tables.py
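
Note on the EE2 date chunking in source/monthly_cron_jobs/get_downloaders_lookup_ongoing.py (PATCH 07): pull_downloading_jobs() builds the per-month (begin, end) millisecond windows for ee2.check_jobs_date_range_for_all with a twelve-branch if/elif, and the hand-written ranges are slightly uneven (the February window runs through March 1, the March window starts on March 2, and the August window stops at August 30). The helper below is a minimal sketch, not part of the patch series, showing one way the same windows could be derived with calendar.monthrange; it assumes true calendar-month boundaries were the intent, and the function name month_windows_ms is illustrative only.

import calendar
from datetime import datetime

def month_windows_ms(year_to_do):
    """Return [(begin_ms, end_ms), ...] for each month of year_to_do,
    mirroring the int(datetime(...).timestamp()) * 1000 convention used
    in pull_downloading_jobs()."""
    windows = []
    for month in range(1, 13):
        # monthrange returns (weekday_of_first_day, number_of_days_in_month)
        last_day = calendar.monthrange(year_to_do, month)[1]
        begin = int(datetime(year_to_do, month, 1, 0, 0).timestamp()) * 1000
        end = int(datetime(year_to_do, month, last_day, 23, 59).timestamp()) * 1000
        windows.append((begin, end))
    return windows

With boundaries generated this way, every day of the year falls inside exactly one window, which avoids the gap around August 31 in the hand-written ranges.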