Skip to content

Commit

Permalink
split backfill to file creation then copy from files
Browse files Browse the repository at this point in the history
  • Loading branch information
yuenmichelle1 committed Oct 17, 2023
1 parent 73cfae1 commit f8fa8e3
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 16 deletions.
27 changes: 27 additions & 0 deletions scripts/copy_classifications_from_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import os
import psycopg
from datetime import datetime
import math

TIMESCALE_CONNECTION = os.getenv('TIMESCALE_CONNECTION')
TIMESCALE_PORT = os.getenv('TIMESCALE_PORT')
ERAS_DB = os.getenv('ERAS_DB')
ERAS_USER = os.getenv('ERAS_USER')
ERAS_PW = os.getenv('ERAS_PW')
FIRST_INGESTED_CLASSIFICATION_ID = os.getenv('FIRST_INGESTED_CLASSIFICATION_ID')

limit = 10000000
num_files = math.ceil(FIRST_INGESTED_CLASSIFICATION_ID/limit)

start_time = datetime.now()
print("TIMESCALE START COPY FROM CSV BEFORE TIME =", start_time)

output_file_no = 0
while output_file_no <= num_files:
with psycopg.connect(f"host={TIMESCALE_CONNECTION} port={TIMESCALE_PORT} dbname={ERAS_DB} user={ERAS_USER} password={ERAS_PW} sslmode=require") as timescale_db_conn:
with timescale_db_conn.cursor(name="timescale_cursor").copy("COPY classification_events FROM STDIN DELIMITER ',' CSV HEADER") as timescale_copy:
timescale_copy.write(open(f"prod_classifications_{output_file_no}.csv").read())
output_file_no += 1

finish_time = datetime.now()
print("CLASSIFICATIONS TIMESCALE backfill AFTER Time =", finish_time)
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,6 @@
PANOPTES_DB = os.getenv('PANOPTES_DB')
PANOPTES_USER = os.getenv('PANOPTES_USER')
PANOPTES_PW = os.getenv('PANOPTES_PW')
TIMESCALE_CONNECTION = os.getenv('TIMESCALE_CONNECTION')
TIMESCALE_PORT = os.getenv('TIMESCALE_PORT')
ERAS_DB = os.getenv('ERAS_DB')
ERAS_USER = os.getenv('ERAS_USER')
ERAS_PW = os.getenv('ERAS_PW')
FIRST_INGESTED_CLASSIFICATION_ID = os.getenv('FIRST_INGESTED_CLASSIFICATION_ID')

start_time = datetime.now()
Expand Down Expand Up @@ -41,16 +36,5 @@

finish_copy_time = datetime.now()
print("PANOPTES Classification Copy over at ", finish_copy_time)
print("TIMESCALE START COPY FROM CSV")

output_file_no = 0
while output_file_no <= num_files:
with psycopg.connect(f"host={TIMESCALE_CONNECTION} port={TIMESCALE_PORT} dbname={ERAS_DB} user={ERAS_USER} password={ERAS_PW} sslmode=require") as timescale_db_conn:
with timescale_db_conn.cursor(name="timescale_cursor").copy("COPY classification_events FROM STDIN DELIMITER ',' CSV HEADER") as timescale_copy:
timescale_copy.write(open(f"prod_classifications_{output_file_no}.csv").read())
output_file_no += 1

finish_time = datetime.now()
print("CLASSIFICATIONS TIMESCALE backfill AFTER Time =", finish_time)


0 comments on commit f8fa8e3

Please sign in to comment.