Skip to content

Commit

Permalink
Cache earlier years to reduce db size
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisjwood16 committed Nov 19, 2024
1 parent 64db18c commit d63af5a
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 4 deletions.
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,4 @@ __pycache__/

# Linux trash directories
.Trash-*/
/data
TRUD.secrets
/measures_to_test
notebooks/new_bnf_codes.csv
33 changes: 32 additions & 1 deletion src/bsa_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,28 @@ def check_cache(self):
# Return a list of RESOURCE_FROM values
return [row[0] for row in result]

def check_pre_cache(self):
    """Return every distinct pre-cache RESOURCE_FROM label in the cache.

    Pre-cache rows are those whose RESOURCE_FROM starts with 'EPD_pre_'.

    Returns:
        list[str]: the distinct matching RESOURCE_FROM values.
    """
    sql = (
        "SELECT DISTINCT RESOURCE_FROM FROM cache "
        "WHERE RESOURCE_FROM LIKE 'EPD_pre_%'"
    )
    with sqlite3.connect(self.cache_db_path) as conn:
        rows = conn.execute(sql).fetchall()
    # Each row is a 1-tuple; flatten to a plain list of strings.
    return [resource_from for (resource_from,) in rows]

def return_pre_cache_year(self):
    """Return the year component of the pre-cache RESOURCE_FROM label.

    Pre-cache rows are stored under a label of the form 'EPD_pre_<year>'
    (see db_consolidate.py, which writes that label). This queries the
    distinct pre-cache labels and extracts the year from the first one.

    Returns:
        str: the year as a string, e.g. '2024' (callers convert with int()).

    Raises:
        ValueError: if the cache contains no 'EPD_pre_%' rows. The previous
            implementation raised an opaque IndexError in that case.
    """
    with sqlite3.connect(self.cache_db_path) as conn:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT DISTINCT RESOURCE_FROM FROM cache WHERE RESOURCE_FROM LIKE 'EPD_pre_%'
        ''')
        result = cursor.fetchall()
    if not result:
        raise ValueError(
            "No 'EPD_pre_' entries found in cache; cannot determine pre-cache year"
        )
    # Label format is 'EPD_pre_<year>'; the year is the third '_'-separated part.
    return result[0][0].split('_')[2]

def fetch_cache(self, resource_from_list):
"""Fetch data from the cache where RESOURCE_FROM is in the provided list and remove duplicate rows."""
query = '''
Expand Down Expand Up @@ -181,8 +203,17 @@ def resource_name_list_filter(self):
(self.resources_table['date'] >= self.resource_from) &
(self.resources_table['date'] <= self.resource_to)
]

pre_cache_year = int(CACHE_MANAGER_OBJ.return_pre_cache_year())

# Create the new column 'modified_table_name'
filtered_df['modified_table_name'] = filtered_df.apply(
lambda row: 'EPD_pre_2024' if row['date'].year < pre_cache_year else row['bq_table_name'],
axis=1
)

self.resource_name_list = filtered_df['bq_table_name'].tolist()
#self.resource_name_list = filtered_df['bq_table_name'].tolist()
self.resource_name_list = list(set(filtered_df['modified_table_name'].tolist()))
self.date_list = filtered_df['date'].tolist()

def return_date_list(self):
Expand Down
48 changes: 48 additions & 0 deletions src/db_consolidate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import sqlite3

# Parameters
db_path = "cache_db.sqlite"
year_to_exclude = 2024  # Rows from before this year are folded into one label


def consolidate_cache(db_path, year_to_exclude):
    """Collapse all cache rows from before *year_to_exclude* into a single
    'EPD_pre_<year>' RESOURCE_FROM label, then drop the resulting duplicates
    and VACUUM the database to reclaim space.

    Args:
        db_path: path to the SQLite cache database.
        year_to_exclude: rows whose RESOURCE_FROM does not start with
            'EPD_<year>' are relabelled to 'EPD_pre_<year>'.

    Side effects:
        Rewrites the cache table in place and prints progress to stdout.
    """
    connection = sqlite3.connect(db_path)
    try:
        cursor = connection.cursor()

        # Count total number of rows before consolidation.
        cursor.execute("SELECT COUNT(*) FROM cache;")
        total_rows = cursor.fetchone()[0]
        print(f"Total number of rows: {total_rows}")

        # Relabel every row that is not from the excluded year.
        # The year is interpolated directly into the SQL: it is an int
        # controlled by this script (not untrusted input), and LIKE
        # patterns built from it cannot be fully parameterised anyway.
        cursor.execute(f"""
            UPDATE cache
            SET RESOURCE_FROM = 'EPD_pre_{year_to_exclude}'
            WHERE RESOURCE_FROM NOT LIKE 'EPD_{year_to_exclude}%';
        """)
        connection.commit()
        print(f"Number of rows updated: {cursor.rowcount}")

        # After relabelling, rows that differed only in RESOURCE_FROM are
        # now duplicates; keep the first occurrence (lowest rowid) of each.
        cursor.execute("""
            DELETE FROM cache
            WHERE rowid NOT IN (
                SELECT MIN(rowid)
                FROM cache
                GROUP BY RESOURCE_FROM, BNF_CODE, BNF_DESCRIPTION, CHEMICAL_SUBSTANCE_BNF_DESCR
            );
        """)
        connection.commit()
        print(f"Number of duplicate rows removed: {cursor.rowcount}")

        # Count total number of rows after consolidation.
        cursor.execute("SELECT COUNT(*) FROM cache;")
        total_rows = cursor.fetchone()[0]
        print(f"Total number of rows: {total_rows}")

        # Vacuum the database to reclaim space (must run outside any
        # open transaction, hence after the commits above).
        print("Running VACUUM to reclaim space...")
        cursor.execute("VACUUM;")
    finally:
        # Always close, even on error — the original leaked the connection
        # whenever any statement raised.
        connection.close()


# Guarded entry point: importing this module must not mutate the database.
if __name__ == "__main__":
    consolidate_cache(db_path, year_to_exclude)

0 comments on commit d63af5a

Please sign in to comment.