From ae9d20160f70a9f5bf8393754becd4ab0d379bbf Mon Sep 17 00:00:00 2001 From: William Brannon Date: Tue, 24 Sep 2024 07:02:01 +0000 Subject: [PATCH] parametrize target directory --- images/comments/run.py | 39 +++++++++++++------ images/sentiment-topic/sentiment.py | 17 ++++++-- images/sentiment-topic/topic-embeds.py | 26 +++++++++---- .../sentiment-topic/topic-hdbscan-predict.py | 11 ++++-- images/sentiment-topic/topic-hdbscan-train.py | 7 +++- images/sentiment-topic/topic-umap.py | 14 ++++--- images/streamlit/app.py | 4 +- images/streamlit/file_cache.py | 6 ++- images/streamlit/main.py | 6 ++- images/streamlit/refresh.py | 8 +++- images/streamlit/utils.py | 9 +++++ 11 files changed, 105 insertions(+), 42 deletions(-) diff --git a/images/comments/run.py b/images/comments/run.py index 62b2007..4b2fe0b 100755 --- a/images/comments/run.py +++ b/images/comments/run.py @@ -38,7 +38,10 @@ def __init__(self, channel_id, outdir='data', cached_videos=False, self.youtube_api_key = os.environ['YOUTUBE_API_KEY'] self.api = build('youtube', 'v3', developerKey=self.youtube_api_key) - os.makedirs(self.comments_path, exist_ok=True) + try: + os.makedirs(self.comments_path, exist_ok=True) + except PermissionError: # NFS + pass @property def video_path(self): @@ -177,15 +180,31 @@ def run(self): def parse_args(): parser = argparse.ArgumentParser(description='Fetch YouTube video/comment data') - parser.add_argument('channel_id', nargs='?', default=None, help='YouTube Channel ID') + parser.add_argument('channel_id', nargs='?', default=None, + help='YouTube Channel ID') + parser.add_argument('--outdir', '-o', default=None, + help='Output directory') + + parser.add_argument('--cached_videos', '-c', action='store_true', + help='Avoid fetching new videos (comments only)') + parser.add_argument('--verbose', '-d', action='store_true', + help='Debug output') + parser.add_argument('--progress', '-p', action='store_true', + help='Progress bar') + parser.add_argument('--new-threshold-days', 
'-n', default=None, + help='Always refresh comments for videos newer than this (in days)') - parser.add_argument('--cached_videos', '-c', action='store_true', help='Avoid fetching new videos (comments only)') - parser.add_argument('--outdir', '-o', default='data', help='Output directory') - parser.add_argument('--verbose', '-d', action='store_true', help='Debug output') - parser.add_argument('--progress', '-p', action='store_true', help='Progress bar') - parser.add_argument('--new-threshold-days', '-n', default=None, help='Always refresh comments for videos newer than this (in days)') + args = parser.parse_args() + + if args.channel_id is None: + assert 'CHANNEL_ID' in os.environ.keys() + setattr(args, 'channel_id', os.environ.get('CHANNEL_ID')) - return parser.parse_args() + if args.outdir is None: + outdir = os.getenv('DATA_DIR', 'data') + setattr(args, 'outdir', outdir) + + return args if __name__ == '__main__': @@ -193,8 +212,4 @@ def parse_args(): ut.log_setup('DEBUG' if args.pop('verbose') else None) - if args['channel_id'] is None: - assert 'CHANNEL_ID' in os.environ.keys() - args['channel_id'] = os.environ.get('CHANNEL_ID') - ChannelFetch(**args).run() diff --git a/images/sentiment-topic/sentiment.py b/images/sentiment-topic/sentiment.py index 9615a43..882c1eb 100755 --- a/images/sentiment-topic/sentiment.py +++ b/images/sentiment-topic/sentiment.py @@ -43,7 +43,10 @@ def __init__(self, data, output_dir, outfile_name='sentiment-scores.csv', self.autocast = autocast self.data_parallel = data_parallel - os.makedirs(output_dir, exist_ok=True) + try: + os.makedirs(output_dir, exist_ok=True) + except PermissionError: # NFS + pass self._cache = self._load_cache() @@ -160,10 +163,16 @@ def process(self): seed = int(os.environ.get('SEED', '42')) ut.seed_everything(seed) - os.makedirs('data', exist_ok=True) + data_dir = os.getenv('DATA_DIR', 'data') + output_dir = os.path.join(data_dir, 'comments') + + try: + os.makedirs(data_dir, exist_ok=True) + except 
PermissionError: # NFS + pass data = load_comments_from_json( - rootpath='data/', + rootpath=data_dir, channel_id=os.getenv('CHANNEL_ID', None), playlist_id=os.getenv('PLAYLIST_ID', None), ) @@ -171,4 +180,4 @@ def process(self): for d in data: d.pop('video_id') - SentimentAnalyzer(data=data, output_dir='data/comments/').process() + SentimentAnalyzer(data=data, output_dir=output_dir).process() diff --git a/images/sentiment-topic/topic-embeds.py b/images/sentiment-topic/topic-embeds.py index 09c4553..07d7c86 100755 --- a/images/sentiment-topic/topic-embeds.py +++ b/images/sentiment-topic/topic-embeds.py @@ -42,8 +42,15 @@ def __init__(self, data, output_dir, cache_dir='sentence-embeds-cache', self.sort_length = sort_length self.autocast = autocast - os.makedirs(output_dir, exist_ok=True) - os.makedirs(self._cache_path, exist_ok=True) + try: + os.makedirs(output_dir, exist_ok=True) + except PermissionError: # NFS + pass + + try: + os.makedirs(self._cache_path, exist_ok=True) + except PermissionError: + pass self._cached = self._load_cache_state() self.data = [r for r in self.data if not self._is_cached(r['id'])] @@ -190,10 +197,16 @@ def process(self): seed = int(os.environ.get('SEED', '42')) ut.seed_everything(seed) - os.makedirs('data', exist_ok=True) + data_dir = os.getenv('DATA_DIR', 'data') + output_dir = os.path.join(data_dir, 'comment-topics') + + try: + os.makedirs(data_dir, exist_ok=True) + except PermissionError: # NFS + pass data = load_comments_from_json( - rootpath='data/', + rootpath=data_dir, channel_id=os.getenv('CHANNEL_ID', None), playlist_id=os.getenv('PLAYLIST_ID', None), full_only=True, @@ -202,7 +215,4 @@ def process(self): for d in data: d.pop('video_id') - SentenceEmbedder( - data=data, - output_dir='data/comment-topics/', - ).process() + SentenceEmbedder(data=data, output_dir=output_dir).process() diff --git a/images/sentiment-topic/topic-hdbscan-predict.py b/images/sentiment-topic/topic-hdbscan-predict.py index 07decfb..395ebcf 100755 --- 
a/images/sentiment-topic/topic-hdbscan-predict.py +++ b/images/sentiment-topic/topic-hdbscan-predict.py @@ -20,14 +20,19 @@ seed = int(os.environ.get('SEED', '42')) ut.seed_everything(seed) - with open('data/comment-topics/umap-embeds-50d.npy', 'rb') as f: + data_dir = os.getenv('DATA_DIR', 'data') + + path_u50d = os.path.join(data_dir, 'comment-topics/umap-embeds-50d.npy') + with open(path_u50d, 'rb') as f: umap_embeds_50d = np.load(f) - with open('data/comment-topics/hdbscan-clusterer-umap-50d.pkl', 'rb') as f: + path_c50d = os.path.join(data_dir, 'comment-topics/hdbscan-clusterer-umap-50d.pkl') + with open(path_c50d, 'rb') as f: clusterer = pickle.load(f) labels = approximate_predict(clusterer, umap_embeds_50d)[0] with ut.DelayedKeyboardInterrupt(): - with open('data/comment-topics/hdbscan-labels-umap-50d.npy', 'wb') as f: + path_l50d = os.path.join(data_dir, 'comment-topics/hdbscan-labels-umap-50d.npy') + with open(path_l50d, 'wb') as f: np.save(f, labels) diff --git a/images/sentiment-topic/topic-hdbscan-train.py b/images/sentiment-topic/topic-hdbscan-train.py index 22fd058..484741d 100755 --- a/images/sentiment-topic/topic-hdbscan-train.py +++ b/images/sentiment-topic/topic-hdbscan-train.py @@ -20,7 +20,9 @@ seed = int(os.environ.get('SEED', '42')) ut.seed_everything(seed) - umap_embeds_50d_path = 'data/comment-topics/umap-embeds-50d.npy' + data_dir = os.getenv('DATA_DIR', 'data') + + umap_embeds_50d_path = os.path.join(data_dir, 'comment-topics/umap-embeds-50d.npy') with open(umap_embeds_50d_path, 'rb') as f: umap_embeds_50d = np.load(f) @@ -38,5 +40,6 @@ clusterer.fit(umap_embeds_50d) with ut.DelayedKeyboardInterrupt(): - with open('data/comment-topics/hdbscan-clusterer-umap-50d.pkl', 'wb') as f: + path_c50d = os.path.join(data_dir, 'comment-topics/hdbscan-clusterer-umap-50d.pkl') + with open(path_c50d, 'wb') as f: pickle.dump(clusterer, f) diff --git a/images/sentiment-topic/topic-umap.py b/images/sentiment-topic/topic-umap.py index 40bd6e8..c029f54 100755 
--- a/images/sentiment-topic/topic-umap.py +++ b/images/sentiment-topic/topic-umap.py @@ -24,12 +24,14 @@ seed = int(os.environ.get('SEED', '42')) ut.seed_everything(seed) + data_dir = os.getenv('DATA_DIR', 'data') + # # Sample to train on # sample = load_comments_from_json( - rootpath='data/', + rootpath=data_dir, channel_id=os.getenv('CHANNEL_ID', None), playlist_id=os.getenv('PLAYLIST_ID', None), ) @@ -42,20 +44,21 @@ .tolist() sample = set(sample) - ids = pd.read_csv('data/comment-topics/sentence-embeds-ids.csv')['id'] + embeds_ids_path = os.path.join(data_dir, 'comment-topics/sentence-embeds-ids.csv') + ids = pd.read_csv(embeds_ids_path)['id'] train_mask = np.asarray([i in sample for i in ids.to_numpy().tolist()]) logger.info(f'Training on {train_mask.sum()} samples') - ids.loc[train_mask].to_csv('data/comment-topics/umap-hdbscan-sample-ids.csv', index=False) + sample_ids_path = os.path.join(data_dir, 'comment-topics/umap-hdbscan-sample-ids.csv') + ids.loc[train_mask].to_csv(sample_ids_path, index=False) # # 50d UMAP # logger.info('50d UMAP') - umap_embeds_50d_path = 'data/comment-topics/umap-embeds-50d.npy' - embeds_file = 'data/comment-topics/sentence-embeds.pt' + embeds_file = os.path.join(data_dir, 'comment-topics/sentence-embeds.pt') with open(embeds_file, 'rb') as obj: embeds = torch.load(obj, 'cpu').float().numpy()[train_mask, ...] 
@@ -76,5 +79,6 @@ umap_embeds_50d = umap_model_50d.transform(embeds) with ut.DelayedKeyboardInterrupt(): + umap_embeds_50d_path = os.path.join(data_dir, 'comment-topics/umap-embeds-50d.npy') with open(umap_embeds_50d_path, 'wb') as f: np.save(f, umap_embeds_50d) diff --git a/images/streamlit/app.py b/images/streamlit/app.py index 0e0ad28..9c6b4d1 100644 --- a/images/streamlit/app.py +++ b/images/streamlit/app.py @@ -32,9 +32,7 @@ logger = logging.getLogger(__name__) - -DB_PATH = 'data/youtube.db' -SUMMARY_SUGGEST_CACHE_DIR = './data/summary-suggest-gpt' +DATA_DIR, DB_PATH, SUMMARY_SUGGEST_CACHE_DIR = ut.get_data_paths() def stmt_to_pandas(stmt, index_col=None): diff --git a/images/streamlit/file_cache.py b/images/streamlit/file_cache.py index 023eed6..01dfbb2 100644 --- a/images/streamlit/file_cache.py +++ b/images/streamlit/file_cache.py @@ -8,7 +8,11 @@ def __init__(self, cache_dir=None, **kwargs): _cache_keys = [] if cache_dir is not None: - os.makedirs(cache_dir, exist_ok=True) + try: + os.makedirs(cache_dir, exist_ok=True) + except PermissionError: # NFS + pass + _cache_keys += os.listdir(cache_dir) self.cache_dir = cache_dir diff --git a/images/streamlit/main.py b/images/streamlit/main.py index c1e396e..e15325f 100644 --- a/images/streamlit/main.py +++ b/images/streamlit/main.py @@ -6,12 +6,14 @@ import utils as ut from app import run - logger = logging.getLogger(__name__) if __name__ == '__main__': - os.makedirs('data', exist_ok=True) + try: + os.makedirs(os.getenv('DATA_DIR', 'data'), exist_ok=True) + except PermissionError: # NFS + pass ut.log_setup() ut.seed_everything() diff --git a/images/streamlit/refresh.py b/images/streamlit/refresh.py index 0334893..1b529e0 100755 --- a/images/streamlit/refresh.py +++ b/images/streamlit/refresh.py @@ -5,7 +5,6 @@ import utils as ut from models import get_db -from app import DB_PATH, SUMMARY_SUGGEST_CACHE_DIR from video_text2text import VideoSummary, VideoSuggestions, ClusterShortName @@ -13,7 +12,12 @@ if __name__ == 
'__main__': - os.makedirs('data', exist_ok=True) + DATA_DIR, DB_PATH, SUMMARY_SUGGEST_CACHE_DIR = ut.get_data_paths() + + try: + os.makedirs(DATA_DIR, exist_ok=True) + except PermissionError: # NFS + pass ut.log_setup() ut.seed_everything() diff --git a/images/streamlit/utils.py b/images/streamlit/utils.py index f6ffbf8..8d33c41 100644 --- a/images/streamlit/utils.py +++ b/images/streamlit/utils.py @@ -39,6 +39,15 @@ def seed_everything(): np.random.seed(seed) +def get_data_paths(): + data_dir = os.getenv('DATA_DIR', 'data') + + db_path = os.path.join(data_dir, 'youtube.db') + cache_dir = os.path.join(data_dir, 'summary-suggest-gpt') + + return data_dir, db_path, cache_dir # unpacked by callers as DATA_DIR, DB_PATH, SUMMARY_SUGGEST_CACHE_DIR + + def remove_punctuation(text): for punct in string.punctuation: if punct == "'":