parametrize target directory

mit-ccc · Sep 24, 2024 · ae9d201 · ae9d201
1 parent eff69b1
commit ae9d201
Show file tree

Hide file tree

Showing 11 changed files with 105 additions and 42 deletions.
diff --git a/images/comments/run.py b/images/comments/run.py
@@ -38,7 +38,10 @@ def __init__(self, channel_id, outdir='data', cached_videos=False,
         self.youtube_api_key = os.environ['YOUTUBE_API_KEY']
         self.api = build('youtube', 'v3', developerKey=self.youtube_api_key)
 
-        os.makedirs(self.comments_path, exist_ok=True)
+        try:
+            os.makedirs(self.comments_path, exist_ok=True)
+        except PermissionError:  # NFS
+            pass
 
     @property
     def video_path(self):
@@ -177,24 +180,36 @@ def run(self):
 def parse_args():
     parser = argparse.ArgumentParser(description='Fetch YouTube video/comment data')
 
-    parser.add_argument('channel_id', nargs='?', default=None, help='YouTube Channel ID')
+    parser.add_argument('channel_id', nargs='?', default=None,
+                        help='YouTube Channel ID')
+    parser.add_argument('--outdir', '-o', default=None,
+                        help='Output directory')
+
+    parser.add_argument('--cached_videos', '-c', action='store_true',
+                        help='Avoid fetching new videos (comments only)')
+    parser.add_argument('--verbose', '-d', action='store_true',
+                        help='Debug output')
+    parser.add_argument('--progress', '-p', action='store_true',
+                        help='Progress bar')
+    parser.add_argument('--new-threshold-days', '-n', default=None,
+                        help='Always refresh comments for videos newer than this (in days)')
 
-    parser.add_argument('--cached_videos', '-c', action='store_true', help='Avoid fetching new videos (comments only)')
-    parser.add_argument('--outdir', '-o', default='data', help='Output directory')
-    parser.add_argument('--verbose', '-d', action='store_true', help='Debug output')
-    parser.add_argument('--progress', '-p', action='store_true', help='Progress bar')
-    parser.add_argument('--new-threshold-days', '-n', default=None, help='Always refresh comments for videos newer than this (in days)')
+    args = parser.parse_args()
+
+    if args.channel_id is None:
+        assert 'CHANNEL_ID' in os.environ.keys()
+        setattr(args, 'channel_id', os.environ.get('CHANNEL_ID'))
 
-    return parser.parse_args()
+    if args.outdir is None:
+        outdir = os.getenv('DATA_DIR', 'data')
+        setattr(args, 'outdir', outdir)
+
+    return args
 
 
 if __name__ == '__main__':
     args = vars(parse_args())
 
     ut.log_setup('DEBUG' if args.pop('verbose') else None)
 
-    if args['channel_id'] is None:
-        assert 'CHANNEL_ID' in os.environ.keys()
-        args['channel_id'] = os.environ.get('CHANNEL_ID')
-
     ChannelFetch(**args).run()
diff --git a/images/sentiment-topic/sentiment.py b/images/sentiment-topic/sentiment.py
@@ -43,7 +43,10 @@ def __init__(self, data, output_dir, outfile_name='sentiment-scores.csv',
         self.autocast = autocast
         self.data_parallel = data_parallel
 
-        os.makedirs(output_dir, exist_ok=True)
+        try:
+            os.makedirs(output_dir, exist_ok=True)
+        except PermissionError:  # NFS
+            pass
 
         self._cache = self._load_cache()
 
@@ -160,15 +163,21 @@ def process(self):
     seed = int(os.environ.get('SEED', '42'))
     ut.seed_everything(seed)
 
-    os.makedirs('data', exist_ok=True)
+    data_dir = os.getenv('DATA_DIR', 'data')
+    output_dir = os.path.join(data_dir, 'comments')
+
+    try:
+        os.makedirs(data_dir, exist_ok=True)
+    except PermissionError:  # NFS
+        pass
 
     data = load_comments_from_json(
-        rootpath='data/',
+        rootpath=data_dir,
         channel_id=os.getenv('CHANNEL_ID', None),
         playlist_id=os.getenv('PLAYLIST_ID', None),
     )
 
     for d in data:
         d.pop('video_id')
 
-    SentimentAnalyzer(data=data, output_dir='data/comments/').process()
+    SentimentAnalyzer(data=data, output_dir=output_dir).process()
diff --git a/images/sentiment-topic/topic-embeds.py b/images/sentiment-topic/topic-embeds.py
@@ -42,8 +42,15 @@ def __init__(self, data, output_dir, cache_dir='sentence-embeds-cache',
         self.sort_length = sort_length
         self.autocast = autocast
 
-        os.makedirs(output_dir, exist_ok=True)
-        os.makedirs(self._cache_path, exist_ok=True)
+        try:
+            os.makedirs(output_dir, exist_ok=True)
+        except PermissionError:  # NFS
+            pass
+
+        try:
+            os.makedirs(self._cache_path, exist_ok=True)
+        except PermissionError:  # NFS
+            pass
 
         self._cached = self._load_cache_state()
         self.data = [r for r in self.data if not self._is_cached(r['id'])]
@@ -190,10 +197,16 @@ def process(self):
     seed = int(os.environ.get('SEED', '42'))
     ut.seed_everything(seed)
 
-    os.makedirs('data', exist_ok=True)
+    data_dir = os.getenv('DATA_DIR', 'data')
+    output_dir = os.path.join(data_dir, 'comment-topics')
+
+    try:
+        os.makedirs(data_dir, exist_ok=True)
+    except PermissionError:  # NFS
+        pass
 
     data = load_comments_from_json(
-        rootpath='data/',
+        rootpath=data_dir,
         channel_id=os.getenv('CHANNEL_ID', None),
         playlist_id=os.getenv('PLAYLIST_ID', None),
         full_only=True,
@@ -202,7 +215,4 @@ def process(self):
     for d in data:
         d.pop('video_id')
 
-    SentenceEmbedder(
-        data=data,
-        output_dir='data/comment-topics/',
-    ).process()
+    SentenceEmbedder(data=data, output_dir=output_dir).process()
diff --git a/images/sentiment-topic/topic-hdbscan-predict.py b/images/sentiment-topic/topic-hdbscan-predict.py
@@ -20,14 +20,19 @@
     seed = int(os.environ.get('SEED', '42'))
     ut.seed_everything(seed)
 
-    with open('data/comment-topics/umap-embeds-50d.npy', 'rb') as f:
+    data_dir = os.getenv('DATA_DIR', 'data')
+
+    path_u50d = os.path.join(data_dir, 'comment-topics/umap-embeds-50d.npy')
+    with open(path_u50d, 'rb') as f:
         umap_embeds_50d = np.load(f)
 
-    with open('data/comment-topics/hdbscan-clusterer-umap-50d.pkl', 'rb') as f:
+    path_c50d = os.path.join(data_dir, 'comment-topics/hdbscan-clusterer-umap-50d.pkl')
+    with open(path_c50d, 'rb') as f:
         clusterer = pickle.load(f)
 
     labels = approximate_predict(clusterer, umap_embeds_50d)[0]
 
     with ut.DelayedKeyboardInterrupt():
-        with open('data/comment-topics/hdbscan-labels-umap-50d.npy', 'wb') as f:
+        path_l50d = os.path.join(data_dir, 'comment-topics/hdbscan-labels-umap-50d.npy')
+        with open(path_l50d, 'wb') as f:
             np.save(f, labels)
diff --git a/images/sentiment-topic/topic-hdbscan-train.py b/images/sentiment-topic/topic-hdbscan-train.py
@@ -20,7 +20,9 @@
     seed = int(os.environ.get('SEED', '42'))
     ut.seed_everything(seed)
 
-    umap_embeds_50d_path = 'data/comment-topics/umap-embeds-50d.npy'
+    data_dir = os.getenv('DATA_DIR', 'data')
+
+    umap_embeds_50d_path = os.path.join(data_dir, 'comment-topics/umap-embeds-50d.npy')
     with open(umap_embeds_50d_path, 'rb') as f:
         umap_embeds_50d = np.load(f)
 
@@ -38,5 +40,6 @@
     clusterer.fit(umap_embeds_50d)
 
     with ut.DelayedKeyboardInterrupt():
-        with open('data/comment-topics/hdbscan-clusterer-umap-50d.pkl', 'wb') as f:
+        path_c50d = os.path.join(data_dir, 'comment-topics/hdbscan-clusterer-umap-50d.pkl')
+        with open(path_c50d, 'wb') as f:
             pickle.dump(clusterer, f)
diff --git a/images/sentiment-topic/topic-umap.py b/images/sentiment-topic/topic-umap.py
@@ -24,12 +24,14 @@
     seed = int(os.environ.get('SEED', '42'))
     ut.seed_everything(seed)
 
+    data_dir = os.getenv('DATA_DIR', 'data')
+
     #
     # Sample to train on
     #
 
     sample = load_comments_from_json(
-        rootpath='data/',
+        rootpath=data_dir,
         channel_id=os.getenv('CHANNEL_ID', None),
         playlist_id=os.getenv('PLAYLIST_ID', None),
     )
@@ -42,20 +44,21 @@
         .tolist()
     sample = set(sample)
 
-    ids = pd.read_csv('data/comment-topics/sentence-embeds-ids.csv')['id']
+    embeds_ids_path = os.path.join(data_dir, 'comment-topics/sentence-embeds-ids.csv')
+    ids = pd.read_csv(embeds_ids_path)['id']
     train_mask = np.asarray([i in sample for i in ids.to_numpy().tolist()])
     logger.info(f'Training on {train_mask.sum()} samples')
 
-    ids.loc[train_mask].to_csv('data/comment-topics/umap-hdbscan-sample-ids.csv', index=False)
+    sample_ids_path = os.path.join(data_dir, 'comment-topics/umap-hdbscan-sample-ids.csv')
+    ids.loc[train_mask].to_csv(sample_ids_path, index=False)
 
     #
     # 50d UMAP
     #
 
     logger.info('50d UMAP')
-    umap_embeds_50d_path = 'data/comment-topics/umap-embeds-50d.npy'
 
-    embeds_file = 'data/comment-topics/sentence-embeds.pt'
+    embeds_file = os.path.join(data_dir, 'comment-topics/sentence-embeds.pt')
     with open(embeds_file, 'rb') as obj:
         embeds = torch.load(obj, 'cpu').float().numpy()[train_mask, ...]
 
@@ -76,5 +79,6 @@
     umap_embeds_50d = umap_model_50d.transform(embeds)
 
     with ut.DelayedKeyboardInterrupt():
+        umap_embeds_50d_path = os.path.join(data_dir, 'comment-topics/umap-embeds-50d.npy')
         with open(umap_embeds_50d_path, 'wb') as f:
             np.save(f, umap_embeds_50d)
diff --git a/images/streamlit/app.py b/images/streamlit/app.py
@@ -32,9 +32,7 @@
 
 logger = logging.getLogger(__name__)
 
-
-DB_PATH = 'data/youtube.db'
-SUMMARY_SUGGEST_CACHE_DIR = './data/summary-suggest-gpt'
+DATA_DIR, DB_PATH, SUMMARY_SUGGEST_CACHE_DIR = ut.get_data_paths()
 
 
 def stmt_to_pandas(stmt, index_col=None):

diff --git a/images/streamlit/file_cache.py b/images/streamlit/file_cache.py
@@ -8,7 +8,11 @@ def __init__(self, cache_dir=None, **kwargs):
         _cache_keys = []
 
         if cache_dir is not None:
-            os.makedirs(cache_dir, exist_ok=True)
+            try:
+                os.makedirs(cache_dir, exist_ok=True)
+            except PermissionError:  # NFS
+                pass
+
             _cache_keys += os.listdir(cache_dir)
 
         self.cache_dir = cache_dir

diff --git a/images/streamlit/main.py b/images/streamlit/main.py
@@ -6,12 +6,14 @@
 import utils as ut
 from app import run
 
-
 logger = logging.getLogger(__name__)
 
 
 if __name__ == '__main__':
-    os.makedirs('data', exist_ok=True)
+    try:
+        os.makedirs(os.getenv('DATA_DIR', 'data'), exist_ok=True)
+    except PermissionError:  # NFS
+        pass
 
     ut.log_setup()
     ut.seed_everything()

diff --git a/images/streamlit/refresh.py b/images/streamlit/refresh.py
@@ -5,15 +5,19 @@
 
 import utils as ut
 from models import get_db
-from app import DB_PATH, SUMMARY_SUGGEST_CACHE_DIR
 from video_text2text import VideoSummary, VideoSuggestions, ClusterShortName
 
 
 logger = logging.getLogger(__name__)
 
 
 if __name__ == '__main__':
-    os.makedirs('data', exist_ok=True)
+    DATA_DIR, DB_PATH, SUMMARY_SUGGEST_CACHE_DIR = ut.get_data_paths()
+
+    try:
+        os.makedirs(DATA_DIR, exist_ok=True)
+    except PermissionError:  # NFS
+        pass
 
     ut.log_setup()
     ut.seed_everything()

diff --git a/images/streamlit/utils.py b/images/streamlit/utils.py
@@ -39,6 +39,15 @@ def seed_everything():
     np.random.seed(seed)
 
 
+def get_data_paths():
+    """Return ``(data_dir, db_path, cache_dir)``; root is $DATA_DIR or 'data'."""
+    data_dir = os.getenv('DATA_DIR', 'data')
+    db_path = os.path.join(data_dir, 'youtube.db')
+    cache_dir = os.path.join(data_dir, 'summary-suggest-gpt')
+    # Callers unpack three values, so the data directory itself comes first.
+    return data_dir, db_path, cache_dir
+
+
 def remove_punctuation(text):
     for punct in string.punctuation:
         if punct == "'":