Skip to content

Commit

Permalink
parametrize target directory
Browse files Browse the repository at this point in the history
  • Loading branch information
wwbrannon committed Sep 24, 2024
1 parent eff69b1 commit ae9d201
Show file tree
Hide file tree
Showing 11 changed files with 105 additions and 42 deletions.
39 changes: 27 additions & 12 deletions images/comments/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ def __init__(self, channel_id, outdir='data', cached_videos=False,
self.youtube_api_key = os.environ['YOUTUBE_API_KEY']
self.api = build('youtube', 'v3', developerKey=self.youtube_api_key)

os.makedirs(self.comments_path, exist_ok=True)
try:
os.makedirs(self.comments_path, exist_ok=True)
except PermissionError: # NFS
pass

@property
def video_path(self):
Expand Down Expand Up @@ -177,24 +180,36 @@ def run(self):
def parse_args():
    """Parse the command-line options for the YouTube comment fetcher.

    Values that are not given on the command line fall back to the
    environment: ``channel_id`` to ``CHANNEL_ID`` and ``--outdir`` to
    ``DATA_DIR`` (and, failing that, to ``'data'``).

    Returns:
        argparse.Namespace: with attributes ``channel_id``, ``outdir``,
        ``cached_videos``, ``verbose``, ``progress`` and
        ``new_threshold_days``.

    Raises:
        SystemExit: via ``parser.error`` when no channel id is supplied on
        the command line or in ``CHANNEL_ID``.
    """
    parser = argparse.ArgumentParser(description='Fetch YouTube video/comment data')

    # Each option is registered exactly once; registering the same option
    # string twice makes argparse raise a conflict error.
    parser.add_argument('channel_id', nargs='?', default=None,
                        help='YouTube Channel ID')
    parser.add_argument('--outdir', '-o', default=None,
                        help='Output directory')

    parser.add_argument('--cached_videos', '-c', action='store_true',
                        help='Avoid fetching new videos (comments only)')
    parser.add_argument('--verbose', '-d', action='store_true',
                        help='Debug output')
    parser.add_argument('--progress', '-p', action='store_true',
                        help='Progress bar')
    # No type= is given, so a supplied value arrives as a string.
    parser.add_argument('--new-threshold-days', '-n', default=None,
                        help='Always refresh comments for videos newer than this (in days)')

    args = parser.parse_args()

    if args.channel_id is None:
        # Explicit error instead of `assert`, which is stripped under -O.
        if 'CHANNEL_ID' not in os.environ:
            parser.error('channel_id must be given as an argument or via '
                         'the CHANNEL_ID environment variable')
        args.channel_id = os.environ['CHANNEL_ID']

    if args.outdir is None:
        args.outdir = os.getenv('DATA_DIR', 'data')

    # Return the namespace the fallbacks were applied to; parsing a second
    # time here would throw them away.
    return args


if __name__ == '__main__':
    # Work with a plain dict so the options can be forwarded to
    # ChannelFetch as keyword arguments.
    args = vars(parse_args())

    # 'verbose' only selects the log level; it is popped so that it is not
    # passed on to ChannelFetch.
    log_level = 'DEBUG' if args.pop('verbose') else None
    ut.log_setup(log_level)

    # Without a channel id on the command line, take it from the environment.
    channel_missing = args['channel_id'] is None
    if channel_missing:
        assert 'CHANNEL_ID' in os.environ.keys()
        args['channel_id'] = os.environ.get('CHANNEL_ID')

    fetcher = ChannelFetch(**args)
    fetcher.run()
17 changes: 13 additions & 4 deletions images/sentiment-topic/sentiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ def __init__(self, data, output_dir, outfile_name='sentiment-scores.csv',
self.autocast = autocast
self.data_parallel = data_parallel

os.makedirs(output_dir, exist_ok=True)
try:
os.makedirs(output_dir, exist_ok=True)
except PermissionError: # NFS
pass

self._cache = self._load_cache()

Expand Down Expand Up @@ -160,15 +163,21 @@ def process(self):
seed = int(os.environ.get('SEED', '42'))
ut.seed_everything(seed)

os.makedirs('data', exist_ok=True)
data_dir = os.getenv('DATA_DIR', 'data')
output_dir = os.path.join(data_dir, 'comments')

try:
os.makedirs(data_dir, exist_ok=True)
except PermissionError: # NFS
pass

data = load_comments_from_json(
rootpath='data/',
rootpath=data_dir,
channel_id=os.getenv('CHANNEL_ID', None),
playlist_id=os.getenv('PLAYLIST_ID', None),
)

for d in data:
d.pop('video_id')

SentimentAnalyzer(data=data, output_dir='data/comments/').process()
SentimentAnalyzer(data=data, output_dir=output_dir).process()
26 changes: 18 additions & 8 deletions images/sentiment-topic/topic-embeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,15 @@ def __init__(self, data, output_dir, cache_dir='sentence-embeds-cache',
self.sort_length = sort_length
self.autocast = autocast

os.makedirs(output_dir, exist_ok=True)
os.makedirs(self._cache_path, exist_ok=True)
try:
os.makedirs(output_dir, exist_ok=True)
except PermissionError: # NFS
pass

try:
os.makedirs(self._cache_path, exist_ok=True)
except PermissionError:
pass

self._cached = self._load_cache_state()
self.data = [r for r in self.data if not self._is_cached(r['id'])]
Expand Down Expand Up @@ -190,10 +197,16 @@ def process(self):
seed = int(os.environ.get('SEED', '42'))
ut.seed_everything(seed)

os.makedirs('data', exist_ok=True)
data_dir = os.getenv('DATA_DIR', 'data')
output_dir = os.path.join(data_dir, 'comment-topics')

try:
os.makedirs(data_dir, exist_ok=True)
except PermissionError: # NFS
pass

data = load_comments_from_json(
rootpath='data/',
rootpath=data_dir,
channel_id=os.getenv('CHANNEL_ID', None),
playlist_id=os.getenv('PLAYLIST_ID', None),
full_only=True,
Expand All @@ -202,7 +215,4 @@ def process(self):
for d in data:
d.pop('video_id')

SentenceEmbedder(
data=data,
output_dir='data/comment-topics/',
).process()
SentenceEmbedder(data=data, output_dir=output_dir).process()
11 changes: 8 additions & 3 deletions images/sentiment-topic/topic-hdbscan-predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,19 @@
seed = int(os.environ.get('SEED', '42'))
ut.seed_everything(seed)

with open('data/comment-topics/umap-embeds-50d.npy', 'rb') as f:
data_dir = os.getenv('DATA_DIR', 'data')

path_u50d = os.path.join(data_dir, 'comment-topics/umap-embeds-50d.npy')
with open(path_u50d, 'rb') as f:
umap_embeds_50d = np.load(f)

with open('data/comment-topics/hdbscan-clusterer-umap-50d.pkl', 'rb') as f:
path_c50d = os.path.join(data_dir, 'comment-topics/hdbscan-clusterer-umap-50d.pkl')
with open(path_c50d, 'rb') as f:
clusterer = pickle.load(f)

labels = approximate_predict(clusterer, umap_embeds_50d)[0]

with ut.DelayedKeyboardInterrupt():
with open('data/comment-topics/hdbscan-labels-umap-50d.npy', 'wb') as f:
path_l50d = os.path.join(data_dir, 'comment-topics/hdbscan-labels-umap-50d.npy')
with open(path_l50d, 'wb') as f:
np.save(f, labels)
7 changes: 5 additions & 2 deletions images/sentiment-topic/topic-hdbscan-train.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
seed = int(os.environ.get('SEED', '42'))
ut.seed_everything(seed)

umap_embeds_50d_path = 'data/comment-topics/umap-embeds-50d.npy'
data_dir = os.getenv('DATA_DIR', 'data')

umap_embeds_50d_path = os.path.join(data_dir, 'comment-topics/umap-embeds-50d.npy')
with open(umap_embeds_50d_path, 'rb') as f:
umap_embeds_50d = np.load(f)

Expand All @@ -38,5 +40,6 @@
clusterer.fit(umap_embeds_50d)

with ut.DelayedKeyboardInterrupt():
with open('data/comment-topics/hdbscan-clusterer-umap-50d.pkl', 'wb') as f:
path_c50d = os.path.join(data_dir, 'comment-topics/hdbscan-clusterer-umap-50d.pkl')
with open(path_c50d, 'wb') as f:
pickle.dump(clusterer, f)
14 changes: 9 additions & 5 deletions images/sentiment-topic/topic-umap.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@
seed = int(os.environ.get('SEED', '42'))
ut.seed_everything(seed)

data_dir = os.getenv('DATA_DIR', 'data')

#
# Sample to train on
#

sample = load_comments_from_json(
rootpath='data/',
rootpath=data_dir,
channel_id=os.getenv('CHANNEL_ID', None),
playlist_id=os.getenv('PLAYLIST_ID', None),
)
Expand All @@ -42,20 +44,21 @@
.tolist()
sample = set(sample)

ids = pd.read_csv('data/comment-topics/sentence-embeds-ids.csv')['id']
embeds_ids_path = os.path.join(data_dir, 'comment-topics/sentence-embeds-ids.csv')
ids = pd.read_csv(embeds_ids_path)['id']
train_mask = np.asarray([i in sample for i in ids.to_numpy().tolist()])
logger.info(f'Training on {train_mask.sum()} samples')

ids.loc[train_mask].to_csv('data/comment-topics/umap-hdbscan-sample-ids.csv', index=False)
sample_ids_path = os.path.join(data_dir, 'comment-topics/umap-hdbscan-sample-ids.csv')
ids.loc[train_mask].to_csv(sample_ids_path, index=False)

#
# 50d UMAP
#

logger.info('50d UMAP')
umap_embeds_50d_path = 'data/comment-topics/umap-embeds-50d.npy'

embeds_file = 'data/comment-topics/sentence-embeds.pt'
embeds_file = os.path.join(data_dir, 'comment-topics/sentence-embeds.pt')
with open(embeds_file, 'rb') as obj:
embeds = torch.load(obj, 'cpu').float().numpy()[train_mask, ...]

Expand All @@ -76,5 +79,6 @@
umap_embeds_50d = umap_model_50d.transform(embeds)

with ut.DelayedKeyboardInterrupt():
umap_embeds_50d_path = os.path.join(data_dir, 'comment-topics/umap-embeds-50d.npy')
with open(umap_embeds_50d_path, 'wb') as f:
np.save(f, umap_embeds_50d)
4 changes: 1 addition & 3 deletions images/streamlit/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@

logger = logging.getLogger(__name__)


DB_PATH = 'data/youtube.db'
SUMMARY_SUGGEST_CACHE_DIR = './data/summary-suggest-gpt'
DATA_DIR, DB_PATH, SUMMARY_SUGGEST_CACHE_DIR = ut.get_data_paths()


def stmt_to_pandas(stmt, index_col=None):
Expand Down
6 changes: 5 additions & 1 deletion images/streamlit/file_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@ def __init__(self, cache_dir=None, **kwargs):
_cache_keys = []

if cache_dir is not None:
os.makedirs(cache_dir, exist_ok=True)
try:
os.makedirs(cache_dir, exist_ok=True)
except PermissionError: # NFS
pass

_cache_keys += os.listdir(cache_dir)

self.cache_dir = cache_dir
Expand Down
6 changes: 4 additions & 2 deletions images/streamlit/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
import utils as ut
from app import run


logger = logging.getLogger(__name__)


if __name__ == '__main__':
    # Resolve the data directory from DATA_DIR (default 'data') rather than
    # hard-coding it. The previous `ut.get_data_paths[0]` subscripted the
    # function object itself and raised TypeError.
    data_dir = os.getenv('DATA_DIR', 'data')

    try:
        os.makedirs(data_dir, exist_ok=True)
    except PermissionError:  # NFS
        # Best effort: creation can be refused on an NFS mount; the
        # directory is then expected to already exist.
        pass

    ut.log_setup()
    ut.seed_everything()
Expand Down
8 changes: 6 additions & 2 deletions images/streamlit/refresh.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,19 @@

import utils as ut
from models import get_db
from app import DB_PATH, SUMMARY_SUGGEST_CACHE_DIR
from video_text2text import VideoSummary, VideoSuggestions, ClusterShortName


logger = logging.getLogger(__name__)


if __name__ == '__main__':
os.makedirs('data', exist_ok=True)
DATA_DIR, DB_PATH, SUMMARY_SUGGEST_CACHE_DIR = ut.get_data_paths()

try:
os.makedirs(DATA_DIR, exist_ok=True)
except PermissionError: # NFS
pass

ut.log_setup()
ut.seed_everything()
Expand Down
9 changes: 9 additions & 0 deletions images/streamlit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@ def seed_everything():
np.random.seed(seed)


def get_data_paths():
    """Return the data directory and the paths derived from it.

    The data directory is read from the ``DATA_DIR`` environment variable
    and defaults to ``'data'``.

    Returns:
        tuple: ``(data_dir, db_path, cache_dir)`` where ``db_path`` is
        ``<data_dir>/youtube.db`` and ``cache_dir`` is
        ``<data_dir>/summary-suggest-gpt``.
    """
    data_dir = os.getenv('DATA_DIR', 'data')

    db_path = os.path.join(data_dir, 'youtube.db')
    cache_dir = os.path.join(data_dir, 'summary-suggest-gpt')

    # Callers unpack three values (DATA_DIR, DB_PATH, cache dir), so the
    # data directory itself must be the first element.
    return data_dir, db_path, cache_dir


def remove_punctuation(text):
for punct in string.punctuation:
if punct == "'":
Expand Down

0 comments on commit ae9d201

Please sign in to comment.