Skip to content

Commit

Permalink
don't crawl already processed videos
Browse files: browse the repository at this point in the history
  • Loading branch information
Jourdelune committed Sep 29, 2024
1 parent 16beed9 commit 326d1ae
Showing 1 changed file with 6 additions and 0 deletions.
6 changes: 6 additions & 0 deletions multi_crawler/crawlers/youtube_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def __init__(
self.futures = set()

self._search = Search(terms, {"params": "EgIwAQ%3D%3D"})
self._videos = set()

def _manage_futures(self):
"""Helper function to clean up completed futures and maintain a max of 10 threads."""
Expand All @@ -49,6 +50,10 @@ def _manage_futures(self):
self.futures.remove(fut)

def _get_ytb_data(self, url):
# check if the video has already been processed
if url in self._videos:
return

# get the oldest session
session = self._ytb_sessions.pop(min(self._ytb_sessions.keys()))
# append a new session
Expand All @@ -70,6 +75,7 @@ def _get_ytb_data(self, url):
)

self._callback(audio)
self._videos.add(url)

def crawl(self, *args, **kwargs) -> None:
"""Find and return URLs of Youtube videos based on search terms."""
Expand Down

0 comments on commit 326d1ae

Please sign in to comment.